In [1]:
import pandas as pd

In [2]:
chicago = pd.read_csv("chicago.csv")
chicago.dropna(how = "all", inplace = True)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [3]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1.2+ MB


In [4]:
chicago["Department"].unique()

array(['WATER MGMNT', 'POLICE', 'GENERAL SERVICES', 'CITY COUNCIL',
       'STREETS & SAN', 'OEMC', 'AVIATION', 'FIRE', 'FAMILY & SUPPORT',
       'IPRA', 'PUBLIC LIBRARY', 'BUSINESS AFFAIRS', 'TRANSPORTN',
       'HEALTH', "MAYOR'S OFFICE", 'LAW', 'FINANCE', 'CULTURAL AFFAIRS',
       'COMMUNITY DEVELOPMENT', 'BUILDINGS', 'ANIMAL CONTRL',
       'CITY CLERK', 'BOARD OF ELECTION', 'INSPECTOR GEN', 'TREASURER',
       'DISABILITIES', 'HUMAN RESOURCES', 'DoIT', 'BUDGET & MGMT',
       'PROCUREMENT', 'HUMAN RELATIONS', 'BOARD OF ETHICS',
       'POLICE BOARD', 'ADMIN HEARNG', 'LICENSE APPL COMM'], dtype=object)

In [5]:
chicago["Department"].nunique()

35

In [6]:
# Nós da uma boa aproxy de quais colunas podemos utilizar para otimizarmos o consumo de memória do nosso Datafrane

chicago.nunique() # Nos dá uma overview sobre a quantidade de valores únicos para cada coluna do nosso Dataframe

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [7]:
# Otimizando o consumo de memória do nosso Datafrane

chicago["Department"] = chicago["Department"].astype("category")

In [8]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


## **Common String Methods - .lower(), .upper(), .title() and .len()**

In [9]:
chicago = pd.read_csv(filepath_or_buffer = "chicago.csv")
chicago.dropna(how = "all", inplace = True)
chicago["Department"] = chicago["Department"].astype("category")

chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [10]:
"Hello World".lower()

'hello world'

In [11]:
"Hello World".upper()

'HELLO WORLD'

In [12]:
"heLlo woRld".title()

'Hello World'

In [13]:
# Para aplicarmos métodos comuns de strings em valores de Séries Pandas, precisamos extender a Série para a propriedade 'str'

chicago["Name"].str.lower()

0            aaron,  elvia j
1          aaron,  jeffery m
2             aaron,  karina
3        aaron,  kimberlei r
4        abad jr,  vicente m
                ...         
32057    zygadlo,  michael j
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
Name: Name, Length: 32062, dtype: object

In [14]:
chicago["Name"] = chicago["Name"].str.title()
chicago["Position Title"] = chicago["Position Title"].str.title()
chicago["Department"] = chicago["Department"].str.title()

In [15]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,$90744.00
1,"Aaron, Jeffery M",Police Officer,Police,$84450.00
2,"Aaron, Karina",Police Officer,Police,$84450.00


In [16]:
# Utilizando o método .len() para verificarmos a quantidade de caracteres em cada string da nossa coluna

chicago["Department"].str.len()

0        11
1         6
2         6
3        16
4        11
         ..
32057    16
32058     6
32059     6
32060     6
32061     4
Name: Department, Length: 32062, dtype: int64

## **The .str.replace() Method**

In [17]:
chicago = pd.read_csv("chicago.csv")
chicago.dropna(how = "all", inplace = True)
chicago["Department"] = chicago["Department"].astype("category")

chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [18]:
"Hello world".replace("l", "!")

'He!!o wor!d'

In [19]:
chicago["Department"] = chicago["Department"].str.title().str.replace("Mgmnt", "Management")

In [20]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,Water Management,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,Police,$84450.00
2,"AARON, KARINA",POLICE OFFICER,Police,$84450.00


In [21]:
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", "").astype("float")

  chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", "").astype("float")


In [22]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,Water Management,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,Police,84450.0
2,"AARON, KARINA",POLICE OFFICER,Police,84450.0


In [23]:
chicago["Employee Annual Salary"].sum()

2571506375.36

## **Filtering with String Methods**

In [24]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")

chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [25]:
# Antes de utilizarmos uma filtragem de dados com strings, uma boa prática é normalizar esses dados com os métodos "lower" ou "upper" 
# Utilizando o método "contains" para verificar se uma palavra está presente nos valores de nossa Série pandas

position_title_contain_water_word = chicago["Position Title"].str.lower().str.contains("water")
chicago.query("@position_title_contain_water_word")

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [26]:
mask = chicago["Position Title"].str.lower().str.startswith("water")
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [27]:
mask = chicago["Position Title"].str.lower().str.endswith("ist")
chicago.query("@mask")

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


## **More String Methods - .strip(), .lstrip() and .rstrip()**

In [28]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [31]:
"            Hello World           ".lstrip()

'Hello World           '

In [32]:
"            Hello World           ".rstrip()

'            Hello World'

In [33]:
"            Hello World           ".strip()

'Hello World'

In [38]:
chicago["Name"] = chicago["Name"].str.strip()

In [39]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


## **String Methods on Index and Columns**

In [41]:
chicago = pd.read_csv("chicago.csv", index_col = ["Name"]).dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [46]:
chicago.index = chicago.index.str.strip().str.title()

In [47]:
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


In [50]:
chicago.columns = chicago.columns.str.upper()

In [51]:
chicago.head(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


## **Split Strings by Characters with .str.split() Method**

In [52]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [53]:
"Hello my name is Lucas".split(" ")

['Hello', 'my', 'name', 'is', 'Lucas']

In [64]:
# com o código abaixo estamos obtendo os sobrenomes e normalizando-os para obtermos os nomes mais comuns em chicago
chicago["Name"].str.split(",").str.get(0).str.title().value_counts().nlargest(5)

Williams    293
Johnson     244
Smith       241
Brown       185
Jones       183
Name: Name, dtype: int64

In [71]:
# Encontrar a primeira palavra mais comum entre os titulos existentes no dataset
# Precisamos validar também se teremos pelo menos um elemento gerado para cada lista, se não podemos obter um erro.

chicago["Position Title"].str.split(" ").str.get(0).str.title().value_counts().head(5)

Police             10856
Firefighter-Emt     1509
Sergeant            1186
Pool                 918
Firefighter          810
Name: Position Title, dtype: int64

## **More Practice with Splits**

In [78]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [86]:
# Obtendo os 5 primeiros nomes mais populares de Chicago

chicago["Name"].str.split(",").str.get(1).str.strip().str.split(" ").str.get(0).value_counts().head(5)

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

## **The expand and n Parameters of the str.split() Method**

In [87]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [91]:
chicago["Name"].str.lower().str.replace("aaron", "lucas") # troca a ocorrência da palavra na string

0            lucas,  elvia j
1          lucas,  jeffery m
2             lucas,  karina
3        lucas,  kimberlei r
4        abad jr,  vicente m
                ...         
32057    zygadlo,  michael j
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
Name: Name, Length: 32062, dtype: object

In [94]:
chicago["Name"].replace("AARON,  ELVIA J", "LUCAS") # altera apenas o valor inteiro do registro observado

0                      LUCAS
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

In [109]:
chicago[["First Name", "Last Name"]] = chicago["Name"].str.split(",", expand = True) # .rename(columns = {0: "First Name", 1: "Last Name"})

In [110]:
chicago.head(1)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J


In [115]:
 chicago[["First Title Word", "Remaining Words for Title"]] = chicago["Position Title"].str.split(" ", expand = True, n = 1) # o Dataframe gerado terá o shape adequado de acordo com a lista gerada com o maior tamanho

In [116]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words for Title
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
