In [1]:
import pandas as pd

In [17]:
chicago = pd.read_csv("chicago.csv")
chicago.dropna(how = "all", inplace = True)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [5]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [9]:
chicago["Department"].unique()

array(['WATER MGMNT', 'POLICE', 'GENERAL SERVICES', 'CITY COUNCIL',
       'STREETS & SAN', 'OEMC', 'AVIATION', 'FIRE', 'FAMILY & SUPPORT',
       'IPRA', 'PUBLIC LIBRARY', 'BUSINESS AFFAIRS', 'TRANSPORTN',
       'HEALTH', "MAYOR'S OFFICE", 'LAW', 'FINANCE', 'CULTURAL AFFAIRS',
       'COMMUNITY DEVELOPMENT', 'BUILDINGS', 'ANIMAL CONTRL',
       'CITY CLERK', 'BOARD OF ELECTION', 'INSPECTOR GEN', 'TREASURER',
       'DISABILITIES', 'HUMAN RESOURCES', 'DoIT', 'BUDGET & MGMT',
       'PROCUREMENT', 'HUMAN RELATIONS', 'BOARD OF ETHICS',
       'POLICE BOARD', 'ADMIN HEARNG', 'LICENSE APPL COMM', nan],
      dtype=object)

In [10]:
chicago["Department"].nunique()

35

In [11]:
# Nós da uma boa aproxy de quais colunas podemos utilizar para otimizarmos o consumo de memória do nosso Datafrane

chicago.nunique() # Nos dá uma overview sobre a quantidade de valores únicos para cada coluna do nosso Dataframe

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [13]:
# Otimizando o consumo de memória do nosso Datafrane

chicago["Department"] = chicago["Department"].astype("category")

In [14]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


## **Common String Methods - .lower(), .upper(), .title() and .len()**

In [19]:
chicago = pd.read_csv(filepath_or_buffer = "chicago.csv")
chicago.dropna(how = "all", inplace = True)
chicago["Department"] = chicago["Department"].astype("category")

chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [20]:
"Hello World".lower()

'hello world'

In [21]:
"Hello World".upper()

'HELLO WORLD'

In [23]:
"heLlo woRld".title()

'Hello World'

In [26]:
# Para aplicarmos métodos comuns de strings em valores de Séries Pandas, precisamos extender a Série para a propriedade 'str'

chicago["Name"].str.lower()

0            aaron,  elvia j
1          aaron,  jeffery m
2             aaron,  karina
3        aaron,  kimberlei r
4        abad jr,  vicente m
                ...         
32057    zygadlo,  michael j
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
Name: Name, Length: 32062, dtype: object

In [32]:
chicago["Name"] = chicago["Name"].str.title()
chicago["Position Title"] = chicago["Position Title"].str.title()
chicago["Department"] = chicago["Department"].str.title()

In [33]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,$90744.00
1,"Aaron, Jeffery M",Police Officer,Police,$84450.00
2,"Aaron, Karina",Police Officer,Police,$84450.00


In [34]:
# Utilizando o método .len() para verificarmos a quantidade de caracteres em cada string da nossa coluna

chicago["Department"].str.len()

0        11
1         6
2         6
3        16
4        11
         ..
32057    16
32058     6
32059     6
32060     6
32061     4
Name: Department, Length: 32062, dtype: int64

## **The .str.replace() Method**

In [35]:
chicago = pd.read_csv("chicago.csv")
chicago.dropna(how = "all", inplace = True)
chicago["Department"] = chicago["Department"].astype("category")

chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [36]:
"Hello world".replace("l", "!")

'He!!o wor!d'

In [39]:
chicago["Department"] = chicago["Department"].str.title().str.replace("Mgmnt", "Management")

In [40]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,Water Management,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,Police,$84450.00
2,"AARON, KARINA",POLICE OFFICER,Police,$84450.00


In [44]:
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", "").astype("float")

  chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", "").astype("float")


In [45]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,Water Management,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,Police,84450.0
2,"AARON, KARINA",POLICE OFFICER,Police,84450.0


In [46]:
chicago["Employee Annual Salary"].sum()

2571506375.36