In [1]:
import pandas as pd

# Import Data & Optimization

In [2]:
# Read the employee CSV into a DataFrame, then preview the first five rows
# to confirm the columns were parsed as expected.
chicago = pd.read_csv(filepath_or_buffer='data/chicago.csv')
chicago.head(n=5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [3]:
# Summarize the frame: column dtypes, non-null counts and memory usage.
# NOTE: a trailing "+" on the reported memory figure means it is a lower
# bound -- object (string) columns are not measured deeply unless
# memory_usage='deep' is passed to info().
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [4]:
# Count the distinct non-null values in each column. A column with very few
# distinct values relative to the number of rows (here, Department) is a good
# candidate for conversion to the memory-saving category dtype.
chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [5]:
# Convert the low-cardinality Department column to the category dtype, then
# re-run info() so the reported memory usage can be compared with the
# figure shown before the conversion.
chicago = chicago.astype({'Department': 'category'})
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


In [6]:
# Build the boolean missing-value mask once (True where a cell is NaN),
# then derive the per-column NaN counts from that same mask.
chicago_nan = chicago.isna()
chicago_nan_sum = chicago_nan.sum()

# Show the per-column counts first, then the cell-level mask,
# separated by blank lines.
print(chicago_nan_sum)
print('\n')
print(chicago_nan)

Name                      1
Position Title            1
Department                1
Employee Annual Salary    1
dtype: int64


        Name  Position Title  Department  Employee Annual Salary
0      False           False       False                   False
1      False           False       False                   False
2      False           False       False                   False
3      False           False       False                   False
4      False           False       False                   False
...      ...             ...         ...                     ...
32058  False           False       False                   False
32059  False           False       False                   False
32060  False           False       False                   False
32061  False           False       False                   False
32062   True            True        True                    True

[32063 rows x 4 columns]


In [7]:
# Remove every row that contains at least one missing value, then preview
# the last five rows to confirm the trailing all-NaN row is gone.
chicago = chicago.dropna(axis=0, how='any')
chicago.tail(n=5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


# Common String Methods

In [8]:
# lower method: convert every name to lowercase.
# Vectorized string methods are reached through the .str accessor, which is
# available on a Series (a single column) or an Index -- not on a whole DataFrame.
chicago = chicago.assign(Name=chicago['Name'].str.lower())
chicago.head(n=5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"aaron, elvia j",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"aaron, jeffery m",POLICE OFFICER,POLICE,$84450.00
2,"aaron, karina",POLICE OFFICER,POLICE,$84450.00
3,"aaron, kimberlei r",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"abad jr, vicente m",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [9]:
# upper method: convert every name to uppercase via the Series .str accessor.
chicago = chicago.assign(Name=chicago['Name'].str.upper())
chicago.head(n=5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [10]:
# title method: capitalize the first letter of each word in every name and
# lowercase the remaining letters, via the Series .str accessor.
chicago = chicago.assign(Name=chicago['Name'].str.title())
chicago.head(n=5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
2,"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
3,"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
