In [1]:
import pandas as pd

# Import Data & Optimization

In [2]:
# Load the Chicago employee records from the local CSV and preview the first rows
chicago = pd.read_csv(filepath_or_buffer = 'data/chicago.csv')
chicago.head(n = 5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [3]:
# use info to summarise the frame: column dtypes, non-null counts and memory usage
# NOTE: a trailing "+" on the reported memory figure means object columns were only
# measured shallowly; chicago.info(memory_usage = 'deep') reports their full size
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [4]:
# Count distinct non-missing values per column; a column with very few distinct
# values (Department here) is a good candidate for the category dtype
chicago.nunique(axis = 0, dropna = True)

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [5]:
# Cast the low-cardinality Department column to category, then re-check memory usage
chicago = chicago.astype({'Department': 'category'})
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


In [6]:
# Locate missing values: build the element-wise boolean mask once,
# then derive the per-column totals from it
chicago_nan = chicago.isna()
chicago_nan_sum = chicago_nan.sum()

# totals first, a blank gap, then the full mask
print(chicago_nan_sum, '\n', chicago_nan, sep = '\n')

Name                      1
Position Title            1
Department                1
Employee Annual Salary    1
dtype: int64


        Name  Position Title  Department  Employee Annual Salary
0      False           False       False                   False
1      False           False       False                   False
2      False           False       False                   False
3      False           False       False                   False
4      False           False       False                   False
...      ...             ...         ...                     ...
32058  False           False       False                   False
32059  False           False       False                   False
32060  False           False       False                   False
32061  False           False       False                   False
32062   True            True        True                    True

[32063 rows x 4 columns]


In [7]:
# Remove rows in which every column is missing, then inspect the end of the frame
chicago = chicago.dropna(axis = 0, how = 'all')
chicago.tail(n = 5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


# Common String Methods

In [8]:
# lower method
# string methods are not called on a Series directly; they are reached through
# its .str accessor (an Index exposes the same accessor)
chicago = chicago.assign(Name = lambda frame: frame['Name'].str.lower())
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"aaron, elvia j",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"aaron, jeffery m",POLICE OFFICER,POLICE,$84450.00
2,"aaron, karina",POLICE OFFICER,POLICE,$84450.00
3,"aaron, kimberlei r",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"abad jr, vicente m",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [9]:
# upper method: convert every name to upper case
chicago = chicago.assign(Name = lambda frame: frame['Name'].str.upper())
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [10]:
# title method: capitalise the first letter of each word in the name
chicago = chicago.assign(Name = lambda frame: frame['Name'].str.title())
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
2,"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
3,"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [11]:
# replace method (text to find, replacement text)
# regex = False treats 'MGMNT' as a literal substring regardless of the pandas default.
# .str methods on a categorical column return plain object dtype, so the result is
# cast back to 'category' to keep the memory optimisation applied to Department.
chicago['Department'] = (
    chicago['Department']
    .str.replace('MGMNT', 'MANAGEMENT', regex = False)
    .astype('category')
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
2,"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
3,"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


In [12]:
# title method for remaining columns
# NOTE: title() lower-cases everything after the first letter of each word, so
# acronyms and numerals lose their casing (e.g. 'IV' -> 'Iv', 'DoIT' -> 'Doit').
chicago['Position Title'] = chicago['Position Title'].str.title()
# .str methods return object dtype even for a categorical column; cast back to
# 'category' so Department keeps its low-memory representation.
chicago['Department'] = chicago['Department'].str.title().astype('category')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Management,$90744.00
1,"Aaron, Jeffery M",Police Officer,Police,$84450.00
2,"Aaron, Karina",Police Officer,Police,$84450.00
3,"Aaron, Kimberlei R",Chief Contract Expediter,General Services,$89880.00
4,"Abad Jr, Vicente M",Civil Engineer Iv,Water Management,$106836.00


In [13]:
# replace and astype methods to convert salary text (e.g. '$90744.00') to float
# regex = False makes '$' a literal dollar sign. As a regular expression '$' is the
# end-of-string anchor, and the default regex mode differs between pandas versions
# (older versions warn when it is left implicit for a single-character pattern).
chicago['Employee Annual Salary'] = (
    chicago['Employee Annual Salary']
    .str.replace('$', '', regex = False)
    .astype(float)
)
chicago.head()

  chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace('$', '').astype(float)


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
1,"Aaron, Jeffery M",Police Officer,Police,84450.0
2,"Aaron, Karina",Police Officer,Police,84450.0
3,"Aaron, Kimberlei R",Chief Contract Expediter,General Services,89880.0
4,"Abad Jr, Vicente M",Civil Engineer Iv,Water Management,106836.0


In [14]:
# strip method (removes whitespace from both ends of each value)
# good practice to strip all text columns in case there is accidental white space
# lstrip and rstrip trim only the left or right side; prefer strip unless one side
# must be preserved for a specific reason
chicago['Name'] = chicago['Name'].str.strip()
chicago['Position Title'] = chicago['Position Title'].str.strip()
# .str methods return object dtype even for a categorical column; cast back to
# 'category' so Department keeps its low-memory representation
chicago['Department'] = chicago['Department'].str.strip().astype('category')

chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
1,"Aaron, Jeffery M",Police Officer,Police,84450.0
2,"Aaron, Karina",Police Officer,Police,84450.0
3,"Aaron, Kimberlei R",Chief Contract Expediter,General Services,89880.0
4,"Abad Jr, Vicente M",Civil Engineer Iv,Water Management,106836.0


In [15]:
# split and get methods to break 'Last, First M' into separate columns
# split on ', ' once and reuse the result for all three new columns
name_parts = chicago['Name'].str.split(', ')

# the given-name half can carry irregular spacing (the raw labels look like
# 'AARON,  ELVIA J'); split() with no separator splits on any run of whitespace
# and ignores leading/trailing whitespace, so no strip is needed and no empty
# tokens are produced
given_names = name_parts.str.get(1).str.split()

chicago['First Name'] = given_names.str.get(0)
# get(1) yields NaN when the name has no second given-name token
chicago['Middle Initial'] = given_names.str.get(1)
chicago['Last Name'] = name_parts.str.get(0).str.strip()

# reorder columns
chicago = chicago.reindex(columns = ['Name', 'First Name', 'Middle Initial', 'Last Name', 'Position Title', 'Department', 'Employee Annual Salary'])

chicago.head()

Unnamed: 0,Name,First Name,Middle Initial,Last Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Elvia,J,Aaron,Water Rate Taker,Water Management,90744.0
1,"Aaron, Jeffery M",Jeffery,M,Aaron,Police Officer,Police,84450.0
2,"Aaron, Karina",Karina,,Aaron,Police Officer,Police,84450.0
3,"Aaron, Kimberlei R",Kimberlei,R,Aaron,Chief Contract Expediter,General Services,89880.0
4,"Abad Jr, Vicente M",Vicente,M,Abad Jr,Civil Engineer Iv,Water Management,106836.0


In [16]:
# alternate approach using the expand and n parameters
# expand = True returns a DataFrame (one column per piece) instead of a Series of lists
# n caps the number of splits, so n = 1 yields at most two pieces
chicago['Position Title'].str.split(pat = ' ', n = 1, expand = True).head(n = 5)

Unnamed: 0,0,1
0,Water,Rate Taker
1,Police,Officer
2,Police,Officer
3,Chief,Contract Expediter
4,Civil,Engineer Iv


# Filtering w/ String Methods

In [30]:
# filter using contains method
# lower-case the titles first so the match does not depend on letter case
water_filter = chicago['Position Title'].str.lower().str.contains(pat = 'water')
chicago.loc[water_filter].head(n = 5)

Unnamed: 0,Name,First Name,Middle Initial,Last Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Elvia,J,Aaron,Water Rate Taker,Water Management,90744.0
554,"Aluise, Vincent G",Vincent,G,Aluise,Foreman Of Water Pipe Construction,Water Management,102440.0
671,"Ander, Perry A",Perry,A,Ander,Water Chemist Ii,Water Management,82044.0
685,"Anderson, Andrew J",Andrew,J,Anderson,District Superintendent Of Water Distribution,Water Management,109272.0
702,"Anderson, Donald",Donald,,Anderson,Foreman Of Water Pipe Construction,Water Management,102440.0


In [32]:
# filter using startswith method: keep titles that begin with 'water' (any letter case)
water_start_filter = chicago['Position Title'].str.lower().str.startswith(pat = 'water')
chicago.loc[water_start_filter].head(n = 5)

Unnamed: 0,Name,First Name,Middle Initial,Last Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Elvia,J,Aaron,Water Rate Taker,Water Management,90744.0
671,"Ander, Perry A",Perry,A,Ander,Water Chemist Ii,Water Management,82044.0
1054,"Ashley, Karma T",Karma,T,Ashley,Water Chemist Ii,Water Management,82044.0
1079,"Atkins, Joanna M",Joanna,M,Atkins,Water Chemist Ii,Water Management,82044.0
1181,"Azeem, Mohammed A",Mohammed,A,Azeem,Water Chemist Ii,Water Management,53172.0


In [33]:
# filter using endswith method: keep titles that end in 'ist' (any letter case)
ist_ends_filter = chicago['Position Title'].str.lower().str.endswith(pat = 'ist')
chicago.loc[ist_ends_filter].head(n = 5)

Unnamed: 0,Name,First Name,Middle Initial,Last Name,Position Title,Department,Employee Annual Salary
184,"Afroz, Nayyar",Nayyar,,Afroz,Psychiatrist,Health,99840.0
308,"Alarcon, Luis J",Luis,J,Alarcon,Loan Processing Specialist,Community Development,81948.0
422,"Allain, Carolyn",Carolyn,,Allain,Senior Telecommunications Specialist,Doit,89880.0
472,"Allen, Robert",Robert,,Allen,Machinist,Water Management,94328.0
705,"Anderson, Edward M",Edward,M,Anderson,Sr Procurement Specialist,Procurement,91476.0


# Using String Methods on Index and Column Labels

In [35]:
# Re-read the CSV using the Name column as the row index,
# discarding any row whose data columns are all missing
chicago_indexed = (
    pd.read_csv('data/chicago.csv', index_col = 'Name')
    .dropna(how = 'all')
)
chicago_indexed.head(n = 5)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [37]:
# output the row labels as a pandas Index (the Name column chosen as index_col)
chicago_indexed.index

Index(['AARON,  ELVIA J', 'AARON,  JEFFERY M', 'AARON,  KARINA',
       'AARON,  KIMBERLEI R', 'ABAD JR,  VICENTE M', 'ABARCA,  ANABEL',
       'ABARCA,  EMMANUEL', 'ABASCAL,  REECE E', 'ABBASI,  CHRISTOPHER',
       'ABBATACOLA,  ROBERT J',
       ...
       'ZWIT,  JEFFREY J', 'ZWOLFER,  MATTHEW W', 'ZYCH,  MATEUSZ',
       'ZYDEK,  BRYAN', 'ZYGADLO,  JOHN P', 'ZYGADLO,  MICHAEL J',
       'ZYGOWICZ,  PETER J', 'ZYMANTAS,  MARK E', 'ZYRKOWSKI,  CARLO E',
       'ZYSKOWSKI,  DARIUSZ'],
      dtype='object', name='Name', length=32062)

In [39]:
# An Index exposes the same .str accessor as a Series.
# strip() only trims leading/trailing whitespace; title() then capitalises each word.
stripped_labels = chicago_indexed.index.str.strip()
chicago_indexed.index = stripped_labels.str.title()
chicago_indexed.head(n = 5)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [43]:
# Column labels are an Index too, so .str works on them: upper-case every header
upper_labels = chicago.columns.str.upper()
chicago.columns = upper_labels
chicago.head(n = 5)

Unnamed: 0,NAME,FIRST NAME,MIDDLE INITIAL,LAST NAME,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
0,"Aaron, Elvia J",Elvia,J,Aaron,Water Rate Taker,Water Management,90744.0
1,"Aaron, Jeffery M",Jeffery,M,Aaron,Police Officer,Police,84450.0
2,"Aaron, Karina",Karina,,Aaron,Police Officer,Police,84450.0
3,"Aaron, Kimberlei R",Kimberlei,R,Aaron,Chief Contract Expediter,General Services,89880.0
4,"Abad Jr, Vicente M",Vicente,M,Abad Jr,Civil Engineer Iv,Water Management,106836.0
