In [28]:
import pandas as pd
import numpy as np


In [29]:
# Read the three files into python as dataframes

file1 = pd.read_excel('files_for_lab/excel_files/file1.xlsx')
file2 = pd.read_excel('files_for_lab/excel_files/file2.xlsx')
file3 = pd.read_excel('files_for_lab/excel_files/file3.xlsx')


In [30]:
# Show the DataFrame's shape.

print("File1: ", file1.shape)
print("File2: ", file2.shape)
print("File3: ", file3.shape)


File1:  (1071, 11)
File2:  (996, 11)
File3:  (7070, 11)


In [31]:
# Standardize header names.

cols = []

for col in file1.columns:
    cols.append(col.lower().replace(' ', '_'))
file1.columns = cols

cols = []
for col in file2.columns:
    cols.append(col.lower().replace(' ', '_'))
file2.columns = cols

cols = []
for col in file3.columns:
    cols.append(col.lower().replace(' ', '_'))
file3.columns = cols


In [32]:
# Rearrange the columns in the dataframe as needed
# First let's compare the columns-labels of our 3 dataframes...

display(file1.head(1))
print(file1.columns == file2.columns)

display(file2.head(1))
print(file1.columns == file3.columns)

display(file3.head(1))
print(file2.columns == file3.columns)


Unnamed: 0,customer,st,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,policy_type,vehicle_class,total_claim_amount
0,RB50392,Washington,,Master,,0,1000,1/0/00,Personal Auto,Four-Door Car,2.704934


[ True  True  True  True  True  True  True  True False False False]


Unnamed: 0,customer,st,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,total_claim_amount,policy_type,vehicle_class
0,GS98873,Arizona,F,Bachelor,323912.47%,16061,88,1/0/00,633.6,Personal Auto,Four-Door Car


[ True False False  True False  True  True  True  True False False]


Unnamed: 0,customer,state,customer_lifetime_value,education,gender,income,monthly_premium_auto,number_of_open_complaints,policy_type,total_claim_amount,vehicle_class
0,SA25987,Washington,3479.137523,High School or Below,M,0,104,0,Personal Auto,499.2,Two-Door Car


[ True False False  True False  True  True  True False False  True]


In [33]:
# ... then we rename column 'st' to 'state' and reorder the columns of dataframe 'file2' and 'file3'.

file1 = file1.rename(columns={'st':'state'})
file2 = file2.rename(columns={'st':'state'})

file2 = file2[ list(file1.columns) ]
file3 = file3[ list(file1.columns) ]

# Check if tables have same column order
print(file1.columns == file2.columns)
print(file1.columns == file3.columns)
print(file2.columns == file3.columns)


[ True  True  True  True  True  True  True  True  True  True  True]
[ True  True  True  True  True  True  True  True  True  True  True]
[ True  True  True  True  True  True  True  True  True  True  True]


In [34]:
# Concatenate the three dataframes

data = pd.concat([file1, file2, file3], axis=0)
data.shape


(9137, 11)

In [35]:
# Which columns are numerical?

data.select_dtypes(include=np.number).columns


Index(['income', 'monthly_premium_auto', 'total_claim_amount'], dtype='object')

In [36]:
# Which columns are categorical ?

data.select_dtypes(include=['object']).columns #.tolist()


Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',
       'number_of_open_complaints', 'policy_type', 'vehicle_class'],
      dtype='object')

In [37]:
# Perform the data cleaning operations mentioned so far in class

# Delete the column education and the number of open complaints from the dataframe.
data = data.drop(['education','number_of_open_complaints'], axis=1)


In [38]:
# Correct the values in the column customer lifetime value.
# They are given as a percent, so multiply them by 100 and change dtype to numerical type.
#col = data['customer_lifetime_value'].str.replace('%', '') #* 100
#pd.to_numeric(data['customer_lifetime_value'], errors='coerce')

def clean_column_3(value):
    if type(value) == str and value.find('%') != -1:
        return float(value.replace('%', ''))
    else:
        return float(value)

#data['customer_lifetime_value'] 
data['customer_lifetime_value'] = data['customer_lifetime_value'].apply(clean_column_3)
display(data)

Unnamed: 0,customer,state,gender,customer_lifetime_value,income,monthly_premium_auto,policy_type,vehicle_class,total_claim_amount
0,RB50392,Washington,,,0,1000,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,6.979536e+05,0,94,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,1.288743e+06,48767,108,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,7.645862e+05,0,106,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,5.363077e+05,36357,68,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,2.340599e+04,71941,73,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,3.096511e+03,21604,79,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,8.163890e+03,0,85,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,7.524442e+03,21941,96,Personal Auto,Four-Door Car,691.200000


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9137 entries, 0 to 7069
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   customer                 9137 non-null   object 
 1   state                    9137 non-null   object 
 2   gender                   9015 non-null   object 
 3   customer_lifetime_value  9130 non-null   float64
 4   income                   9137 non-null   int64  
 5   monthly_premium_auto     9137 non-null   int64  
 6   policy_type              9137 non-null   object 
 7   vehicle_class            9137 non-null   object 
 8   total_claim_amount       9137 non-null   float64
dtypes: float64(2), int64(2), object(5)
memory usage: 713.8+ KB


In [39]:
# Check for duplicate rows in the data and remove if any.
data = data.drop_duplicates()


In [40]:
# Filter out the data for customers who have an income of 0 or less.
data = data[data['income'] > 0]
display(data)


Unnamed: 0,customer,state,gender,customer_lifetime_value,income,monthly_premium_auto,policy_type,vehicle_class,total_claim_amount
2,AI49188,Nevada,F,1.288743e+06,48767,108,Personal Auto,Two-Door Car,566.472247
4,GA49547,Washington,M,5.363077e+05,36357,68,Personal Auto,Four-Door Car,17.269323
5,OC83172,Oregon,F,8.256298e+05,62902,69,Personal Auto,Two-Door Car,159.383042
6,XZ87318,Oregon,F,5.380899e+05,55350,67,Corporate Auto,Four-Door Car,321.600000
8,DY87989,Oregon,M,2.412750e+06,14072,71,Corporate Auto,Four-Door Car,511.200000
...,...,...,...,...,...,...,...,...,...
7063,TF56202,California,M,5.032165e+03,66367,64,Personal Auto,Two-Door Car,307.200000
7064,YM19146,California,F,4.100399e+03,47761,104,Personal Auto,Four-Door Car,541.282007
7065,LA72316,California,M,2.340599e+04,71941,73,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,3.096511e+03,21604,79,Corporate Auto,Four-Door Car,379.200000
