# Functions

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw customer CSV straight from its GitHub URL and preview the first rows.
df=pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv')
display(df.head())

Unnamed: 0,Customer,ST,GENDER,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323


1. Rename columns

In [3]:
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Lower-case every column name and give selected columns snake_case names.

    The DataFrame is modified in place, so calling ``rename_columns(df)``
    without using the return value still updates ``df``. The same object is
    also returned, so ``df = rename_columns(df)`` works as well.

    Inputs:
    df: Pandas DataFrame

    Outputs:
    The same Pandas DataFrame, with all column names in lower case and the
    columns listed in ``renames`` below given their snake_case names
    '''

    # Keys are the *lower-cased* original names.
    renames = {'st': 'state',
               'customer lifetime value': 'customer_lifetime_value',
               'monthly premium auto': 'monthly_premium_auto',
               'number of open complaints': 'number_of_open_complaints',
               'policy type': 'policy_type',
               'vehicle class': 'vehicle_class',
               'total claim amount': 'total_claim_amount'}

    new_names = []
    for colname in df.columns:
        # Non-string labels (e.g. integer column names) have no .lower();
        # leave them untouched instead of raising.
        lowered = colname.lower() if isinstance(colname, str) else colname
        new_names.append(renames.get(lowered, lowered))

    # Assigning to df.columns renames in place. The previous version returned
    # the result of df.rename(..., inplace=True), which is always None.
    df.columns = new_names

    return df

In [4]:
# The return value is not used here: rename_columns updates df itself.
rename_columns(df)
display(df)

Unnamed: 0,customer,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,policy_type,vehicle_class,total_claim_amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
4003,,,,,,,,,,,
4004,,,,,,,,,,,
4005,,,,,,,,,,,
4006,,,,,,,,,,,


2. Cleaning invalid values

In [5]:
def replace_values(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Normalise inconsistent category labels and clean two badly formatted columns.

    What it does, in order:
    - maps spelling variants in gender, state, education and vehicle_class to
      one canonical label each, and fills missing gender with 'U';
    - strips the '%' sign from customer_lifetime_value and casts it to float64;
    - keeps only the middle part of number_of_open_complaints values written
      as 'a/b/c' (e.g. '1/0/00' -> '0'). The values stay strings.

    The DataFrame is modified in place (callers may ignore the return value)
    and the same object is returned. number_of_open_complaints ends up as the
    last column, as it did in the previous implementation.

    Inputs:
    df: Pandas DataFrame with the columns gender, state, education,
        vehicle_class, customer_lifetime_value and number_of_open_complaints

    Outputs:
    The same Pandas DataFrame with the corrected values
    '''

    replacements = {
        'gender': {'Femal': 'F', 'female': 'F', 'Male': 'M'},
        'state': {'Cali': 'California', 'AZ': 'Arizona', 'WA': 'Washington'},
        'education': {'Bachelors': 'Bachelor'},
        'vehicle_class': {'Sports Car': 'Luxury',
                          'Luxury SUV': 'Luxury',
                          'Luxury Car': 'Luxury'},
    }

    # Assign the result back to the column. `df[col].replace(..., inplace=True)`
    # acts on an intermediate Series: it raises a FutureWarning in pandas 2.2
    # and does not update df when Copy-on-Write is enabled.
    for column, mapping in replacements.items():
        df[column] = df[column].replace(mapping)
    df['gender'] = df['gender'].fillna('U')

    df['customer_lifetime_value'] = (
        df['customer_lifetime_value'].replace('%', '', regex=True).astype('float64')
    )

    # Extract the middle field of 'a/b/c' without temporary columns. Values
    # with no '/' (NaN, or values already cleaned by an earlier run of this
    # cell) are left unchanged, so re-running the function is harmless.
    raw = df['number_of_open_complaints']
    as_text = raw.astype(str)
    has_slash = as_text.str.contains('/', regex=False, na=False)
    middle = as_text.str.split('/').str[1]
    parsed = raw.where(~has_slash, middle)

    # Remove and re-add the column so it moves to the end of the frame,
    # matching the column order produced by the previous drop/rename approach.
    df.pop('number_of_open_complaints')
    df['number_of_open_complaints'] = parsed

    return df

In [6]:
# replace_values edits df in place, so the return value is not captured here.
replace_values(df)
display(df)

Unnamed: 0,customer,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,policy_type,vehicle_class,total_claim_amount,number_of_open_complaints
0,RB50392,Washington,U,Master,,0.0,1000.0,Personal Auto,Four-Door Car,2.704934,0
1,QZ44356,Arizona,F,Bachelor,697953.59,0.0,94.0,Personal Auto,Four-Door Car,1131.464935,0
2,AI49188,Nevada,F,Bachelor,1288743.17,48767.0,108.0,Personal Auto,Two-Door Car,566.472247,0
3,WW63253,California,M,Bachelor,764586.18,0.0,106.0,Corporate Auto,SUV,529.881344,0
4,GA49547,Washington,M,High School or Below,536307.65,36357.0,68.0,Personal Auto,Four-Door Car,17.269323,0
...,...,...,...,...,...,...,...,...,...,...,...
4003,,,U,,,,,,,,
4004,,,U,,,,,,,,
4005,,,U,,,,,,,,
4006,,,U,,,,,,,,


3. Dealing with null values

In [7]:
def handle_na_values(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Drop every row whose customer_lifetime_value is missing (NaN).

    Only the customer_lifetime_value column is inspected; NaN values in other
    columns do not cause a row to be dropped on their own. The DataFrame is
    modified in place (callers may ignore the return value) and the same
    object is returned. The index is not reset.

    Inputs:
    df: Pandas DataFrame with a customer_lifetime_value column

    Outputs:
    The same Pandas DataFrame without the rows where customer_lifetime_value
    is NaN
    '''

    missing_value = df['customer_lifetime_value'].isna()

    # DataFrame.drop(..., inplace=True) returns None, so its result must not
    # be assigned or returned; return the mutated frame itself instead.
    df.drop(index=df.index[missing_value], inplace=True)

    return df

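# Note: this function also appears to remove the rows that contain NaN in other columns,
# which looks surprising at first. Judging by the displayed data, those rows (row 0 and the
# completely empty trailing rows) are missing customer_lifetime_value as well, so they match
# the filter above; rows with a valid customer_lifetime_value are kept.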

In [8]:
# The rows are dropped from df in place; the return value is not used.
handle_na_values(df)
display(df)

Unnamed: 0,customer,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,policy_type,vehicle_class,total_claim_amount,number_of_open_complaints
1,QZ44356,Arizona,F,Bachelor,697953.59,0.0,94.0,Personal Auto,Four-Door Car,1131.464935,0
2,AI49188,Nevada,F,Bachelor,1288743.17,48767.0,108.0,Personal Auto,Two-Door Car,566.472247,0
3,WW63253,California,M,Bachelor,764586.18,0.0,106.0,Corporate Auto,SUV,529.881344,0
4,GA49547,Washington,M,High School or Below,536307.65,36357.0,68.0,Personal Auto,Four-Door Car,17.269323,0
5,OC83172,Oregon,F,Bachelor,825629.78,62902.0,69.0,Personal Auto,Two-Door Car,159.383042,0
...,...,...,...,...,...,...,...,...,...,...,...
1066,TM65736,Oregon,M,Master,305955.03,38644.0,78.0,Personal Auto,Four-Door Car,361.455219,1
1067,VJ51327,California,F,High School or Below,2031499.76,63209.0,102.0,Personal Auto,SUV,207.320041,2
1068,GS98873,Arizona,F,Bachelor,323912.47,16061.0,88.0,Personal Auto,Four-Door Car,633.600000,0
1069,CW49887,California,F,Master,462680.11,79487.0,114.0,Special Auto,SUV,547.200000,0


4. Change data types

In [10]:
def convert_data_types(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Cast every numeric column of the DataFrame to int64.

    Columns are selected with ``select_dtypes('number')``; non-numeric columns
    (for example strings) are left as they are. Fractional parts are truncated
    toward zero (1.9 -> 1, -2.7 -> -2). pandas raises an error if a numeric
    column contains NaN or infinite values, so handle missing values first.

    The DataFrame is modified in place (callers may ignore the return value)
    and the same object is returned.

    Inputs:
    df: Pandas DataFrame

    Outputs:
    The same Pandas DataFrame with its numeric columns stored as int64
    '''

    numeric_columns = df.select_dtypes('number').columns

    # Column-wise astype is vectorised and replaces the element-by-element
    # DataFrame.applymap(np.int64), which is deprecated since pandas 2.1.
    for column in numeric_columns:
        df[column] = df[column].astype('int64')

    return df

In [11]:
# Convert the numeric columns of df in place, then inspect the resulting dtypes.
convert_data_types(df)
df.dtypes

customer                     object
state                        object
gender                       object
education                    object
customer_lifetime_value       int64
income                        int64
monthly_premium_auto          int64
policy_type                  object
vehicle_class                object
total_claim_amount            int64
number_of_open_complaints    object
dtype: object