## Data Engineering

In [1]:
import pandas as pd


In [2]:
# read data into pandas dataframes
household_df = pd.read_csv("HOUSEHOLDS.csv")
customers_df = pd.read_csv("CUSTOMERS.csv")
cars_df = pd.read_csv("CARS.csv")

#### REMOVING WHITESPACES IN DATA

In [3]:
def trim_col_names(df):
    
    # remove whitespace at end for column names
    for col_name in df.columns:
        stripped_col_name = col_name.strip()
        if col_name != stripped_col_name:
            print(f"Trimming {col_name} for whitespace")
            df = df.rename(str.strip, axis='columns')
    return df


In [4]:
household_df = trim_col_names(household_df)
customers_df = trim_col_names(customers_df)
cars_df = trim_col_names(cars_df)

Trimming ZIP  for whitespace


In [5]:
def drop_null_rows_and_columns(df):
    print(f"Before Cleaning, DataFrame Shape: {df.shape}")
    df = df.dropna(axis = 1, how = "all")

    df = df.dropna(axis = 0, how = "all")
    print(f"After Cleaning, DataFrame Shape: {df.shape}")
    return df

In [6]:
households_df_clean = drop_null_rows_and_columns(household_df)

Before Cleaning, DataFrame Shape: (500000, 10)
After Cleaning, DataFrame Shape: (500000, 10)


In [7]:
customers_df_clean = drop_null_rows_and_columns(customers_df)

Before Cleaning, DataFrame Shape: (499999, 20)
After Cleaning, DataFrame Shape: (499999, 5)


In [8]:
cars_df_clean = drop_null_rows_and_columns(cars_df)

Before Cleaning, DataFrame Shape: (500000, 16)
After Cleaning, DataFrame Shape: (500000, 16)


In [9]:
def get_missing_or_malformed_values_columns(df):
    for col, col_type in df.dtypes.items():
        if col_type == "object":
            if ((df[df[col] == "#REF!"].shape[0] > 0) or (df[df[col] == ""].shape[0] > 0)) :
                print(col)

In [10]:
get_missing_or_malformed_values_columns(households_df_clean)

In [11]:
get_missing_or_malformed_values_columns(customers_df_clean)

Employment Type
Income


In [13]:
# impute malformed string in Income column with average Income
# 50429.18711428178
customers_df_clean["Income"].astype("int").mean()



ValueError: invalid literal for int() with base 10: '#REF!'

In [13]:
# cast Income column as Integer

customers_df_clean["Income"] = customers_df_clean["Income"].astype("int")

In [14]:
# impute malformed string in employment type column with most frequent employment type 

customers_df_clean.loc[(customers_df_clean["Employment Type"] == "#REF!"),"Employment Type"] = customers_df_clean[customers_df_clean["Employment Type"] != "#REF!"] \
.groupby("Employment Type") \
.size() \
.sort_values(ascending = False) \
.reset_index() \
.iloc[0]["Employment Type"]


In [15]:
get_missing_or_malformed_values_columns(customers_df_clean)

In [16]:
get_missing_or_malformed_values_columns(cars_df_clean)

In [17]:
cars_df_clean = cars_df_clean.rename({"Car ID":"CAR_ID"}, axis = "columns")

In [18]:
# join housesholds and customer dataframe
inter_df = pd.merge(households_df_clean, customers_df_clean , how = "inner", on ="CUST_ID")

In [19]:
# join above dataframe with cars dataframe
final_df = pd.merge(inter_df, cars_df_clean, on =["CAR_ID","State"], how = "inner")

In [20]:
# write dataframe to a parquet file
final_df.to_parquet("./combined_data.parquet")

In [21]:
final_df.head()

Unnamed: 0,HH_ID,CUST_ID,CAR_ID,Active HH,HH Start Date,Phone Number,ZIP,State,Country,Referral Source,...,Vehicle Value,Annual Miles Driven,Business Use,Antique Vehicle,Lien,Lease,Driver Safety Discount,Vehicle Safety Discount,Claim Payout,6 Month Premium Amount
0,219790301,801198110,844435,1,11/18/22,(709) 379-9036,70442,OK,USA,Other,...,50000.0,56,0,1,1,0,0,1,0,42.89
1,219790301,281855167,410619,1,11/18/22,(740) 565-4060,70442,OK,USA,Other,...,8151.75,12136,0,0,0,1,0,1,0,58.887695
2,219790301,688373183,192812,1,11/18/22,(117) 457-9582,70442,OK,USA,Other,...,1651.2,14674,0,0,1,0,0,1,0,322.307381
3,219790301,752746800,752033,1,11/18/22,(536) 797-5920,70442,OK,USA,Other,...,500.0,15762,0,0,1,0,0,0,0,232.670242
4,464806390,114187354,23783,1,10/9/20,(152) 373-1773,42706,NY,USA,Event,...,18873.9,10154,0,0,1,0,0,1,0,167.432014


In [22]:
import pyarrow.parquet as pq
schema = pq.read_schema('combined_data.parquet')
schema


HH_ID: int64
CUST_ID: int64
CAR_ID: int64
Active HH: int64
HH Start Date: string
Phone Number: string
ZIP: int64
State: string
Country: string
Referral Source: string
Date of Birth: string
Marital Status: string
Employment Type: string
Income: int64
Status: string
Model Year: int64
Make: string
Body Style: string
Vehicle Value: double
Annual Miles Driven: int64
Business Use: int64
Antique Vehicle: int64
Lien: int64
Lease: int64
Driver Safety Discount: int64
Vehicle Safety Discount: int64
Claim Payout: int64
6 Month Premium Amount: double
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 3635