In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Load the raw marketing customer-value data.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine where that exact file exists.
customer_df = pd.read_csv("/Users/suso/BootcampDA/Unit4/Labs/lab-cleaning-categorical-data/files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv")

# Raw CSV header -> snake_case column name.
new_column_names = {
    'Customer': 'customer',
    'State': 'state',
    'Customer Lifetime Value': 'customer_lifetime_value',
    'Response': 'response',
    'Coverage': 'coverage',
    'Education': 'education',
    'Effective To Date': 'effective_to_date',
    'EmploymentStatus': 'employment_status',
    'Gender': 'gender',
    'Income': 'income',
    'Location Code': 'location_code',
    'Marital Status': 'marital_status',
    'Monthly Premium Auto': 'monthly_premium_auto',
    'Months Since Last Claim': 'months_since_last_claim',
    'Months Since Policy Inception': 'months_since_policy_inception',
    'Number of Open Complaints': 'number_of_open_complaints',
    'Number of Policies': 'number_of_policies',
    'Policy Type': 'policy_type',
    'Policy': 'policy',
    'Renew Offer Type': 'renew_offer_type',
    'Sales Channel': 'sales_channel',
    'Total Claim Amount': 'total_claim_amount',
    'Vehicle Class': 'vehicle_class',
    'Vehicle Size': 'vehicle_size',
}

# Rename the columns, parse the date column, and drop the `customer` column,
# all in one chain.
customer_df = (
    customer_df
    .rename(columns=new_column_names)
    .assign(effective_to_date=lambda frame: pd.to_datetime(frame['effective_to_date']))
    .drop(columns='customer')
)

# Split the columns by dtype: int64/float64 on one side, object on the other.
numerical_df = customer_df.select_dtypes(include=['int64', 'float64'])
categorical_df = customer_df.select_dtypes(include=['object'])

import numpy as np

def remove_outliers_zscore(customer_df, numerical_df, threshold=3):
    """Return the rows of ``customer_df`` that have no extreme numeric value.

    Each column of ``numerical_df`` is standardised with its own mean and
    standard deviation. A row is discarded when the absolute z-score of at
    least one of its columns is strictly greater than ``threshold``.

    Parameters
    ----------
    customer_df : DataFrame
        Frame to filter; the row mask built from ``numerical_df`` is applied
        to it.
    numerical_df : DataFrame
        Numeric columns used to compute the z-scores.
    threshold : number, default 3
        Largest absolute z-score a row may contain and still be kept.

    Returns
    -------
    DataFrame
        The rows of ``customer_df`` for which no column exceeded the threshold.
    """
    centered = numerical_df - numerical_df.mean()
    abs_z = (centered / numerical_df.std()).abs()
    keep_row = ~(abs_z > threshold).any(axis=1)
    return customer_df[keep_row]

# Drop rows flagged as outliers, then work on a copy so the encoding below
# leaves `cleaned_df` untouched.
cleaned_df = remove_outliers_zscore(customer_df, numerical_df)

cleaned_df_copy = cleaned_df.copy()

from sklearn.preprocessing import MinMaxScaler

numericals = cleaned_df_copy.select_dtypes(include=['int64', 'float64'])
categoricals = cleaned_df_copy.select_dtypes(include=['object'])

# NOTE(review): the scaled values are bound to `numericals` only and are not
# written back to `cleaned_df_copy` in this cell, so the frame keeps its
# unscaled numeric columns. `numericals` presumably also contains the target
# column `customer_lifetime_value` -- confirm what was intended before wiring
# the scaled values into the frame.
scaler = MinMaxScaler()
numericals = scaler.fit_transform(numericals)

# Integer code assigned to every category label, per column.
CATEGORY_CODES = {
    "coverage": {"Basic": 0, "Extended": 1, "Premium": 2},
    "state": {"Washington": 0, "Arizona": 1, "Nevada": 2, "California": 3, "Oregon": 4},
    "response": {"No": 0, "Yes": 1},
    "education": {"Bachelor": 0, "College": 1, "Master": 2, "High School or Below": 3, "Doctor": 4},
    "employment_status": {"Employed": 0, "Unemployed": 1, "Medical Leave": 2, "Disabled": 3, "Retired": 4},
    "gender": {"F": 0, "M": 1},
    "location_code": {"Suburban": 0, "Rural": 1, "Urban": 2},
    "marital_status": {"Married": 0, "Single": 1, "Divorced": 2},
    "policy_type": {"Corporate Auto": 0, "Personal Auto": 1, "Special Auto": 2},
    "policy": {"Corporate L3": 0, "Personal L3": 1, "Corporate L2": 2, "Personal L1": 3, "Special L2": 4, "Corporate L1": 5, "Personal L2": 6, "Special L1": 7, "Special L3": 8},
    "renew_offer_type": {"Offer1": 0, "Offer3": 1, "Offer2": 2, "Offer4": 3},
    "sales_channel": {"Agent": 0, "Call Center": 1, "Web": 2, "Branch": 3},
    "vehicle_class": {"Two-Door Car": 0, "Four-Door Car": 1, "SUV": 2, "Sports Car": 3, "Luxury SUV": 4, "Luxury Car": 5},
    "vehicle_size": {"Medsize": 0, "Small": 1, "Large": 2},
}


def _encode_column(values, codes):
    """Map the labels in ``values`` to their integer codes.

    ``Series.map`` with a dict turns any label missing from the dict into NaN
    without complaint, so labels that have no code are rejected up front.
    Missing values already present in ``values`` are passed through unchanged.

    Raises
    ------
    ValueError
        If ``values`` contains a non-missing label that is not a key of ``codes``.
    """
    unknown = set(values.dropna().unique()) - set(codes)
    if unknown:
        raise ValueError(
            f"Column {values.name!r} has labels with no code: {sorted(unknown, key=str)}"
        )
    return values.map(codes)


for column_name, codes in CATEGORY_CODES.items():
    cleaned_df_copy[column_name] = _encode_column(cleaned_df_copy[column_name], codes)

# Keep the weekday and month of the date as features, then drop the raw date.
cleaned_df_copy['day_of_week'] = cleaned_df_copy['effective_to_date'].dt.dayofweek
cleaned_df_copy['month'] = cleaned_df_copy['effective_to_date'].dt.month

cleaned_df_copy = cleaned_df_copy.drop('effective_to_date', axis=1)

In [10]:
#1. In this final lab, we will model our data. Import sklearn train_test_split and separate the data.

from sklearn.model_selection import train_test_split
# Predictor columns, in the order they are handed to the models.
feature_columns = [
    'income',
    'total_claim_amount',
    'number_of_policies',
    'months_since_policy_inception',
    'number_of_open_complaints',
    'monthly_premium_auto',
    'months_since_last_claim',
    'coverage',
    'state',
    'response',
    'education',
    'employment_status',
    'gender',
    'location_code',
    'marital_status',
    'policy_type',
    'policy',
    'renew_offer_type',
    'sales_channel',
    'vehicle_class',
    'vehicle_size',
    'day_of_week',
    'month',
]
independent_vars = cleaned_df_copy[feature_columns]

# Column the models are asked to predict.
target_variable = cleaned_df_copy['customer_lifetime_value']

In [11]:
# Conventional short names for the feature matrix and the target.
X, y = independent_vars, target_variable

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [16]:
#2. Try a simple linear regression with all the data to see whether we are getting good results.

from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on every feature column. The model is
# fitted on the training split only, so the held-out rows can be used to judge
# how well it generalises.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions for the rows the model has not seen.
y_pred = model.predict(X_test)

# R^2 on both splits: a large gap between them points to over-fitting, and a
# low value on both points to a model that is too weak for this target.
print(f"LinearRegression R^2 (train) = {model.score(X_train, y_train):.4f}")
print(f"LinearRegression R^2 (test)  = {model.score(X_test, y_test):.4f}")



In [17]:
#3. Great! Now define a function that takes a list of models and trains (and tests) them so we can try a lot of them without repeating code.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def train_and_test_models(models, X, y, test_size=0.2, random_state=42):
    """Fit each model on one shared train split and score it on the test split.

    Parameters
    ----------
    models : iterable
        Estimator objects exposing ``fit(X, y)`` and ``predict(X)``. They are
        fitted in place.
    X, y
        Features and target; passed to ``train_test_split`` as given.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so every model sees the same split.

    Returns
    -------
    dict
        Maps a model label to its mean squared error on the test split. The
        label is the model's class name; when the same class appears more than
        once, later instances get a numeric suffix (``Name_2``, ``Name_3``, ...)
        so that no result overwrites another.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    results = {}

    for model in models:
        model.fit(X_train, y_train)
        mse = mean_squared_error(y_test, model.predict(X_test))

        # Keep the plain class name for the first instance of a class and
        # disambiguate any further instances instead of overwriting the entry.
        base_name = type(model).__name__
        label = base_name
        suffix = 2
        while label in results:
            label = f"{base_name}_{suffix}"
            suffix += 1

        results[label] = mse

    return results

In [19]:
#4. Use the function to check LinearRegression and KNeighborsRegressor.

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Compare the linear baseline with a k-nearest-neighbours regressor on the
# same train/test split.
models = [LinearRegression(), KNeighborsRegressor()]

results = train_and_test_models(models, X, y)

# The loop variable is `model_name`, not `model`, so that iterating over the
# result labels does not rebind the notebook-level `model` estimator to a string.
for model_name, mse in results.items():
    print(f"{model_name}: MSE = {mse}")

LinearRegression: MSE = 20536702.650426116
KNeighborsRegressor: MSE = 23524204.858369853


In [21]:
#5. You can also check the MLPRegressor for this task!
from sklearn.neural_network import MLPRegressor

# Same comparison with a multi-layer perceptron added. MLPRegressor starts
# from randomly initialised weights, so its seed is pinned to make the
# reported MSE repeatable from one run to the next.
models = [LinearRegression(), KNeighborsRegressor(), MLPRegressor(random_state=42)]

results = train_and_test_models(models, X, y)

# The loop variable is `model_name`, not `model`, so that iterating over the
# result labels does not rebind the notebook-level `model` estimator to a string.
for model_name, mse in results.items():
    print(f"{model_name}: MSE = {mse}")

LinearRegression: MSE = 20536702.650426116
KNeighborsRegressor: MSE = 23524204.858369853
MLPRegressor: MSE = 20605800.64233061
