# Lab | Comparing regression models


For this lab, we will be using the same dataset we used in the previous labs. We recommend using the same notebook since you will be reusing the same variables you previous created and used in labs. 

In [1]:
import pandas as pd
import numpy as np

In [2]:
customer_df = pd.read_csv('files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv')

#### Results of the last 2 labs ...

In [3]:
def rename_columns(data):
    cols = []
    for col in data.columns:
        cols.append(col.lower().replace(' ', '_'))
    return cols
customer_df.columns = rename_columns(customer_df)

In [4]:
customer_df['effective_to_date'] = pd.to_datetime(customer_df['effective_to_date'], infer_datetime_format=True)


In [5]:
categorical_df = customer_df.select_dtypes(object)
categorical_df.shape

(9134, 15)

In [6]:
for col in categorical_df.columns:
    print(f"Column: {col}")
    display(categorical_df[col].value_counts(dropna=False))
    print()

Column: customer


BU79786    1
PU81096    1
CO75086    1
WW52683    1
XO38850    1
          ..
HS14476    1
YL91587    1
CT18212    1
EW35231    1
Y167826    1
Name: customer, Length: 9134, dtype: int64


Column: state


California    3150
Oregon        2601
Arizona       1703
Nevada         882
Washington     798
Name: state, dtype: int64


Column: response


No     7826
Yes    1308
Name: response, dtype: int64


Column: coverage


Basic       5568
Extended    2742
Premium      824
Name: coverage, dtype: int64


Column: education


Bachelor                2748
College                 2681
High School or Below    2622
Master                   741
Doctor                   342
Name: education, dtype: int64


Column: employmentstatus


Employed         5698
Unemployed       2317
Medical Leave     432
Disabled          405
Retired           282
Name: employmentstatus, dtype: int64


Column: gender


F    4658
M    4476
Name: gender, dtype: int64


Column: location_code


Suburban    5779
Rural       1773
Urban       1582
Name: location_code, dtype: int64


Column: marital_status


Married     5298
Single      2467
Divorced    1369
Name: marital_status, dtype: int64


Column: policy_type


Personal Auto     6788
Corporate Auto    1968
Special Auto       378
Name: policy_type, dtype: int64


Column: policy


Personal L3     3426
Personal L2     2122
Personal L1     1240
Corporate L3    1014
Corporate L2     595
Corporate L1     359
Special L2       164
Special L3       148
Special L1        66
Name: policy, dtype: int64


Column: renew_offer_type


Offer1    3752
Offer2    2926
Offer3    1432
Offer4    1024
Name: renew_offer_type, dtype: int64


Column: sales_channel


Agent          3477
Branch         2567
Call Center    1765
Web            1325
Name: sales_channel, dtype: int64


Column: vehicle_class


Four-Door Car    4621
Two-Door Car     1886
SUV              1796
Sports Car        484
Luxury SUV        184
Luxury Car        163
Name: vehicle_class, dtype: int64


Column: vehicle_size


Medsize    6424
Small      1764
Large       946
Name: vehicle_size, dtype: int64




In [7]:
categorical_df.dtypes

customer            object
state               object
response            object
coverage            object
education           object
employmentstatus    object
gender              object
location_code       object
marital_status      object
policy_type         object
policy              object
renew_offer_type    object
sales_channel       object
vehicle_class       object
vehicle_size        object
dtype: object

So the column 'Effective To Date' doesn't make sense as categorical, so I convert it to datetime and remove it from **categorical_df**.


In [8]:
customer_df['effective_to_date'] = pd.to_datetime(customer_df['effective_to_date'], infer_datetime_format=True)

In [9]:
def adjust_policy(value):
    '''
    Replace '... L1' to 1, '... L2' to 2 and '... L3' to 3
    Input -> value  Value of column 'Policy'
    Output -> 1, 2 or 3
    '''
    if value.find('L1') != -1:
        return 1
    elif value.find('L2') != -1:
        return 2
    else:
        return 3

# Add new numerical column named 'Policy Liability' and drop 'policy'
customer_df['policy_liability'] = customer_df['policy'].apply(adjust_policy)
customer_df = customer_df.drop(['policy'], axis=1)

In [10]:
customer_df['renew_offer_type'] = customer_df['renew_offer_type'].replace({
    'Offer1': 1,
    'Offer2': 2,
    'Offer3': 3,
    'Offer4': 4
})

In [11]:
customer_df['state'] = customer_df['state'].replace({'Nevada':'Other', 'Washington':'Other'})

In [12]:
customer_df['coverage'] = customer_df['coverage'].replace({'Premium':'Extended'})

In [13]:
customer_df['education'] = customer_df['education'].replace({
    'Master':'Higher Grade',
    'Doctor':'Higher Grade'
})

In [14]:
customer_df['employmentstatus'] = customer_df['employmentstatus'].replace({
        'Medical Leave':'Unemployed',
        'Disabled':'Unemployed',
        'Unemployed':'Unemployed'
})

In [15]:
customer_df['marital_status'] = customer_df['marital_status'].replace({
    'Divorced':'Single'
})

In [16]:
customer_df['policy_type'] = customer_df['policy_type'].rename({'Special Auto':'Corporate Auto'})

In [17]:
customer_df['vehicle_class'] = customer_df['vehicle_class'].rename({
    'Sports Car':'Other',
    'Luxury SUV':'Other',
    'Luxury Car':'Other'
})

### Here begins the current lab...

In [18]:
# So I decided to remove the outliers before applying the train/test-split
def remove_outliers(data):
    data2 = data.copy()
    numeric = data2.select_dtypes(np.number)
    for col in numeric.columns:
        if col != 'total_claim_amount':
            iqr = np.percentile(data2[col],75) - np.percentile(data2[col],25)
            upper_limit = np.percentile(data[col],75) + 1.5*iqr
            lower_limit = np.percentile(data[col],25) - 1.5*iqr
            data2 = data2[(data2[col] > lower_limit) & (data2[col] < upper_limit)]
        return data2

custumer_df = remove_outliers(customer_df)

#### In this final lab, we will model our data. Import sklearn `train_test_split` and separate the data.

In [19]:
X = customer_df.drop(['total_claim_amount'], axis=1)
y = customer_df['total_claim_amount']

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train_discrete = X_train.select_dtypes(np.int64)
X_train_continuous = X_train.select_dtypes([np.float64, np.datetime64])
X_train_cat = X_train.select_dtypes(object)

X_test_discrete = X_test.select_dtypes(np.int64)
X_test_continuous = X_test.select_dtypes([np.float64, np.datetime64])
X_test_cat = X_test.select_dtypes(object)

#### We will start with removing outliers, if you have not already done so.  We have discussed different methods to remove outliers. Use the one you feel more comfortable with, define a function for that. Use the function to remove the outliers and apply it to the dataframe.

Already done before splitting the data

#### The time variable can be useful. Try to transform its data into a useful one. Hint: Day week and month as integers might be useful.

In [21]:
X_train_discrete['day']   = pd.to_datetime(X_train_continuous['effective_to_date']).dt.day
X_train_discrete['month'] = pd.to_datetime(X_train_continuous['effective_to_date']).dt.month
X_train_discrete['year']  = pd.to_datetime(X_train_continuous['effective_to_date']).dt.year
X_train_continuous = X_train_continuous.drop(['effective_to_date'], axis=1)

X_test_discrete['day']   = pd.to_datetime(X_test_continuous['effective_to_date']).dt.day
X_test_discrete['month'] = pd.to_datetime(X_test_continuous['effective_to_date']).dt.month
X_test_discrete['year']  = pd.to_datetime(X_test_continuous['effective_to_date']).dt.year
X_test_continuous = X_test_continuous.drop(['effective_to_date'], axis=1)

#### Normalize the continuous variables. You can use any one method you want.

In [22]:
from sklearn.preprocessing import PowerTransformer

pT = PowerTransformer()
pT.fit(X_train_continuous)

X_train_continuous_trans_np = pT.transform(X_train_continuous)
X_test_continuous_trans_np = pT.transform(X_test_continuous)


X_train_continuous_trans = pd.DataFrame(X_train_continuous_trans_np, columns=X_train_continuous.columns,
                                       index=X_train_continuous.index)
X_test_continuous_trans = pd.DataFrame(X_test_continuous_trans_np, columns=X_test_continuous.columns,
                                       index=X_test_continuous.index)

#### Encode the categorical variables (See the hint below for encoding categorical data!!!)

In [23]:
def encode_categorical(data):
    data = data.drop(['customer'], axis=1)
    return pd.get_dummies(data, drop_first=True)
#                          columns=['state', 'marital_status',
#                                   'policy_type', 'sales_channel',
#                                   'vehicle_class'],
#                          drop_first=True)

X_train_cat_encoded = encode_categorical(X_train_cat)
X_test_cat_encoded = encode_categorical(X_test_cat)

In [24]:
X_train_final = pd.concat([X_train_discrete, X_train_continuous_trans, X_train_cat_encoded], axis=1)
X_test_final = pd.concat([X_test_discrete, X_test_continuous_trans, X_test_cat_encoded], axis=1)

#### Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, change it using encoding.

In [25]:
X_train_final.dtypes

income                              int64
monthly_premium_auto                int64
months_since_last_claim             int64
months_since_policy_inception       int64
number_of_open_complaints           int64
number_of_policies                  int64
renew_offer_type                    int64
policy_liability                    int64
day                                 int64
month                               int64
year                                int64
customer_lifetime_value           float64
state_California                    uint8
state_Oregon                        uint8
state_Other                         uint8
response_Yes                        uint8
coverage_Extended                   uint8
education_College                   uint8
education_High School or Below      uint8
education_Higher Grade              uint8
employmentstatus_Retired            uint8
employmentstatus_Unemployed         uint8
gender_M                            uint8
location_code_Suburban            

#### Try a simple linear regression with all the data to see whether we are getting good results.



In [26]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train_final, y_train)
print(f'Train score: {lm.score(X_train_final, y_train)}')
print(f'Test score: {lm.score(X_test_final, y_test)}')

Train score: 0.7666710725006447
Test score: 0.7531457607816229


#### Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.


In [27]:
def train_models(list_of_models, X_train, y_train):
    for model in list_of_models:
        model.fit(X_train, y_train)
    return list_of_models


In [28]:
from sklearn.metrics import r2_score


def evaluate_models(list_of_models, X_train, y_train, X_test, y_test):
    
    train_scores = []
    test_scores = []
    
    for model in list_of_models:
        y_train_pred = model.predict(X_train)
        train_scores.append(r2_score(y_train, y_train_pred))

        y_test_pred = model.predict(X_test)
        test_scores.append(r2_score(y_test, y_test_pred))
        
    return train_scores, test_scores

#### Use the function to check `LinearRegressor` and `KNeighborsRegressor`.

In [29]:
from sklearn.neighbors import KNeighborsRegressor

list_of_models = [
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=4)
]

list_of_trained_models = train_models(list_of_models, X_train_final, y_train)
train_scores, test_scores = evaluate_models(list_of_trained_models,
                                            X_train_final, y_train,
                                            X_test_final, y_test)
for i in range(len(list_of_models)):
    print(f'Model: {list_of_trained_models[i]}')
    print(f'    Train-Score: {train_scores[i]}')
    print(f'    Test-Score:  {test_scores[i]}')

Model: LinearRegression()
    Train-Score: 0.7666710725006447
    Test-Score:  0.7531457607816229
Model: KNeighborsRegressor(n_neighbors=4)
    Train-Score: 0.6546799993133535
    Test-Score:  0.38441243903272915


#### You can check also the `MLPRegressor` for this task!

In [30]:
from sklearn.neural_network import MLPRegressor

list_of_models.append(MLPRegressor())

list_of_trained_models = train_models(list_of_models, X_train_final, y_train)
train_scores, test_scores = evaluate_models(list_of_trained_models,
                                            X_train_final, y_train,
                                            X_test_final, y_test)
for i in range(len(list_of_models)):
    print(f'Model: {list_of_trained_models[i]}')
    print(f'    Train-Score: {train_scores[i]}')
    print(f'    Test-Score:  {test_scores[i]}')

Model: LinearRegression()
    Train-Score: 0.7666710725006447
    Test-Score:  0.7531457607816229
Model: KNeighborsRegressor(n_neighbors=4)
    Train-Score: 0.6546799993133535
    Test-Score:  0.38441243903272915
Model: MLPRegressor()
    Train-Score: 0.6278612801473662
    Test-Score:  0.6066093958148033


#### Check and discuss the results.

So our best result on the training set is 0.76 using a linear regression model and the best score on the test set
is 0.75 with a linear regression model, too.