<a href="https://colab.research.google.com/github/Prabhuarasu/FutureDataScienceLegends/blob/main/cross_sell_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.compose import ColumnTransformer

In [67]:
import joblib

In [68]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission_iA3afxn.csv')

In [69]:
train.shape,test.shape,sub.shape

((381109, 12), (127037, 11), (127037, 2))

In [70]:
train.dtypes

Unnamed: 0,0
id,int64
Gender,object
Age,int64
Driving_License,int64
Region_Code,float64
Previously_Insured,int64
Vehicle_Age,object
Vehicle_Damage,object
Annual_Premium,float64
Policy_Sales_Channel,float64


In [71]:
train.isna().sum()

Unnamed: 0,0
id,0
Gender,0
Age,0
Driving_License,0
Region_Code,0
Previously_Insured,0
Vehicle_Age,0
Vehicle_Damage,0
Annual_Premium,0
Policy_Sales_Channel,0


In [72]:
train.head(2)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0


In [73]:
tgt_col =['Response']
ign_cols=['id']
cat_cols= train.select_dtypes(include= 'object').columns
num_cols= train.select_dtypes(exclude='object').columns

In [74]:
print(tgt_col,ign_cols,cat_cols,num_cols,sep='\n')

['Response']
['id']
Index(['Gender', 'Vehicle_Age', 'Vehicle_Damage'], dtype='object')
Index(['id', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')


In [75]:
train[cat_cols].head(2)

Unnamed: 0,Gender,Vehicle_Age,Vehicle_Damage
0,Male,> 2 Years,Yes
1,Male,1-2 Year,No


In [76]:
train[num_cols].head(2)

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,44,1,28.0,0,40454.0,26.0,217,1
1,2,76,1,3.0,0,33536.0,26.0,183,0


In [77]:
# to see numerical columns after removing the target columns and not required columns
num_cols = train.select_dtypes(exclude='object').drop(columns=ign_cols+tgt_col).columns
train[num_cols].head(2)

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage
0,44,1,28.0,0,40454.0,26.0,217
1,76,1,3.0,0,33536.0,26.0,183


In [78]:
cat_pipe_encode = Pipeline(
    steps=[
        ('impute_cat',SimpleImputer(strategy='most_frequent')),
        ('ohe',OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [79]:
num_pipe_encode =Pipeline(
    steps=[
        ('impute_num',SimpleImputer(strategy='mean')),
        ('scale',StandardScaler())
    ]
)

In [80]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [90]:
 mymodel = LogisticRegression()

In [82]:
preprocess= ColumnTransformer(
    transformers=[
        ('num',num_pipe_encode,num_cols),
        ('cat',cat_pipe_encode,cat_cols)
    ]
)

In [93]:
model_pipeline= Pipeline(
    steps=[
        ('preprocess',preprocess),
        ('model',mymodel)
    ]
)

In [84]:
# split the train dataset to train and validation
X = train.drop(columns = ign_cols + tgt_col)
X.head(2)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183


In [85]:
y=train[tgt_col]
y.head(2)

Unnamed: 0,Response
0,1
1,0


In [86]:
train_X,val_X,train_y,val_y=train_test_split(X,y,test_size=0.1,random_state=42)
train_X.shape,val_X.shape,train_y.shape,val_y.shape

((342998, 10), (38111, 10), (342998, 1), (38111, 1))

In [87]:
train.shape,int(test.shape[0]*9),int(train.shape[0]*1)

((381109, 12), 1143333, 381109)

In [94]:
model_pipeline.fit(train_X, train_y)

In [96]:
model_pipeline.predict(train_X)

array([0, 0, 0, ..., 0, 0, 0])

In [98]:
def model_train_val_eval(train_x,val_X,train_y,val_y,model_pipeline):
    model_pipeline.fit(train_x,train_y)
    pred_train = model_pipeline.predict(train_x)
    pred_val = model_pipeline.predict(val_X)

    print('Train F1 score:', f1_score(train_y,pred_train))
    print('val F1 score:', f1_score(val_y,pred_val))

In [99]:
model_train_val_eval(train_X,val_X,train_y,val_y,model_pipeline)

Train F1 score: 0.0003337783711615487
val F1 score: 0.0


In [100]:
sub.head(2)

Unnamed: 0,id,Response
0,381110,0
1,381111,0


In [101]:
train.dtypes

Unnamed: 0,0
id,int64
Gender,object
Age,int64
Driving_License,int64
Region_Code,float64
Previously_Insured,int64
Vehicle_Age,object
Vehicle_Damage,object
Annual_Premium,float64
Policy_Sales_Channel,float64


In [104]:
joblib.dump(model_pipeline,'model_pipeline.pkl')

['model_pipeline.pkl']

In [105]:
train.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [106]:
train.columns.to_frame().index

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [107]:
train.head(1)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
