# Code 6
- Preprocessing Improvement
- Decision Tree

## 1/ Import Libraries

In [1]:
#CodeSection1
# from google.colab import drive
# drive.mount('/mntDrive') 

In [2]:
#CodeSection2
import pandas as pd
import numpy as np

## 2/ Import Data

In [3]:
#CodeSection3
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

## 3/ Combine both Datasets (train, test)

In [4]:
#CodeSection4
all_data = [train, test]

### Check Combined Data 
- Missing Value
- Data Types

In [5]:
#CodeSection5
# Combine both to check Missing Value
pd.concat([train, test], axis = 0).isnull().sum()

id                           0
Gender                       0
Age                          0
Driving_License              0
Region_Code                  0
Previously_Insured           0
Vehicle_Age                  0
Vehicle_Damage               0
Annual_Premium               0
Policy_Sales_Channel         0
Vintage                      0
Response                127037
dtype: int64

## 4/ Preprocessing Improvement

### 4.1/ Identify Numerical and Categorical Features

In [6]:
train.describe(include='all')

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,381109.0,381109,381109.0,381109.0,381109.0,381109.0,381109,381109,381109.0,381109.0,381109.0,381109.0
unique,,2,,,,,3,2,,,,
top,,Male,,,,,1-2 Year,Yes,,,,
freq,,206089,,,,,200316,192413,,,,
mean,190555.0,,38.822584,0.997869,26.388807,0.45821,,,30564.389581,112.034295,154.347397,0.122563
std,110016.836208,,15.511611,0.04611,13.229888,0.498251,,,17213.155057,54.203995,83.671304,0.327936
min,1.0,,20.0,0.0,0.0,0.0,,,2630.0,1.0,10.0,0.0
25%,95278.0,,25.0,1.0,15.0,0.0,,,24405.0,29.0,82.0,0.0
50%,190555.0,,36.0,1.0,28.0,0.0,,,31669.0,133.0,154.0,0.0
75%,285832.0,,49.0,1.0,35.0,1.0,,,39400.0,152.0,227.0,0.0


In [7]:
train.shape

(381109, 12)

In [9]:
#CodeSection6
# Identify all Numerical and Categorical features
numerical_features = ['Age', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Annual_Premium','Policy_Sales_Channel','Vintage']
categorical_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

### 4.2/ Outlier Strategy

In [10]:
#CodeSection7

# Write a loop to do the same
for num_var in numerical_features:
    Q1 = train[num_var].quantile(0.25)
    Q3 = train[num_var].quantile(0.75)

    IQR = Q3-Q1

    Lower_Whisker = Q1 - 1.5*IQR
    Upper_Whisker = Q3 + 1.5*IQR

    train[num_var] = train[num_var].apply(lambda x : Upper_Whisker if x >= Upper_Whisker else x)
    test[num_var] = test[num_var].apply(lambda x : Upper_Whisker if x >= Upper_Whisker else x)

### 4.3/ Import Libraries
- Missing Value Imputation - SimpleImputer
- Preprocessing - StandardScaler, OrdinalEncoder
- Pipeline - make_pipeline, make_column_transformer
- Model - Decision Tree

In [11]:
#CodeSection8

# Import SimpleImputer
from sklearn.impute import SimpleImputer

# Import StandardScaler
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Make and Compose Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Import Decision tree
from sklearn.tree import DecisionTreeClassifier

### 4.4/ Build Pipeline

In [12]:
#CodeSection9
# Create Preprocessor Pipeline
preprocessor = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='median'),
                  StandardScaler()), numerical_features),
    (make_pipeline(SimpleImputer(strategy='most_frequent'),
                  OrdinalEncoder()), categorical_features)
)

### 4.5/ Divide Data into X and y

In [13]:
#CodeSection10
X = train.drop(['id','Response'], axis =  1)
y = train['Response']

## 5/ Build Model and Fit

In [16]:
#CodeSection11
# Create Model Pipeline and Initiate Model
model = make_pipeline(preprocessor, DecisionTreeClassifier( max_depth= 5,class_weight='balanced'))

In [17]:
#CodeSection12 
# Fit Model
model.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Driving_License',
                                                   'Region_Code',
                                                   'Previously_Insured',
                                                   'Annual_Premium',
                                                   'Policy_Sales_Channel',
                                                   'Vintage']),
                                                 ('pipeline-2',
                                                  P

## 6/ Check Accuracy of Model on Train Data

In [18]:
#CodeSection13
# Predict on Train Data
y_pred = model.predict(X)

### We can use a Accuracy Function from Metrics
- Check Train Accuracy

In [19]:
#CodeSection14
# Import metrics library
from sklearn.metrics import roc_auc_score

In [20]:
#CodeSection15
# get Actual "y" variables (use "y_true" as variable)
y_true = train['Response']

In [22]:
#CodeSection16
# Print Train Accuracy
print(f" Train Accuracy : {roc_auc_score(y_true, y_pred):0.1%}")

 Train Accuracy : 79.2%


## 7/ Predict and Submission

### Predict on "Test Data"

In [23]:
#CodeSection17
# Get all the X Variables from the Test Dataset
X_test = test.drop(['id'], axis =  1)

In [24]:
#CodeSection18
# Predict on X_test Data ("X_test_prep")
X_test_prep = model.predict(X_test)

### Create Submission File

In [25]:
#CodeSection19
submission = pd.DataFrame({
    'id' : test['id'],
    'Response' : X_test_prep
})

### Export Submission File

In [26]:
#CodeSection20
submission.to_csv('output/O6_DT_Pipeline.csv', index = False)

In [27]:
# LB Accuracy :  0.7947542792109008