## Load libraries

In [30]:
import io
from google.colab import drive
import time

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer


## Mount Google Drive
1. Click play  
2. Open authentication link and allow access   
3. Copy unique authorisation code into text box  

In [3]:
# Mount Google Drive at /content/drive (Colab helper) so the CSV paths under
# "/content/drive/My Drive/..." used by the later cells can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


## Import data
1. Train data
2. Test data

In [4]:
# Locations of the two input CSVs inside the mounted Drive folder
TRAIN_CSV = "/content/drive/My Drive/Regression Challenge Shared Folder/Data/Train.csv"
TEST_CSV = "/content/drive/My Drive/Regression Challenge Shared Folder/Data/Test.csv"

# Both files are read with their "ID" column as the row index
train_data = pd.read_csv(TRAIN_CSV, index_col="ID")
test_data = pd.read_csv(TEST_CSV, index_col="ID")

## Data Formatting and Pre-processing

In [5]:
# Target columns (the columns the models are asked to predict)
y_columns = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4',
    'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X',
    'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
]

# The same clean-up and derived features are applied, in place, to both frames
for frame in (train_data, test_data):
    # drop rows that have a missing value in any target column
    frame.dropna(axis=0, subset=y_columns, inplace=True)

    # parse join_date with pandas and keep the join year for the features below
    frame['join_date'] = pd.to_datetime(frame['join_date'])
    join_year = frame['join_date'].dt.year

    # age: 2020 minus birth_year
    frame['age'] = 2020 - frame["birth_year"]
    # age_join: join year minus birth_year (age of the client when joining)
    frame['age_join'] = join_year - frame["birth_year"]
    # period_client: 2020 minus join year (duration as a client, in years)
    frame['period_client'] = 2020 - join_year

In [11]:
# Removal of the high-cardinality column is currently switched off:
#train_data = train_data.drop(["occupation_code"], axis=1)
#test_data = test_data.drop(["occupation_code"], axis=1)

# Replace join_date by the whole number of seconds elapsed since 1970-01-01.
# NOTE(review): this overwrites join_date, so re-running this cell without
# re-creating the datetime column first presumably fails — confirm.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')
for frame in (train_data, test_data):
    frame['join_date'] = (frame['join_date'] - unix_epoch) // one_second

In [13]:
# Columns converted to the pandas 'category' dtype in both frames
category_columns = [
    "sex",
    "marital_status",
    "branch_code",
    "occupation_category_code",
    "occupation_code",
]

for frame in (train_data, test_data):
    for column in category_columns:
        frame[column] = frame[column].astype('category')

# (disabled) conversion of train_data columns 6..26 to category as well:
#for col in train_data.columns[6:27]:
#  train_data[col] = train_data[col].astype('category')

In [14]:
# Targets: the y_columns of each frame
y_train = train_data[y_columns]
y_test = test_data[y_columns]

# Predictors: every remaining column
X_train = train_data.drop(columns=y_columns)
X_test = test_data.drop(columns=y_columns)

# (disabled) removal of columns that duplicate derived features:
#X_train = X_train.drop(["birth_year"], axis=1)
#X_test = X_test.drop(["birth_year"], axis=1)

#X_train = X_train.drop(['join_date'], axis=1)
#X_test = X_test.drop(['join_date'], axis=1)

In [15]:
# Split the predictor columns by dtype, as seen in X_train
column_dtypes = X_train.dtypes
categorical_cols = [cname for cname, col_dtype in column_dtypes.items()
                    if col_dtype.name == "category"]
numerical_cols = [cname for cname, col_dtype in column_dtypes.items()
                  if col_dtype in ['int64', 'float64']]

# Columns of any other dtype are left out; categorical columns come first
my_cols = categorical_cols + numerical_cols
X_train, X_test = X_train[my_cols].copy(), X_test[my_cols].copy()

In [16]:
# Numerical branch: median imputation followed by standard scaling
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding
# (categories unseen during fit are ignored at transform time; dense output).
# NOTE(review): per the SimpleImputer docs `fill_value` is only used with
# strategy='constant', so 'missing' looks unused here — confirm the intent.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

## Classification and Regression Tree Model

In [17]:
# Fixed seed so the tree is built the same way on every run
SEED = 1234

# Decision tree classifier; it is later fitted on all target columns at once
dt = DecisionTreeClassifier(random_state=SEED)

# Pipeline = shared preprocessing followed by the tree
steps = [
    ('preprocessor', preprocessor),
    ('model', dt),
]
pipeline = Pipeline(steps=steps)

In [18]:
# Grid of decision-tree settings; the "model__" prefix addresses the
# pipeline step named 'model'.
parameters = {
    'model__max_depth': [3, None],
    'model__max_features': ['auto', 'sqrt', 'log2'],
    "model__min_samples_leaf": [1, 4, 7, 10],         # range(1, 12, 3)
    "model__min_samples_split": [2, 5, 8, 11, 14],    # range(2, 15, 3)
    'model__criterion': ["gini", "entropy"],
}

# 5-fold exhaustive search on all cores. No `scoring` is passed, so candidates
# are ranked with the pipeline's own default score rather than the log loss
# printed in the evaluation cells.
search_dt = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)
search_dt.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [19]:
# Show the parameter combination that scored best in the decision-tree grid search
print("Best Hyper Parameters:",search_dt.best_params_)

Best Hyper Parameters: {'model__criterion': 'gini', 'model__max_depth': None, 'model__max_features': 'auto', 'model__min_samples_leaf': 7, 'model__min_samples_split': 2}


### Training Prediction

In [20]:
# Stack the predict_proba output, keep the entry at index 1 along the last axis
# (presumably the positive class), and transpose so rows follow y_train's
# index and columns follow y_train's target columns.
stacked_proba = np.array(search_dt.predict_proba(X_train))
train_prediction = pd.DataFrame(
    stacked_proba[:, :, 1].T,
    index=y_train.index,
    columns=y_train.columns,
)

# Log loss of the tuned decision tree on the data it was trained on
print("Log loss:", log_loss(y_train, train_prediction))

Log loss: 3.394911630548022


### Test Prediction


In [None]:
# Same reshaping as for the training predictions, now on the test predictors
stacked_proba = np.array(search_dt.predict_proba(X_test))
y_pred = pd.DataFrame(
    stacked_proba[:, :, 1].T,
    index=y_test.index,
    columns=y_test.columns,
)

# Wherever the test targets already contain a 1, force the prediction to 1
already_one = (y_test == 1)
y_pred[already_one] = 1
y_pred

## Random Forest

In [41]:
# Same fixed seed as used for the decision tree
SEED = 1234

# Random forest classifier with default settings apart from the seed
rf = RandomForestClassifier(random_state=SEED)

# Pipeline = shared preprocessing followed by the forest
rf_steps = [
    ('preprocessor', preprocessor),
    ('model', rf),
]
rf_pipeline = Pipeline(steps=rf_steps)

In [None]:
# Display the forest's parameter names and current values as the cell output
rf.get_params()

In [46]:
# Random-forest grid: 3 x 2 x 2 x 3 = 36 candidates. The float values for
# min_samples_leaf are interpreted by scikit-learn as fractions of the samples.
params_rf = {
    'model__max_depth': [4, 6, 8],
    'model__max_features': ['sqrt', 'log2'],
    "model__min_samples_leaf": [0.1, 0.2],
    'model__n_estimators': [300, 400, 500],
}

# 5-fold exhaustive search on all cores, with progress messages
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=params_rf,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 45.5min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [32]:
# Fit a pipeline with its model left at the untuned settings.
# NOTE(review): `pipeline` is the decision-tree pipeline built earlier; the
# random forest pipeline of this section is `rf_pipeline` — confirm which
# one was meant to be fitted (and used by the prediction cells below).
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

### Training Prediction

In [35]:
# Stack the predict_proba output, keep the entry at index 1 along the last axis
# (presumably the positive class), and transpose so rows follow y_train's
# index and columns follow y_train's target columns.
# NOTE(review): `pipeline` wraps the decision tree, not the random forest
# (`rf_pipeline`) — confirm this is the intended model for this section.
stacked_proba = np.array(pipeline.predict_proba(X_train))
train_prediction = pd.DataFrame(
    stacked_proba[:, :, 1].T,
    index=y_train.index,
    columns=y_train.columns,
)

# Log loss on the training data
print("Log loss:", log_loss(y_train, train_prediction))

Log loss: 2.2218803377211254


In [None]:
# Training-set probabilities from the random-forest grid search, reshaped so
# rows follow y_train's index and columns follow y_train's target columns.
stacked_proba = np.array(grid_rf.predict_proba(X_train))
train_prediction = pd.DataFrame(
    stacked_proba[:, :, 1].T,
    index=y_train.index,
    columns=y_train.columns,
)

# Log loss of the grid-searched forest on the data it was trained on
print("Log loss:", log_loss(y_train, train_prediction))

### Test Prediction

In [None]:
# Test-set probabilities, reshaped so rows follow y_test's index and columns
# follow y_test's target columns.
# NOTE(review): `pipeline` wraps the decision tree, not the random forest
# (`rf_pipeline`) — confirm this is the intended model for this section.
stacked_proba = np.array(pipeline.predict_proba(X_test))
y_pred = pd.DataFrame(
    stacked_proba[:, :, 1].T,
    index=y_test.index,
    columns=y_test.columns,
)

# Wherever the test targets already contain a 1, force the prediction to 1
already_one = (y_test == 1)
y_pred[already_one] = 1
y_pred

## Submission File

In [37]:
def zindi_submission(predictions, filename):
    '''
    Write a table of predictions to a CSV file in the two-column submission format.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Wide table whose index holds the IDs, whose column names are the
        insurance product names, and whose values are probabilities.
    filename : str
        Path of the CSV file to write.

    Returns
    -------
    None
        Nothing is returned; the result is written to `filename` with two
        columns: "ID X PCODE" (the ID and the product name joined as
        "<ID> X <product>") and "Label" (the probability).
    '''

    # promote the index to an ordinary column; it becomes the first column
    predictions = predictions.reset_index()
    index_name = predictions.columns[0]

    # wide -> long: one row per (ID, product) pair, with the probability in "Label"
    melted_preds = predictions.melt(id_vars=index_name, var_name="Product", value_name="Label")

    # Cast both parts to str before joining, so that non-string IDs or column
    # names (e.g. an integer index) are concatenated instead of raising TypeError.
    id_x_pcode = melted_preds[index_name].astype(str) + " X " + melted_preds["Product"].astype(str)

    # assemble the two output columns and export without the row index
    final = pd.DataFrame({"ID X PCODE": id_x_pcode, "Label": melted_preds["Label"]})
    final.to_csv(filename, index=False, header=True)

In [38]:
# Write the current y_pred to the shared Outputs folder in submission format.
# NOTE(review): the file name says "RF", but y_pred was last produced from
# `pipeline` (the decision-tree pipeline) — confirm the model matches the name.
zindi_submission(y_pred, "/content/drive/My Drive/Regression Challenge Shared Folder/Outputs/RF_no_tune.csv")