In [None]:
# importing packages
import pandas as pd

#dvierse
from string import ascii_letters
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# for preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from scipy import stats
from sklearn import preprocessing
from pprint import pprint

# for prediciting
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LassoCV



# for assessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# importing data
df_raw = pd.read_csv('C:/Users/Marc/Dropbox/06_ESCP/01_Uni/05_Term 3/04_HDI/01_Code/01_Input/210517_syn_data.csv')

In [None]:
# drop not necessary index
df_raw = df_raw.drop(columns = ['Unnamed: 0','CONTRACT_ID'])


Approach:
- Dataset understanding
    - Split between Contract info, Customer info, property info, contract part, product description, damage description, damage target
    - Continuous vs Categorical Data
    - Multicollinearity?
- Check Cleaning & Scale
    - Data received clean, hence no cleaning to be performed
        - MinMaxScaler
        - StandardScaler
        - RobustScaler
        - Outlier reduction based on z-score
        - Bucketing for Neural Network

- Creation of Algorithms (Tried using lazy predict)
    - Multilinear Regression
    - XG Boost
    - Random Forest
    - Multilayer Perceptron
    - Lasso Regression

- Parameter Tuning (What do we want to optimize)
    - GridSearchCV
    - RandomizedSearchCV
- Conclusion Discussion and Interpretation

- More: Azure documentation as appendix


In [None]:
df_for_corr = df_raw.drop(columns = ['DAMAGE_TARGET'])
# Multicolinearity

# Correltation Heatmap
sns.set_theme(style="white")

# Compute the correlation matrix
corr = df_for_corr.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Filtering out the columns that have a correlation of more than 0.6
abs_corr = corr.abs()
abs_corr_filtered = abs_corr[abs_corr > 0.5]
abs_corr_filtered.loc[:, (abs_corr_filtered.sum() > 2.2)].columns

In [None]:
### Only Contract Info 1 has an multicolinearity of more than 0.6 (0.6 + 0.6 + 1, because of with itself and the two parts of the trinangle)
### Drop it

df_raw = df_raw.drop(columns=['CONTRACT_INFO_1'])

In [None]:
df_raw.describe()

### mean and standard deviation are not around 0, hence scaling has to be done

In [None]:
# Selecting all numerical features to scale
numerical_to_scale = ['CONTRACT_INFO_0','CONTRACT_INFO_7', 'CUSTOMER_INFO_1', 'CUSTOMER_INFO_2', 'CUSTOMER_INFO_3', 'DAMAGE_DESCRIPTION_0', 'DAMAGE_DESCRIPTION_1' ]
num_to_s = df_raw[numerical_to_scale]

Data Exploration for features with continuous data

In [None]:
# boxpltos
boxp = sns.boxplot(data=num_to_s,palette="colorblind")
boxp.set_xticklabels(boxp.get_xticklabels(),rotation=90)

### hence devation must be very high

In [None]:
# check devation
num_to_s.describe()

### Scaling to be done

In [None]:
# split X from y, as target variable should remain original values and not adjusted to the scale
y = df_raw.DAMAGE_TARGET.values
X = df_raw.drop(columns = ['DAMAGE_TARGET'])
X_not_num = X.drop(columns=numerical_to_scale)

In [None]:
# to add columns back after scaling, as when scaling the column names are lost
columns = num_to_s.columns

# Standard Scaler

In [None]:
### No Scaling - bad option as data is not normal distributed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Standard Scaler
scaler_ss = StandardScaler()
df_ss_num = pd.DataFrame(scaler_ss.fit_transform(num_to_s))
df_ss_num.columns = columns

# SS_boxpltos
boxp_ss = sns.boxplot(data=df_ss_num,palette="colorblind")
boxp_ss.set_xticklabels(boxp_ss.get_xticklabels(),rotation=90)

# add back to non numerical
df_ss = pd.concat([df_ss_num, X_not_num], axis = 1)

# Split into train and test sets
X_train_ss, X_test_ss, y_train_ss, y_test_ss = train_test_split(df_ss, y, test_size=0.3, random_state=42)

### Still outliers for Contract_info_7 and Customer_info 1,2,3

# Robust Scaler

In [None]:
scaler_rs = RobustScaler()
df_rs_num = pd.DataFrame(scaler_rs.fit_transform(num_to_s))
df_rs_num.columns = columns

# RS_boxpltos
boxp_rs = sns.boxplot(data=df_rs_num,palette="colorblind")
boxp_rs.set_xticklabels(boxp_rs.get_xticklabels(),rotation=90)

# add back to non numerical
df_rs = pd.concat([df_rs_num, X_not_num], axis = 1)

# Split into train and test sets
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(df_rs, y, test_size=0.3, random_state=42)

### Still outliers for Contract_info_7 and Customer_info 1,2,3

# Min max Scaled

In [None]:
scaler_mm = MinMaxScaler()
df_min_max_num = pd.DataFrame(scaler_mm.fit_transform(num_to_s))
df_min_max_num.columns = columns

# MM_boxpltos
boxp_mm = sns.boxplot(data=df_min_max_num,palette="colorblind")
boxp_mm.set_xticklabels(boxp_mm.get_xticklabels(),rotation=90)

# add back to non numerical
df_min_max = pd.concat([df_min_max_num, X_not_num], axis = 1)

# Split into train and test sets
X_train_mm, X_test_mm, y_train_mm, y_test_mm = train_test_split(df_min_max, y, test_size=0.3, random_state=42)

### Still outliers for Contract_info_7 and Customer_info 1,2,3

# Z-score approach
As with standard scaling methods, outliers remain. New approach of Z-Score
https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba

The Z-score is the signed number of standard deviations by which the value of an observation or data point is above the mean value of what is being observed or measured.

In [None]:
z = np.abs(stats.zscore(num_to_s))
# to delete outliers
df_z_score = num_to_s[(z < 2).all(axis=1)]

### Standard Scaler
scaler_zs = StandardScaler()
df_zs_num = pd.DataFrame(scaler_zs.fit_transform(df_z_score))
df_zs_num.columns = columns

# ZS_boxpltos
boxp_zs = sns.boxplot(data=df_zs_num,palette="colorblind")
boxp_zs.set_xticklabels(boxp_zs.get_xticklabels(),rotation=90)

### Usally take z = 3 as like that around 99% of the data should be covered. When looked at the boxplots, there are however still outliers. Hence, approach could either adjust z further, or try predicition without customer info 2

In [None]:
# add back to non numerical
df_zs = df_zs_num.join(X_not_num, how = 'left')

In [None]:
# Split into train and test sets
X_train_ss, X_test_ss, y_train_ss, y_test_ss = train_test_split(df_ss, y, test_size=0.3, random_state=42)

Outliers very hard to scale, hence furhter apporach analyzed:
- Bucketing of continuous features into ranges
- Display them as integers
- Run NN, that will simply adjust weights of nodes accordingly (almost none for outliers)

# Bucketing

In [None]:
# objective to optimize q, as this defines number of buckets

for i in numerical_to_scale:
    X['quantile_' + i] = pd.qcut(X[i],q=1000, labels = False, duplicates= 'drop')

X_final = X.drop(columns = numerical_to_scale)

In [None]:
#checking out valuecounts for different buckets - should outliers have their own buckets?
X_final.quantile_DAMAGE_DESCRIPTION_1.value_counts().tail()


# Multilayer Perceptron

In [None]:
## Mnual feature tuning by amount of bins (previous block of code) and the amount of hidden layers of neuron and also their amount)

X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_final, y,random_state=1)

regr = MLPRegressor(random_state=1, max_iter=500, hidden_layer_sizes=(100,50,25)).fit(X_train_nn, y_train_nn)

y_pred = regr.predict(X_test_nn)

regr.score(X_test_nn, y_test_nn)


# Simple other models with scaled data (no bucketing), as R2 of MLP is very low

In [None]:
lr = LinearRegression()
lr.fit(X_train_rs, y_train_rs)
y_pred_lr = lr.predict(X_test_rs)
lr.score(X_test_rs, y_test_rs)

# XGboost

In [None]:
# XGBoost does not work with objects
# No features are objects
# Approach: See which parameters worked best - add more parameters on the end which best param was found

# Various hyper-parameters to tune
xgb1 = XGBRegressor()
xgb1.get_params()
xgb1.fit(X_train_rs, y_train_rs)

In [None]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.02, .025, .03], #so called `eta` value
              'max_depth': [3], #best
              'min_child_weight': [2,2.5,3],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

xgb_grid.fit(X_train_rs,
         y_train_rs)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)


# Lasso Regression
Approach:
- Different alphas to see which would result in best results

In [None]:
lasso = Lasso()
parameters = {'alpha':[1.4,1.45,1.5]}

lasso_grid = GridSearchCV(lasso,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

lasso_grid.fit(X_train_rs,
         y_train_rs)

print(lasso_grid.best_score_)
print(lasso_grid.best_params_)


# RandomForrest Regressor
https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [None]:
rf = RandomForestRegressor(random_state = 42)
rf.get_params()

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
rf_param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = rf_param, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [None]:
# Fit the random search model
rf_random.fit(X_train_rs, y_train_rs)
rf_random.best_params_



# Lasso Regression to choose features based on coefficient


In [None]:
## Applying the best params of GridCV above

from sklearn import linear_model
clf = linear_model.Lasso(alpha=1.45)
clf.fit(X_train_rs,
         y_train_rs)
Lasso(alpha=0.1)
importance = np.abs(clf.coef_)

feature_names = np.array(X_train_rs.columns)
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via coefficients")
plt.xticks(rotation=90)
plt.show()

In [None]:
## Filtering out all features with coefficiants above 20 (manually chosen)
coeff = pd.DataFrame(importance).T
coeff.columns = X_train_rs.columns
coeff_important = coeff.T[coeff.T >= 20].dropna().T
new_cols_20 = coeff_important.columns

In [None]:
lasso_20 = Lasso(alpha = 1.45).fit(X_train_rs[new_cols_20], y_train_rs)
lasso_20.score(X_test_rs[new_cols_20], y_test_rs)



In [None]:
## Applying the chose features from lasso and perform XGBoost
xgb20 = XGBRegressor()

parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.02, .025, .03], #so called `eta` value
              'max_depth': [3], #best
              'min_child_weight': [2,2.5,3],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb_grid_20 = GridSearchCV(xgb20,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

xgb_grid_20.fit(X_train_rs[new_cols_20],
         y_train_rs)

print(xgb_grid_20.best_score_)
print(xgb_grid_20.best_params_)


In [None]:
## Filtering out all features with coefficiants above 50 (manually chosen)
coeff = pd.DataFrame(importance).T
coeff.columns = X_train_rs.columns
coeff_important = coeff.T[coeff.T >= 50].dropna().T
new_cols_50 = coeff_important.columns

In [None]:
lasso_50 = Lasso(alpha = 1.45).fit(X_train_rs[new_cols_50], y_train_rs)
lasso_50.score(X_test_rs[new_cols_50], y_test_rs)

In [None]:
## Applying the chose features from lasso and perform XGBoost
xgb50 = XGBRegressor()

parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.02, .025, .03], #so called `eta` value
              'max_depth': [3], #best
              'min_child_weight': [2,2.5,3],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb_grid_50 = GridSearchCV(xgb50,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

xgb_grid_50.fit(X_train_rs[new_cols_50],
         y_train_rs)

print(xgb_grid_50.best_score_)
print(xgb_grid_50.best_params_)


In [269]:
MLP_score = [regr.score(X_test_nn, y_test_nn)]
overview_test = pd.DataFrame(MLP_score, columns = ['MLP_score'])

In [270]:
MLP_score_train = [regr.score(X_train_nn, y_train_nn)]
overview_train = pd.DataFrame(MLP_score_train, columns = ['MLP_score'])


In [271]:
xgb_hyp_50_score = [xgb_grid_50.score(X_test_rs[new_cols_50], y_test_rs)]
overview_test['xgb_hyp_50_score'] = pd.DataFrame(xgb_hyp_50_score)


In [272]:
xgb_hyp_50_score_train = [xgb_grid_50.score(X_train_rs[new_cols_50], y_train_rs)]
overview_train['xgb_hyp_50_score'] = pd.DataFrame(xgb_hyp_50_score_train)

In [273]:
xgb_hyp_20_score = [xgb_grid_20.score(X_test_rs[new_cols_20], y_test_rs)]
overview_test['xgb_hyp_20_score'] = pd.DataFrame(xgb_hyp_20_score)

In [274]:
xgb_hyp_20_score_train = [xgb_grid_20.score(X_train_rs[new_cols_20], y_train_rs)]
overview_train['xgb_hyp_20_score'] = pd.DataFrame(xgb_hyp_20_score_train)

In [275]:
xgb_hyp_score = [xgb_grid.score(X_test_rs, y_test_rs)]
overview_test['xgb_hyp_score'] = pd.DataFrame(xgb_hyp_score)

In [276]:
xgb_hyp_score_train = [xgb_grid.score(X_train_rs, y_train_rs)]
overview_train['xgb_hyp_score'] = pd.DataFrame(xgb_hyp_score_train)

In [277]:
xgb_score = [xgb1.score(X_test_rs, y_test_rs)]
overview_test['xgb_score'] = pd.DataFrame(xgb_score, columns = ['xgb_score'])

In [278]:
xgb_score_train = [xgb1.score(X_train_rs, y_train_rs)]
overview_train['xgb_score'] = pd.DataFrame(xgb_score_train)


In [279]:
lasso_hyp_50_score = [lasso_50.score(X_test_rs[new_cols_50], y_test_rs)]
overview_test['lasso_hyp_50_score'] = pd.DataFrame(lasso_hyp_50_score)

In [280]:
lasso_hyp_50_score_train = [lasso_50.score(X_train_rs[new_cols_50], y_train_rs)]
overview_train['lasso_hyp_50_score'] = pd.DataFrame(lasso_hyp_50_score_train)

In [281]:
lasso_hyp_20_score = [lasso_20.score(X_test_rs[new_cols_20], y_test_rs)]
overview_test['lasso_hyp_20_score'] = pd.DataFrame(lasso_hyp_20_score)

In [282]:
lasso_hyp_20_score_train = [lasso_20.score(X_train_rs[new_cols_20], y_train_rs)]
overview_train['lasso_hyp_20_score'] = pd.DataFrame(lasso_hyp_20_score_train)

In [283]:
lasso_hyp_score = [lasso_grid.score(X_test_rs, y_test_rs)]
overview_test['lasso_hyp_score'] = pd.DataFrame(lasso_hyp_score)

In [284]:
lasso_hyp_score_train = [lasso_grid.score(X_train_rs, y_train_rs)]
overview_train['lasso_hyp_score'] = pd.DataFrame(lasso_hyp_score_train)


In [285]:
rf_hyp_score = [rf_random.score(X_test_rs, y_test_rs)]
overview_test['rf_hyp_score'] = pd.DataFrame(rf_hyp_score)


In [286]:
rf_hyp_score_train = [rf_random.score(X_train_rs, y_train_rs)]
overview_train['rf_hyp_score'] = pd.DataFrame(rf_hyp_score_train)

In [287]:
lr_score = [lr.score(X_test_rs, y_test_rs)]
overview_test['lr_score'] = pd.DataFrame(lr_score, columns = ['lr_score'])

In [288]:
lr_score_train = [lr.score(X_train_rs, y_train_rs)]
overview_train['lr_score'] = pd.DataFrame(lr_score)



In [289]:
overview_test = overview_test.T.rename(columns = {0:'overview_test'})
overview_train = overview_train.T.rename(columns = {0:'overview_train'})

In [290]:
overview = overview_train.join(overview_test, how = 'left')

In [291]:
overview

Unnamed: 0,overview_train,overview_test
MLP_score,0.019184,0.015341
xgb_hyp_50_score,0.037779,0.032072
xgb_hyp_20_score,0.049091,0.039977
xgb_hyp_score,0.072682,0.055294
xgb_score,0.269324,0.022227
lasso_hyp_50_score,0.023906,0.023907
lasso_hyp_20_score,0.026382,0.027225
lasso_hyp_score,0.028869,0.029547
rf_hyp_score,0.437097,0.046946
lr_score,0.029216,0.029216
