In [None]:
import pandas as pd
import numpy as np
from numpy.random import shuffle
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from scipy.stats import zscore

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer


# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()

%matplotlib inline

In [None]:
# Read the labelled training table and the unlabelled test table from the input directory.
train_path = '../input/criticalitypredictioninsurance/train.csv'
test_path = '../input/criticalitypredictioninsurance/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Preview the first rows of the training data.
train_df.head()

### Separate columns

As we can see, there are 120 features. We now dig into the dataframe by grouping the columns according to their number of unique values:

First, I separate out all columns of type *"int64"*. The **Label** column is removed beforehand.

In [None]:
# Exclude the target column so that only features are inspected.
col_drop = ['Label']
tmp_df = train_df.drop(columns=col_drop)

# Partition the feature columns by dtype: int64 on one side, everything else on the other.
numerics = ['int64']
train_df_2 = tmp_df.select_dtypes(exclude=numerics)  # remaining columns
train_df_1 = tmp_df.select_dtypes(include=numerics)  # int64 columns

Now let's keep in *train_df_1* only the features with at most 10 unique values; the remaining int64 columns will be appended to the *train_df_2* dataframe.

In [None]:
# Number of distinct values per int64 column, held as a one-column frame.
train_df_11 = train_df_1.nunique().to_frame(name='unique_val')

# Separate the columns at a cardinality of 10: at most 10 vs. more than 10 distinct values.
low_cardinality = train_df_11['unique_val'] <= 10
train_df_12 = train_df_11[low_cardinality]
train_df_13 = train_df_11[~low_cardinality]

In [None]:
# Column names of the two cardinality groups computed above.
low_card_cols = train_df_12.index
high_card_cols = train_df_13.index

# train_df_1 keeps the low-cardinality int64 columns; the high-cardinality
# int64 columns are appended to the non-int64 columns in train_df_2.
train_df_1 = train_df[low_card_cols]
train_df_2 = train_df_2.join(train_df[high_card_cols])

print('train_df_1 contains', train_df_1.shape[1],'columns')
print('train_df_2 contains', train_df_2.shape[1],'columns')

Below is a bar plot for train_df_1, showing that each of its 97 features has either 2 or 3 unique values.

In [None]:
# Bar chart of the distinct-value count for every low-cardinality column.
unique_counts = train_df_12['unique_val']

fig, ax = plt.subplots(figsize=(12, 3))
sns.barplot(x=unique_counts.index, y=unique_counts, palette="rocket", ax=ax)

# Rotate and shrink the column-name tick labels so they stay readable.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90, size=8)

plt.tight_layout()
plt.show()

This dataframe contains no NaN values:

In [None]:
# Sum the per-column missing-value counts into one total for train_df_1.
missing_per_column = train_df_1.isna().sum()
n_1 = missing_per_column.sum()
print('total number of NaN in train_df_1 dataframe is:', n_1)

Now let's focus on train_df_2 dataframe containing 23 features:

In [None]:
# Preview the first rows of the second feature group.
train_df_2.head()

However, our second dataframe contains many missing values, shown below as a percentage per column:

In [None]:
# Share of missing values per column of train_df_2, expressed in percent.
n_rows = len(train_df_2)
n_2 = train_df_2.isna().sum()
n_2_p = n_2 * 100 / n_rows
n_2_p.round(2)

For simplicity, we exclude every feature that has missing values.

In [None]:
# Names of the columns whose missing-value percentage is non-zero.
# The names come from n_2_p's own index rather than from positions in
# train_df_2.columns, so they stay correct even after train_df_2 has shrunk.
has_missing = n_2_p > 0
col_drop_index = np.flatnonzero(has_missing)  # positional indices, kept for reference
col_drop_names = n_2_p.index[has_missing]

# errors='ignore' keeps this cell re-runnable once the columns are already gone.
train_df_2 = train_df_2.drop(columns=col_drop_names, errors='ignore')

Below we split **train_df_2** into two separate dataframes: one holding the *object* column and one holding the *numeric* columns.

In [None]:
# 'Info_prod_2' goes into train_df_21; every other column stays in train_df_22.
categorical_col = 'Info_prod_2'
train_df_21 = train_df_2[categorical_col]
train_df_22 = train_df_2.drop(columns=[categorical_col])

In [None]:
# Standardise each column with scipy's zscore so all columns share one scale.
df_zscore = train_df_22.apply(zscore)

# One violin per column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(data=df_zscore, palette="Set3", bw=.4, cut=0, linewidth=.4, ax=ax)

# Tilt the column-name tick labels to avoid overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45)
plt.show()

**To conclude:** train_df has
<br> 120 features in total, of which
<br> 97 features have only 2 or 3 unique values;
<br> of the 23 remaining features,
<br> 13 were removed because they contain missing values,
<br> 10 features are numeric (their distributions are shown as violin plots), and
<br> 1 feature is categorical.
<br> Finally, the *Label* column contains 6 unique values.

**Next steps:**
<br><br> 1.1) remove the 13 features with NaN from *train_df* and *test_df*
<br> 1.2) convert *Info_prod_2* column which is the only categorical feature, to numeric format.
<br><br> 2) split **train_df** to new *train and test* datasets
<br> 3) run the multi-class classification
<br> 3.1) create the model
<br> 3.2) train the model
<br> 3.3) test the model
<br> 3.4) compute the performance of algorithm using a suitable metric
<br> 4.1) test the trained model on **test_df**
<br> 4.2) compute the performance

#### 1.1) drop NaN features from both train_df and test_df

In [None]:
# Remove the columns flagged as containing missing values from both tables.
train_df, test_df = (
    frame.drop(columns=col_drop_names) for frame in (train_df, test_df)
)

In [None]:
# Total number of missing cells left in each table after dropping the flagged columns.
train_missing = train_df.isna().sum()
test_missing = test_df.isna().sum()
n_1 = train_missing.sum()
n_2 = test_missing.sum()

print('train_df contains:', n_1,'NaN')
print('test_df contains: ', n_2,' NaN')

#### 1.2) convert Info_prod_2 column from object to numeric

In [None]:
 # Creating a instance of label Encoder.
le = LabelEncoder()

# Fit the encoder on the categories found in EITHER table. An encoder fitted on
# train_df alone makes le.transform raise ValueError for any category that
# appears only in test_df. When test_df has no extra categories, the resulting
# codes are the same as with a train-only fit.
all_categories = pd.concat(
    [train_df['Info_prod_2'], test_df['Info_prod_2']], ignore_index=True
)
le.fit(all_categories)

# Replace the object column with its integer codes in both tables.
train_df['Info_prod_2'] = le.transform(train_df['Info_prod_2'])
test_df['Info_prod_2'] = le.transform(test_df['Info_prod_2'])

### 2) split train_df into train and test datasets

In [None]:
n = len(train_df)  # number of rows in the labelled data
training_size = 0.75  # fraction of rows kept for training; the rest is held out

# Shuffle the row positions with a fixed seed so that the split, and every
# metric computed from it, is identical on each run of the notebook.
rng = np.random.default_rng(42)
indices = rng.permutation(n)

# The first `split_point` shuffled positions form the training subset,
# the remaining ones the held-out subset.
split_point = int(n * training_size)
mytrain_i = indices[:split_point]
mytest_i = indices[split_point:]

# Materialise the two subsets from the shuffled positions.
new_train_df = train_df.iloc[mytrain_i]
new_test_df = train_df.iloc[mytest_i]

print("samples in the new training subset:",len(new_train_df))
print("samples in the new test subset:",len(new_test_df))

In [None]:
# Separate the feature matrix from the target for both subsets.

drop_cols = ['Label']

# Training subset: features as a DataFrame, target as a Series.
y_train = new_train_df['Label']
X_train = new_train_df.drop(columns=drop_cols)

# Held-out subset: the target is converted to a plain NumPy array.
y_test = new_test_df['Label'].to_numpy()
X_test = new_test_df.drop(columns=drop_cols)


In [None]:
# Rescale the features with a MinMaxScaler.
# The scaler is fitted on the training subset only and then applied to both subsets.
# (StandardScaler is imported as a possible alternative.)

scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### 3) run the multi-class classification

#### Hyperparameter tuning using grid search and cross validation

In [None]:
# Candidate values for each random-forest hyperparameter.
n_estimators = range(20,25)
max_depth = range(20,25)
min_samples_leaf = [2]
bootstrap = [True]

param_grid = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

clf = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid with 2-fold cross-validation on the training subset.
clf_model = GridSearchCV(estimator=clf, param_grid=param_grid, cv=2, verbose=10, n_jobs=-1)
clf_model.fit(X_train, y_train)

# No prediction is made here: the next cell fits the final model and produces
# the y_pred that the metrics use, so a prediction at this point would be discarded.
print("Using hyperparameters --> \n", clf_model.best_params_)

#### Set chosen hyperparameters to the model

In [None]:
# Fit a random forest with the tuned hyperparameters on the training subset.
# Every tuned value is taken from best_params_ rather than repeated by hand,
# so the model stays in sync with the search grid if the grid is changed.
clf = RandomForestClassifier(random_state=42, **clf_model.best_params_)

clf.fit(X_train, y_train)

# Predictions on the held-out subset, consumed by the metrics cell.
y_pred = clf.predict(X_test)

#### Performance metrics

In [None]:
# Overall accuracy on the held-out subset.
print('accuracy =        ', round(accuracy_score(y_test, y_pred),2))

# Precision, recall and F1, each reported with micro and macro averaging.
averaged_metrics = (
    ('micro precision = ', precision_score, 'micro'),
    ('macro precision = ', precision_score, 'macro'),
    ('micro recall =    ', recall_score, 'micro'),
    ('macro recall =    ', recall_score, 'macro'),
    ('micro f1_score =  ', f1_score, 'micro'),
    ('macro f1_score =  ', f1_score, 'macro'),
)
for caption, metric, averaging in averaged_metrics:
    print(caption, round(metric(y_test, y_pred, average=averaging),2))

### 4) test the trained model on original test_df

In [None]:
# Refit the tuned model on ALL labelled rows before predicting the real test set.
# The features are used unscaled here, both for fitting and for prediction.
# NOTE(review): the hold-out evaluation above used MinMax-scaled features;
# tree splits should depend only on feature ordering, but confirm this is intended.
X_train = train_df.drop(drop_cols, axis=1)
y_train = train_df['Label']

clf.fit(X_train, y_train)

# Predict once, using exactly the columns the model was fitted on. Selecting the
# columns explicitly keeps the cell re-runnable after 'Label' has been added to test_df.
feature_cols = X_train.columns
test_df['Label'] = clf.predict(test_df[feature_cols])

# 1-based index named 'ID'. Assigning a fresh RangeIndex (instead of adding 1 to the
# current index) gives the same result on the first run and does not drift on re-runs.
test_df.index = pd.RangeIndex(start=1, stop=len(test_df) + 1, name='ID')

In [None]:
# Write only the predicted labels, keyed by the frame's index, to a CSV file.
submission = test_df[['Label']]
submission.to_csv('my_prediction.csv', header=True, index=True)

print("Prediction complete. Saved as my_prediction.csv")