# Import the necessary libraries

In [11]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.svm import SVC

# Load the training data
Using the pandas library, we read the training data into a dataframe. The task is to classify each sample according to the target variable TARGET_FLAG, a binary variable with values 0 or 1.

In [12]:
# Column labels applied to the CSV, in file order.
column_names = [
    'INDEX', 'TARGET_FLAG', 'TARGET_AMT', 'KIDSDRIV', 'AGE', 'HOMEKIDS',
    'YOJ', 'INCOME', 'PARENT1', 'HOME_VAL', 'MSTATUS', 'SEX', 'EDUCATION',
    'JOB', 'TRAVTIME', 'CAR_USE', 'BLUEBOOK', 'TIF', 'CAR_TYPE', 'RED_CAR',
    'OLDCLAIM', 'CLM_FREQ', 'REVOKED', 'MVR_PTS', 'CAR_AGE', 'URBANICITY',
]
# header=None makes pandas treat the file's first line as an ordinary row
# under the labels above; the preprocessing cell discards that row.
data = pd.read_csv('train_auto.csv', names=column_names, header=None)

# Data preprocessing

In the provided dataset, there are two columns that are not useful for classification: INDEX (unique value for each sample) and TARGET_AMT (we are interested in classification and not regression).

There are four variables that are numerical (amounts in dollars) but are encoded as strings: 'BLUEBOOK', 'OLDCLAIM', 'INCOME', 'HOME_VAL'. For each of these variables, we convert the values to numbers by removing the dollar sign and the commas.

In [13]:
# Remove the two columns that are not used for classification.
data = data.drop(['INDEX', 'TARGET_AMT'], axis=1)
# Row 0 is the first line of the CSV, which was read in as data (presumably the
# file's own header line). .copy() detaches the slice from the original frame
# so the column assignments below do not raise SettingWithCopyWarning.
data = data[1:].copy()

# Convert dollar-amount strings such as "$12,345" to numbers.
cols = ['BLUEBOOK', 'OLDCLAIM', 'INCOME', 'HOME_VAL']
for col in cols:
    # regex=False: '$' must be removed as a literal character; as a regular
    # expression it would be the end-of-string anchor and match nothing.
    stripped = (data[col]
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False))
    data[col] = pd.to_numeric(stripped)

## Handling missing values
Observing the dataset, we see that there are null values (encoded as NaN in pandas) for some of the features: AGE, YOJ, INCOME, HOME_VAL, JOB, and CAR_AGE. For the numeric features, we replace these missing values with the median value of that column. For the feature that has a string value (JOB), we replace the missing values with the most frequently occurring job. After doing this, we see that there are no more missing values in the dataset.

In [14]:
# Null counts per column before imputation.
missing_feature_counts = data.isnull().sum()
print("Missing values for each feature:\n{}".format(missing_feature_counts))

# Numeric features with gaps. Each column is first converted to a numeric
# dtype (unparseable entries become NaN) so that filling with the median does
# not leave a column that mixes strings and floats.
numeric_cols_with_gaps = ['AGE', 'YOJ', 'INCOME', 'HOME_VAL', 'CAR_AGE']
medians = {}
for col in numeric_cols_with_gaps:
    data[col] = pd.to_numeric(data[col], errors='coerce')
    medians[col] = data[col].median()
    data[col] = data[col].fillna(medians[col])

# Keep the individual names: the test-set cell reuses these training medians.
med_age = medians['AGE']
med_yoj = medians['YOJ']
med_income = medians['INCOME']
med_home = medians['HOME_VAL']
med_car = medians['CAR_AGE']

# JOB is categorical: fill with the most frequent value, computed from the
# data instead of being hard-coded (mode() ignores NaN; [0] picks the first
# mode if several values tie).
med_job = data['JOB'].mode()[0]
data['JOB'] = data['JOB'].fillna(med_job)

# Null counts per column after imputation.
missing_feature_counts = data.isnull().sum()
print("Missing values for each feature:\n{}".format(missing_feature_counts))

Missing values for each feature:
TARGET_FLAG      0
KIDSDRIV         0
AGE              6
HOMEKIDS         0
YOJ            454
INCOME         445
PARENT1          0
HOME_VAL       464
MSTATUS          0
SEX              0
EDUCATION        0
JOB            526
TRAVTIME         0
CAR_USE          0
BLUEBOOK         0
TIF              0
CAR_TYPE         0
RED_CAR          0
OLDCLAIM         0
CLM_FREQ         0
REVOKED          0
MVR_PTS          0
CAR_AGE        510
URBANICITY       0
dtype: int64
Missing values for each feature:
TARGET_FLAG    0
KIDSDRIV       0
AGE            0
HOMEKIDS       0
YOJ            0
INCOME         0
PARENT1        0
HOME_VAL       0
MSTATUS        0
SEX            0
EDUCATION      0
JOB            0
TRAVTIME       0
CAR_USE        0
BLUEBOOK       0
TIF            0
CAR_TYPE       0
RED_CAR        0
OLDCLAIM       0
CLM_FREQ       0
REVOKED        0
MVR_PTS        0
CAR_AGE        0
URBANICITY     0
dtype: int64


## Function to normalise the data

In [15]:
def normalise_data(dataframe, scaler=None):
    """Scale every column of ``dataframe`` and return the result as a new DataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose ``.values`` are passed to the scaler.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, only ``scaler.transform`` is
        applied, so held-out data can be scaled with statistics learned on the
        training data. When omitted (the default), a new ``MinMaxScaler`` is
        fitted on ``dataframe`` itself, exactly as before.

    Returns
    -------
    pandas.DataFrame
        The scaled values. Row and column labels of the input are not carried
        over; the result has default integer labels.
    """
    x = dataframe.values
    if scaler is None:
        x_scaled = preprocessing.MinMaxScaler().fit_transform(x)
    else:
        x_scaled = scaler.transform(x)
    return pd.DataFrame(x_scaled)

## Handling categorical variables
In the dataset, we have multiple variables that have categorical values: 'PARENT1', 'MSTATUS', 'SEX', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'JOB'. Before we give the data as input to any machine learning algorithm, we need to convert these variables to have numeric values. We use one-hot encoding to convert these categorical variables into numeric ones.

In [16]:
# Shape before encoding.
print(data.shape)

categorical_cols = [
    'PARENT1', 'MSTATUS', 'SEX', 'EDUCATION', 'CAR_USE',
    'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'JOB',
]
# Replace each listed column with one indicator column per category.
data = pd.get_dummies(data, columns=categorical_cols)

# Shape after encoding.
print(data.shape)

(8161, 24)
(8161, 47)


## Imbalanced data
Observing the count of the target variable, we see that the classes are imbalanced: around 6k examples belong to class 0 and around 2k examples belong to class 1. We explore three strategies to handle this: under sampling, over sampling and leaving the data as it is. We finally use the oversampled data for our classification, as it resulted in the best performance.

In [17]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Separate the target from the features.
labels = data['TARGET_FLAG'].copy()
data = data.drop(['TARGET_FLAG'], axis=1)
labels = pd.to_numeric(labels)

# Under-sampling: randomly drop majority-class rows. Seeded so that a
# re-run produces the same resampled set.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
data_us, labels_us = undersample.fit_resample(data, labels)
data_us = normalise_data(data_us)

# Over-sampling: randomly duplicate minority-class rows.
# This must call the over-sampler; calling `undersample` here would silently
# produce a second under-sampled set under the name data_os.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
data_os, labels_os = oversample.fit_resample(data, labels)
data_os = normalise_data(data_os)

ImportError: cannot import name 'MultiOutputMixin' from 'sklearn.base' (//anaconda3/envs/tensorflow_cpu/lib/python3.7/site-packages/sklearn/base.py)

## Normalise and split the dataset into training and validation

In [None]:
# data_os was already passed through normalise_data when it was built, so this
# second pass should leave the values effectively unchanged.
data = normalise_data(data_os)

# np.asarray handles both return types of the sampler (Series or ndarray).
labels_array = np.asarray(labels_os)

# Hold out 25% for validation. random_state makes the split reproducible and
# stratify keeps the class proportions equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    data.values,
    labels_array,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=labels_array,
)

# Training and validation

## Random forest classifier

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is not listed: for classifiers it was an alias of 'sqrt' (so it only
# duplicated a candidate) and it was removed in scikit-learn 1.3.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# Base model to tune; seeded so that repeated runs build the same forests.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
# Random search over 30 sampled parameter settings, 3-fold cross validation,
# selecting by F1 score.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 30, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring='f1')
# Fit the random search model
rf_random.fit(x_train, y_train)

# Evaluate the trained model on the validation data using confusion matrix and f-score.
y_pred = rf_random.predict(x_test)
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
print(f1_score(y_test, y_pred))

## Support vector machines

In [None]:
# Different parameters for the SVM: kernel, gamma, and C
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
# Exhaustive grid search. scoring='f1' selects the best setting by the same
# metric that is reported below and used by the other searches in this
# notebook (the default would select by accuracy). n_jobs=-1 runs the
# candidate fits in parallel.
svm = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, scoring='f1', n_jobs=-1)
# Fit the model to the training data
svm.fit(x_train, y_train)

# Evaluate the trained model on the validation data using confusion matrix and f-score.
y_pred = svm.predict(x_test)
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
print(f1_score(y_test, y_pred))

## Logistic regression

In [None]:
# Logistic regression with built-in 5-fold cross validation.
lreg = LogisticRegressionCV(cv=5, random_state=0)
lreg.fit(x_train, y_train)

# Evaluate on the validation data using confusion matrix and f-score.
y_pred = lreg.predict(x_test)
validation_confusion = confusion_matrix(y_test, y_pred, labels=[0, 1])
validation_f1 = f1_score(y_test, y_pred)
print(validation_confusion)
print(validation_f1)

## Gradient boosting classifier

In [None]:
# Number of boosting stages
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
# Maximum tree depth: integers 1..32 (the estimator expects an int, not a float)
max_depths = list(range(1, 33))
# Learning rates
lrs = [0.01, 0.05, 0.1, 0.25, 0.5, 1]
# Minimum samples to split a node / to form a leaf, given as fractions of the
# number of training samples
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True).tolist()
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True).tolist()

# The grid must use the candidate lists defined in this cell. The similarly
# named min_samples_split / min_samples_leaf belong to the random-forest cell.
random_grid = {'learning_rate': lrs,
               'n_estimators': n_estimators,
               'max_depth': max_depths,
               'min_samples_split': min_samples_splits,
               'min_samples_leaf': min_samples_leafs,
                }

gbm = GradientBoostingClassifier()
# Random search over 30 sampled parameter settings, 3-fold cross validation,
# selecting by F1 score.
gbm_random = RandomizedSearchCV(estimator = gbm, param_distributions = random_grid, n_iter = 30, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring='f1')
gbm_random.fit(x_train, y_train)

# Evaluate the trained model on the validation data using confusion matrix and f-score.
y_pred = gbm_random.predict(x_test)
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
print(f1_score(y_test, y_pred))

## Bagging classifier

In [None]:
# Number of base estimators in the bagging ensemble
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Whether samples are drawn with replacement for each base estimator
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'bootstrap': bootstrap}
print(random_grid)

# The grid holds only len(n_estimators) * len(bootstrap) = 20 combinations.
# Asking RandomizedSearchCV for more iterations than that raises an error on
# older scikit-learn releases and a warning on newer ones, so cap n_iter.
n_candidates = min(30, len(n_estimators) * len(bootstrap))

bc = BaggingClassifier()
bc_random = RandomizedSearchCV(estimator = bc, param_distributions = random_grid, n_iter = n_candidates, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring='f1')
bc_random.fit(x_train, y_train)

# Evaluate the trained model on the validation data using confusion matrix and f-score.
y_pred = bc_random.predict(x_test)
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
print(f1_score(y_test, y_pred))

# Evaluate the best model on the test set
We repeat the same preprocessing steps on the test set: drop the unnecessary columns, convert dollar values to numbers, fill the missing values with the median values from the training data, and convert the categorical variables into numeric ones. After the preprocessing, the predicted labels are stored in the variable y_pred.

In [None]:
column_names = ['INDEX', 'TARGET_FLAG', 'TARGET_AMT', 'KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'INCOME', 'PARENT1', 'HOME_VAL', 'MSTATUS', 'SEX', 'EDUCATION', 'JOB', 'TRAVTIME', 'CAR_USE', 'BLUEBOOK', 'TIF', 'CAR_TYPE', 'RED_CAR', 'OLDCLAIM', 'CLM_FREQ', 'REVOKED', 'MVR_PTS', 'CAR_AGE', 'URBANICITY']
test_data = pd.read_csv('test_auto.csv', header=None, names=column_names)
# Row 0 is the first line of the CSV, read in as data; discard it.
test_data = test_data[1:]
# .copy() gives an independent frame so the column assignments below do not
# raise SettingWithCopyWarning.
test_data = test_data.drop(['INDEX', 'TARGET_FLAG', 'TARGET_AMT'], axis=1).copy()

# Convert dollar-amount strings such as "$12,345" to numbers.
cols = ['BLUEBOOK', 'OLDCLAIM', 'INCOME', 'HOME_VAL']
for col in cols:
    # regex=False: '$' must be removed as a literal character; as a regular
    # expression it would be the end-of-string anchor and match nothing.
    stripped = (test_data[col]
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False))
    test_data[col] = pd.to_numeric(stripped)

# Fill gaps with the values computed on the training data. Each column gets
# its own training statistic (YOJ uses med_yoj, not the AGE median).
test_data['YOJ'] = test_data['YOJ'].fillna(med_yoj)
test_data['AGE'] = test_data['AGE'].fillna(med_age)
test_data['INCOME'] = test_data['INCOME'].fillna(med_income)
test_data['HOME_VAL'] = test_data['HOME_VAL'].fillna(med_home)
test_data['JOB'] = test_data['JOB'].fillna(med_job)
test_data['CAR_AGE'] = test_data['CAR_AGE'].fillna(med_car)

print(test_data.shape)
categorical_cols = ['PARENT1', 'MSTATUS', 'SEX', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'JOB']
# NOTE(review): get_dummies only creates columns for categories present in
# this file; confirm the resulting columns match the training features in
# number and order before trusting the predictions.
test_data = pd.get_dummies(test_data, columns = categorical_cols)
print(test_data.values.shape)

# NOTE(review): this scales the test set by its own min/max, not by the ranges
# seen during training. TODO: scale with a scaler fitted on the training data.
test_data = normalise_data(test_data)
y_pred = gbm_random.predict(test_data.values)