In [1]:
# modules
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Loading Data 

In [2]:
DATA_DIR = 'binary_classifier_data/'

train_ds = pd.read_csv(os.path.join(DATA_DIR, 'training.csv'), delimiter=';')
valid_ds = pd.read_csv(os.path.join(DATA_DIR, 'validation.csv'), delimiter=';')

In [14]:
# save a row to make predictions later
one_row_valid = valid_ds.sample(1)
one_row_valid.to_csv('one_row_valid.csv', index=False)
one_row_valid.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
93,b,4317,225,u,g,i,bb,75,t,f,0,f,g,560.0,0,5600000.0,f,0,no.


In [15]:
one_row_valid = pd.read_csv('one_row_valid.csv', decimal=',')
one_row_valid

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,b,43.17,0.000225,u,g,i,bb,0.75,t,f,0,f,g,560.0,0,5600000.0,f,0,no.


In [3]:
train_ds.shape, valid_ds.shape

((3700, 19), (200, 19))

In [4]:
train_ds.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,a,1792,0.00054,u,g,c,v,175,f,t,1,t,g,80.0,5,800000.0,t,0,no.
1,b,1692,0.00335,y,p,k,v,29,f,f,0,f,s,200.0,0,2000000.0,,0,no.
2,b,3125,1125.0,u,g,ff,ff,0,f,t,1,f,g,96.0,19,960000.0,t,0,no.
3,a,4817,1335.0,u,g,i,o,335,f,f,0,f,g,0.0,120,0.0,,0,no.
4,b,3233,35.0,u,g,k,v,5,f,f,0,t,g,232.0,0,2320000.0,f,0,no.


In [5]:
train_ds.dtypes

variable1      object
variable2      object
variable3      object
variable4      object
variable5      object
variable6      object
variable7      object
variable8      object
variable9      object
variable10     object
variable11      int64
variable12     object
variable13     object
variable14    float64
variable15      int64
variable17    float64
variable18     object
variable19      int64
classLabel     object
dtype: object

In [6]:
# Numeric and Non-numeric columns
numerical_vars = []
non_numerical_vars = []

features = dict(train_ds.dtypes)

for feature in features:
    if str(features[feature]) == "object":
        non_numerical_vars.append(feature)
    else:
        numerical_vars.append(feature)

print('Numerical Features:\n', numerical_vars)
print('Non-numerical features:\n', non_numerical_vars)

Numerical Features:
 ['variable11', 'variable14', 'variable15', 'variable17', 'variable19']
Non-numerical features:
 ['variable1', 'variable2', 'variable3', 'variable4', 'variable5', 'variable6', 'variable7', 'variable8', 'variable9', 'variable10', 'variable12', 'variable13', 'variable18', 'classLabel']


In [7]:
# unique values per non-numeric columns
for col in non_numerical_vars:
    print(col)
    print(train_ds[col].unique())

variable1
['a' 'b' nan]
variable2
['17,92' '16,92' '31,25' '48,17' '32,33' '34,83' '26,17' '21,17' '28,92'
 '18,17' '24,75' '31,75' '18,25' '17,58' '51,83' '26,67' '31,42' '25,67'
 '33,75' '32,75' '39,83' '40,33' '35,25' '24,33' '15,75' '45' '30,42' nan
 '17,25' '38,58' '22,67' '49' '27,83' '16,5' '36,67' '47,25' '33,67'
 '33,17' '25' '48,5' '37,42' '18' '56,5' '25,75' '22,58' '32' '40,83'
 '59,67' '42,5' '28,08' '37,75' '53,92' '47,67' '23,25' '54,58' '34,25'
 '27,58' '46' '60,92' '47,42' '37,33' '33,25' '24,58' '21,83' '15,17'
 '22,92' '69,5' '33' '23' '22' '20,75' '20,67' '48,75' '22,75' '42,25'
 '27,67' '28,75' '23,5' '29,5' '31,92' '18,83' '26,75' '27,17' '47,17'
 '23,75' '41,58' '26,33' '39,42' '44,83' '50,75' '19,17' '21,5' '25,58'
 '20,42' '44,17' '34,75' '20,5' '22,08' '42,83' '18,92' '52,83' '36,25'
 '20,33' '28,17' '23,92' '56' '30,5' '45,83' '39' '28,25' '28,67' '22,5'
 '36,08' '20' '43,25' '52,33' '19,33' '23,58' '69,17' '24,92' '29,83'
 '23,08' '31' '22,17' '45,17' '40,58

In [8]:
# Converting numeric columns
obj_to_numeric_cols = ["variable2", "variable3", "variable8"]

for col in obj_to_numeric_cols:
    # replace "," with "."
    train_ds[col] = train_ds[col].str.replace(",", ".")
    # convert to numeric data types
    train_ds[col] = pd.to_numeric(train_ds[col])

In [9]:
train_ds.dtypes

variable1      object
variable2     float64
variable3     float64
variable4      object
variable5      object
variable6      object
variable7      object
variable8     float64
variable9      object
variable10     object
variable11      int64
variable12     object
variable13     object
variable14    float64
variable15      int64
variable17    float64
variable18     object
variable19      int64
classLabel     object
dtype: object

In [10]:
# missing values per column
print("Missing Values Per Column:")
train_ds.isna().sum()

Missing Values Per Column:


variable1       39
variable2       39
variable3        0
variable4       64
variable5       64
variable6       66
variable7       66
variable8        0
variable9        0
variable10       0
variable11       0
variable12       0
variable13       0
variable14     100
variable15       0
variable17     100
variable18    2145
variable19       0
classLabel       0
dtype: int64

In [11]:
# Numeric and Non-numeric columns
numerical_vars = []
non_numerical_vars = []

features = dict(train_ds.dtypes)

for feature in features:
    if str(features[feature]) == "object":
        non_numerical_vars.append(feature)
    else:
        numerical_vars.append(feature)

print('Numerical Features:\n', numerical_vars)
print('Non-numerical features:\n', non_numerical_vars)

Numerical Features:
 ['variable2', 'variable3', 'variable8', 'variable11', 'variable14', 'variable15', 'variable17', 'variable19']
Non-numerical features:
 ['variable1', 'variable4', 'variable5', 'variable6', 'variable7', 'variable9', 'variable10', 'variable12', 'variable13', 'variable18', 'classLabel']


In [12]:
train_ds

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,a,17.92,0.000054,u,g,c,v,1.750,f,t,1,t,g,80.0,5,800000.0,t,0,no.
1,b,16.92,0.000034,y,p,k,v,0.290,f,f,0,f,s,200.0,0,2000000.0,,0,no.
2,b,31.25,0.000112,u,g,ff,ff,0.000,f,t,1,f,g,96.0,19,960000.0,t,0,no.
3,a,48.17,0.000133,u,g,i,o,0.335,f,f,0,f,g,0.0,120,0.0,,0,no.
4,b,32.33,0.000350,u,g,k,v,0.500,f,f,0,t,g,232.0,0,2320000.0,f,0,no.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3695,a,18.75,0.000750,u,g,q,v,2.710,t,t,5,f,g,,26726,,t,1,yes.
3696,a,23.50,0.000900,u,g,q,v,8.500,t,t,5,t,g,120.0,0,1200000.0,t,1,yes.
3697,b,34.17,0.000917,u,g,c,v,4.500,t,t,12,t,g,0.0,221,0.0,,1,yes.
3698,b,27.83,0.000154,u,g,W,v,3.750,t,t,5,t,g,100.0,3,1000000.0,,1,yes.


# Dealing with missing values
- variable18 has a very high number of missing values, 57% -> will be removed
- Impute the categorical features with mode
- Impute the numerical features with median

In [13]:
# drop variable18
drop_var = 'variable18'
# remove from the training set
train_ds.drop(drop_var, axis=1, inplace=True)
# remove for the list of vars
if drop_var in numerical_vars: numerical_vars.remove(drop_var)
if drop_var in non_numerical_vars: non_numerical_vars.remove(drop_var)

In [14]:
# impute categorica and numerical features
# saving values for validation
valid_mean_mode = {}
for col in train_ds.columns:
    # check if the columns has missing values
    if train_ds[col].isna().sum() == 0:
        print(f'{col}: {train_ds[col].dtype}, no missing values')
        continue
    
    # categorical features -> mode
    if col in non_numerical_vars:
        # get the mode
        col_mode = train_ds[col].mode()[0]
        print(f'{col}: Categorical, the mode is: {col_mode}')
        # fill nan with the mode
        train_ds[col].fillna(col_mode, inplace=True)     
#         # imputing the validation with the same mode
#         valid_ds[col].fillna(col_mode, inplace=True)
        valid_mean_mode[col] = col_mode
    elif col in numerical_vars:
        # get the median
        col_median = train_ds[col].median()
        print(f'{col}: Numerical, the median is: {col_median}')
        # fill nan with the median
        train_ds[col].fillna(col_median, inplace=True)
#         # imputing the validation with the same median
#         valid_ds[col].fillna(col_mode, inplace=True)
        valid_mean_mode[col] = col_median
    else:
        print('Column data type is unknown!')

variable1: Categorical, the mode is: b
variable2: Numerical, the median is: 28.67
variable3: float64, no missing values
variable4: Categorical, the mode is: u
variable5: Categorical, the mode is: g
variable6: Categorical, the mode is: c
variable7: Categorical, the mode is: v
variable8: float64, no missing values
variable9: object, no missing values
variable10: object, no missing values
variable11: int64, no missing values
variable12: object, no missing values
variable13: object, no missing values
variable14: Numerical, the median is: 120.0
variable15: int64, no missing values
variable17: Numerical, the median is: 1200000.0
variable19: int64, no missing values
classLabel: object, no missing values


In [15]:
# make sure that everything works properly 
train_ds.isna().sum()

variable1     0
variable2     0
variable3     0
variable4     0
variable5     0
variable6     0
variable7     0
variable8     0
variable9     0
variable10    0
variable11    0
variable12    0
variable13    0
variable14    0
variable15    0
variable17    0
variable19    0
classLabel    0
dtype: int64

# Encoding Categorical Features
- Categorical Features: Label Encoding -> One-Hot-Encoding
- Target Column -> Label Encoding

In [16]:
# Label encoding all categorical columns inluding the target column
labelencoder = LabelEncoder()
train_ds['classLabel'] = labelencoder.fit_transform(train_ds['classLabel'])

# one hot encoding the features except the target column
onehot_cols = non_numerical_vars.copy()
if 'classLabel' in onehot_cols: onehot_cols.remove('classLabel')

oneencoder = OneHotEncoder()
encoded_df = pd.DataFrame(oneencoder.fit_transform(train_ds[onehot_cols]).toarray())

# merge with the train_ds
train_ds = train_ds.join(encoded_df)

# drop the original columns
train_ds.drop(onehot_cols, axis=1, inplace=True)
train_ds

Unnamed: 0,variable2,variable3,variable8,variable11,variable14,variable15,variable17,variable19,classLabel,0,...,30,31,32,33,34,35,36,37,38,39
0,17.92,0.000054,1.750,1,80.0,5,800000.0,0,0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,16.92,0.000034,0.290,0,200.0,0,2000000.0,0,0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,31.25,0.000112,0.000,1,96.0,19,960000.0,0,0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3,48.17,0.000133,0.335,0,0.0,120,0.0,0,0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,32.33,0.000350,0.500,0,232.0,0,2320000.0,0,0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3695,18.75,0.000750,2.710,5,120.0,26726,1200000.0,1,1,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3696,23.50,0.000900,8.500,5,120.0,0,1200000.0,1,1,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3697,34.17,0.000917,4.500,12,0.0,221,0.0,1,1,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3698,27.83,0.000154,3.750,5,100.0,3,1000000.0,1,1,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [17]:
train_ds.columns

Index([ 'variable2',  'variable3',  'variable8', 'variable11', 'variable14',
       'variable15', 'variable17', 'variable19', 'classLabel',            0,
                  1,            2,            3,            4,            5,
                  6,            7,            8,            9,           10,
                 11,           12,           13,           14,           15,
                 16,           17,           18,           19,           20,
                 21,           22,           23,           24,           25,
                 26,           27,           28,           29,           30,
                 31,           32,           33,           34,           35,
                 36,           37,           38,           39],
      dtype='object')

# Training a model

In [18]:
y_train = train_ds.classLabel
X_train = train_ds.drop('classLabel', axis=1)

In [19]:
# support vector machines
from sklearn.svm import SVC

# Instantiate
svm_model = SVC(probability=True)
# Fit
svm_model = svm_model.fit(X_train, y_train)
# Accuracy
svm_model.score(X_train, y_train)

0.9256756756756757

# Preprocessing The Validation data 

In [20]:
valid_ds.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,b,3233,75.0,u,g,e,bb,1585,t,f,0,t,s,420.0,0,4200000.0,,1,no.
1,b,2358,179.0,u,g,c,v,54,f,f,0,t,g,136.0,1,1360000.0,,0,no.
2,b,3642,0.00075,y,p,d,v,585,f,f,0,f,g,240.0,3,2400000.0,,1,no.
3,b,1842,10415.0,y,p,aa,v,125,t,f,0,f,g,120.0,375,1200000.0,,0,no.
4,b,245,13335.0,y,p,aa,v,4,f,f,0,t,g,120.0,475,1200000.0,f,1,no.


In [21]:
valid_ds.dtypes

variable1      object
variable2      object
variable3      object
variable4      object
variable5      object
variable6      object
variable7      object
variable8      object
variable9      object
variable10     object
variable11      int64
variable12     object
variable13     object
variable14    float64
variable15      int64
variable17    float64
variable18     object
variable19      int64
classLabel     object
dtype: object

In [22]:
# Converting numeric columns
obj_to_numeric_cols = ["variable2", "variable3", "variable8"]

for col in obj_to_numeric_cols:
    # replace "," with "."
    valid_ds[col] = valid_ds[col].str.replace(",", ".")
    # convert to numeric data types
    valid_ds[col] = pd.to_numeric(valid_ds[col])

In [23]:
valid_mean_mode

{'variable1': 'b',
 'variable2': 28.67,
 'variable4': 'u',
 'variable5': 'g',
 'variable6': 'c',
 'variable7': 'v',
 'variable14': 120.0,
 'variable17': 1200000.0}

In [24]:
for col in valid_ds.columns:
    if col in valid_mean_mode:
        valid_ds[col].fillna(valid_mean_mode[col], inplace=True)

In [25]:
# drop variable18
drop_var = 'variable18'
# remove from the training set
valid_ds.drop(drop_var, axis=1, inplace=True)

In [26]:
valid_ds.isna().sum()

variable1     0
variable2     0
variable3     0
variable4     0
variable5     0
variable6     0
variable7     0
variable8     0
variable9     0
variable10    0
variable11    0
variable12    0
variable13    0
variable14    0
variable15    0
variable17    0
variable19    0
classLabel    0
dtype: int64

In [27]:
valid_ds['classLabel'] = labelencoder.transform(valid_ds['classLabel'])

encoded_df = pd.DataFrame(oneencoder.transform(valid_ds[onehot_cols]).toarray())

# merge with the train_ds
valid_ds = valid_ds.join(encoded_df)

# drop the original columns
valid_ds.drop(onehot_cols, axis=1, inplace=True)
valid_ds

Unnamed: 0,variable2,variable3,variable8,variable11,variable14,variable15,variable17,variable19,classLabel,0,...,30,31,32,33,34,35,36,37,38,39
0,32.33,0.000750,1.585,0,420.0,0,4200000.0,1,0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,23.58,0.000179,0.540,0,136.0,1,1360000.0,0,0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,36.42,0.000075,0.585,0,240.0,3,2400000.0,1,0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,18.42,0.001041,0.125,0,120.0,375,1200000.0,0,0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,24.50,0.001334,0.040,0,120.0,475,1200000.0,1,0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,23.08,0.001150,3.500,9,56.0,742,560000.0,0,1,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
196,57.08,0.001950,5.500,7,0.0,3000,0.0,1,1,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
197,30.83,0.000000,1.250,1,202.0,0,2020000.0,0,1,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
198,19.67,0.000037,2.000,2,80.0,0,800000.0,1,1,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [28]:
y_valid = valid_ds.classLabel
X_valid = valid_ds.drop('classLabel', axis=1)

In [30]:
# Predictions/probs on the test dataset
predicted = pd.DataFrame(svm_model.predict(X_valid))
probs = pd.DataFrame(svm_model.predict_proba(X_valid))

# Store metrics
svm_accuracy = metrics.accuracy_score(y_valid, predicted)     
svm_roc_auc = metrics.roc_auc_score(y_valid, probs[1])       
svm_confus_matrix = metrics.confusion_matrix(y_valid, predicted) 
svm_classification_report = metrics.classification_report(y_valid, predicted)
svm_precision = metrics.precision_score(y_valid, predicted, pos_label=1)
svm_recall = metrics.recall_score(y_valid, predicted, pos_label=1)
svm_f1 = metrics.f1_score(y_valid, predicted, pos_label=1)

  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
x_train = X_train
x_test = X_valid
y_test = y_valid

from sklearn.ensemble import RandomForestClassifier

# Instantiate
rf = RandomForestClassifier()	   
# Fit
rf_model = rf.fit(x_train, y_train)
# training accuracy 99.74%
rf_model.score(x_train, y_train)

# Predictions/probs on the test dataset
predicted = pd.DataFrame(rf_model.predict(x_test))
probs = pd.DataFrame(rf_model.predict_proba(x_test))

# Store metrics
rf_accuracy = metrics.accuracy_score(y_test, predicted)     
rf_roc_auc = metrics.roc_auc_score(y_test, probs[1])       
rf_confus_matrix = metrics.confusion_matrix(y_test, predicted) 
rf_classification_report = metrics.classification_report(y_test, predicted)
rf_precision = metrics.precision_score(y_test, predicted, pos_label=1)
rf_recall = metrics.recall_score(y_test, predicted, pos_label=1)
rf_f1 = metrics.f1_score(y_test, predicted, pos_label=1)

# Evaluate the model using 10-fold cross-validation
rf_cv_scores = cross_val_score(RandomForestClassifier(), x_test, y_test, scoring='precision', cv=10)
rf_cv_mean = np.mean(rf_cv_scores)

In [39]:
rf_accuracy

0.585