## Create Balanced Data Set

In [1]:
import time
import numpy as np
import pandas as pd
import random as rand
# Cap how many rows and columns pandas renders when a frame is displayed.
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 25)

from pandas.api.types import CategoricalDtype

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn import model_selection

# Run configuration.
# use_smote: True runs the SMOTE over-sampling cell, False runs the
#            RandomOverSampler cell instead.
# binaryClass: True recodes the target into two classes, False into three.
use_smote = True
binaryClass = True

inputfile = 'CKME136X10_2018_Data_Cleaned_Transformed.csv'

# Output file names (over-sampled train, under-sampled train, test) for the
# binary (True) and multi-class (False) variants.
_OUTPUT_FILES = {
    True: (
        'CKME136X10_2018_Data_CTFB_B_O_Train.csv',
        'CKME136X10_2018_Data_CTFB_B_U_Train.csv',
        'CKME136X10_2018_Data_CTFB_B_Test.csv',
    ),
    False: (
        'CKME136X10_2018_Data_CTFB_M_O_Train.csv',
        'CKME136X10_2018_Data_CTFB_M_U_Train.csv',
        'CKME136X10_2018_Data_CTFB_M_Test.csv',
    ),
}
outputfile_train_O, outputfile_train_U, outputfile_test = _OUTPUT_FILES[bool(binaryClass)]


  from numpy.core.umath_tests import inner1d


In [2]:
def convert_type(data):
    """Return a copy of ``data`` cast to the dtypes used throughout this notebook.

    Every column is first cast to ``category``. The columns C_MNTH, C_WDAY,
    C_HOUR, C_VEHS, P_AGE and P_PSN are then marked as ordered categoricals,
    and the P_ISEV column is cast to ``int``. The input frame is not modified.
    """
    ordered_columns = ('C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'P_AGE', 'P_PSN')

    converted = data.astype('category')
    for column in ordered_columns:
        # No explicit categories are given, so the existing ones are kept and
        # only the "ordered" flag is switched on.
        converted[column] = converted[column].astype(CategoricalDtype(ordered=True))
    converted['P_ISEV'] = converted['P_ISEV'].astype('int')
    return converted

In [3]:
# Load the input CSV (path held in `inputfile`) into `df`, using pandas'
# pure-Python parser engine.
df = pd.read_csv(inputfile, engine = 'python')

In [4]:
# Feature column labels: every column of `df` except the last one.
df_columns = df.columns[:-1]
print(df_columns)

Index(['C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG', 'C_WTHR',
       'C_RSUR', 'C_RALN', 'C_TRAF', 'P_SEX', 'P_AGE', 'P_PSN', 'P_USER'],
      dtype='object')


In [5]:
# Sanity checks on the loaded frame: missing values, and cells whose string
# form contains any non-digit character.
null_total = df.isnull().sum().sum()
print("Number of Null values: {}".format(null_total))
print()
nan_total = df.isna().sum().sum()
print("Number of NaN: {0}".format(nan_total))
print()
non_numeric_total = df[df.columns].apply(lambda col: col.astype(str).str.contains('[^0-9]')).sum().sum()
print("Number of Non Numeric: {}".format(non_numeric_total))
print()

Number of Null values: 0

Number of NaN: 0

Number of Non Numeric: 0



In [6]:
# Cast the feature columns to categoricals and the P_ISEV column to int.
df = convert_type(df)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4816153 entries, 0 to 4816152
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(14), int32(1)
memory usage: 82.7 MB
None


## Split into Training and Test Sets (70/30)

In [7]:
## Split Training and Test set 70/30 split, so we don't bleed information to test set
# The split happens before any resampling, so the test set is never resampled.

# Target: the last column of `df`, recoded to 0-based class labels.
#   binary:      1 -> 0, 2 -> 1, 3 -> 1
#   multi-class: 1 -> 0, 2 -> 1, 3 -> 2
# A single mapping applies all substitutions at once, which gives the same
# result as the previous chain of in-place replaces.
if binaryClass:
    class_recode = {1: 0, 2: 1, 3: 1}
else:
    class_recode = {1: 0, 2: 1, 3: 2}
Y = df[df.columns[-1]].replace(class_recode)

# Features: every column except the last.
X = df[df.columns[:-1]].copy()

# A fixed random_state makes the split, and the resampled files derived from
# it, reproducible across kernel restarts. stratify=Y keeps the class
# proportions equal in both partitions.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=12)

In [8]:
# Class counts in the training and test targets, in that order.
for labels in (Y_train, Y_test):
    print(labels.groupby(labels).size())


P_ISEV
0    1432039
1    1939268
Name: P_ISEV, dtype: int64
P_ISEV
0    613731
1    831115
Name: P_ISEV, dtype: int64


In [9]:
# Same sanity checks on the training features: missing values, and cells
# whose string form contains any non-digit character.
null_total = X_train.isnull().sum().sum()
print("Number of Null values: {}".format(null_total))
print()
nan_total = X_train.isna().sum().sum()
print("Number of NaN: {0}".format(nan_total))
print()
non_numeric_total = X_train[X_train.columns].apply(lambda col: col.astype(str).str.contains('[^0-9]')).sum().sum()
print("Number of Non Numeric: {}".format(non_numeric_total))
print()

Number of Null values: 0

Number of NaN: 0

Number of Non Numeric: 0



In [10]:
# Same sanity checks on the test features: missing values, and cells whose
# string form contains any non-digit character.
null_total = X_test.isnull().sum().sum()
print("Number of Null values: {}".format(null_total))
print()
nan_total = X_test.isna().sum().sum()
print("Number of NaN: {0}".format(nan_total))
print()
non_numeric_total = X_test[X_test.columns].apply(lambda col: col.astype(str).str.contains('[^0-9]')).sum().sum()
print("Number of Non Numeric: {}".format(non_numeric_total))
print()

Number of Null values: 0

Number of NaN: 0

Number of Non Numeric: 0



In [11]:
# Column dtypes and memory usage of the training and test feature frames.
for features in (X_train, X_test):
    features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3371307 entries, 2300036 to 3726133
Data columns (total 14 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
dtypes: category(14)
memory usage: 70.7 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1444846 entries, 594060 to 2720405
Data columns (total 14 columns):
C_MNTH    1444846 non-null category
C_WDAY    1444846 non-null category
C_HOUR    1444846 non-null category
C_VEHS    1444846 non-null category
C_CONF    1444846 non-null category
C_RCFG    1444846 non-null category
C_WTHR    1444846 non-null category
C_RSUR    1444846 non-null category
C_RALN    1444846 non-null category
C_TRAF    1444846 non-null category
P_SEX     1444846 non-null category
P_AGE     1444846 non-null category
P_PSN     1444846 non-

In [12]:
# Testing set: the test features with the recoded target attached as the
# P_ISEV column. assign() works on a copy, so X_test itself is left unchanged.
df_test = X_test.assign(P_ISEV=Y_test)

### Over-sample using SMOTE

In [13]:
if (use_smote):
    # Use SMOTE to over-sample the minority class of the training split only.
    sm = SMOTE(random_state=12, ratio = 'minority', n_jobs = -1)
    x_res, y_res = sm.fit_sample(X_train, Y_train)
    print('Resampled dataset shape %s' % Counter(y_res))

    # Rebuild a labelled frame from the resampled features and target.
    df_train_over = pd.DataFrame(data=x_res, columns = df_columns)
    df_train_over['P_ISEV'] = y_res

    # The resampled matrix can hold non-integer values, so every column is
    # cast to int (which truncates) before the categorical dtypes are restored.
    # NOTE(review): presumably the fractions come from SMOTE interpolating
    # between neighbouring rows; if so, truncating them yields category codes
    # that may not match any real observation -- confirm this is acceptable.
    df_train_over = df_train_over.astype('int')
    df_train_over = convert_type(df_train_over)
    print(df_train_over.shape)
    # A bare expression inside an `if` body is not auto-displayed by the
    # notebook, so the preview is printed explicitly.
    print(df_train_over.head(3))

Resampled dataset shape Counter({0: 1939268, 1: 1939268})
(3878536, 15)


### Over-sample using RandomOverSampler

In [14]:
if (not use_smote):
    # Alternative to SMOTE: randomly over-sample the minority class of the
    # training split.
    ros = RandomOverSampler(random_state=0, ratio = 'minority')
    X_resampled_ros, y_resampled_ros = ros.fit_sample(X_train, Y_train)
    print(sorted(Counter(y_resampled_ros).items()))

    # Rebuild a labelled frame from the resampled features and target.
    df_train_over = pd.DataFrame(data=X_resampled_ros, columns = df_columns)
    df_train_over['P_ISEV'] = y_resampled_ros

    # Force integer values, then restore the categorical dtypes.
    df_train_over = df_train_over.astype('int')
    df_train_over = convert_type(df_train_over)

    print(df_train_over.shape)
    # A bare expression inside an `if` body is not auto-displayed by the
    # notebook, so the preview is printed explicitly.
    print(df_train_over.head(3))

### Under-sample the non-minority classes

In [15]:
# Use RandomUnderSampler even if use_smote = True

In [16]:
# Under-sample with RandomUnderSampler, which is much faster than SMOTE.
rus = RandomUnderSampler(random_state=0, ratio = 'not minority')
X_resampled_rus, y_resampled_rus = rus.fit_sample(X_train, Y_train)
print(sorted(Counter(y_resampled_rus).items()))

# Rebuild a labelled frame, force integer values, then restore the
# categorical dtypes.
df_train_under = convert_type(
    pd.DataFrame(data=X_resampled_rus, columns = df_columns)
    .assign(P_ISEV=y_resampled_rus)
    .astype('int')
)

print(df_train_under.shape)
df_train_under.head(3)

[(0, 1432039), (1, 1432039)]
(2864078, 15)


Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_VEHS,C_CONF,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,P_SEX,P_AGE,P_PSN,P_USER,P_ISEV
0,9,3,3,3,21,2,1,1,1,1,2,4,1,2,0
1,10,5,3,2,33,2,2,1,1,1,1,4,1,1,0
2,9,7,1,2,33,3,1,1,3,7,1,2,1,1,0


In [17]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" after each one.
df_train_under.info()
df_train_over.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864078 entries, 0 to 2864077
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(14), int32(1)
memory usage: 49.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3878536 entries, 0 to 3878535
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(14), int32(1)
memory usage: 66.6 MB
None


In [18]:
# Write the over-sampled training set, the under-sampled training set and the
# test set to their configured CSV files, without the index column.
for frame, path in (
    (df_train_over, outputfile_train_O),
    (df_train_under, outputfile_train_U),
    (df_test, outputfile_test),
):
    frame.to_csv(path, encoding='utf-8', index=False)