## Create Ballanced Data Set

In [1]:
import time
import numpy as np
import pandas as pd
import random as rand
pd.options.display.max_rows = 25
pd.options.display.max_columns  = 25

from pandas.api.types import CategoricalDtype

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn import model_selection

use_smote = True
binaryClass = False

inputfile = 'CKME136X10_2018_Data_CTF.csv'
if binaryClass:
    outputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
    outputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
    outputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'
else:
    outputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
    outputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
    outputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'


  from numpy.core.umath_tests import inner1d


In [2]:
#load data
df = pd.read_csv(inputfile, engine = 'python')

In [3]:
df_columns = df.columns[:len(df.columns) -1]
print(df_columns)

Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_TYPE', 'V_YEAR', 'P_SEX',
       'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER'],
      dtype='object')


In [4]:
#check for nun numeric values
print("Number of Null values: {}".format(df.isnull().sum().sum()))
print()
print("Number of NaN: {0}".format(df.isna().sum().sum()))
print()
print("Number of Non Numeric: {}".format(df[df.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum()))
print()
print('Information on the imported data')
print(df.info())

Number of Null values: 0

Number of NaN: 0

Number of Non Numeric: 0

Information on the imported data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4336558 entries, 0 to 4336557
Data columns (total 19 columns):
C_YEAR    int64
C_MNTH    int64
C_WDAY    int64
C_HOUR    int64
C_VEHS    int64
C_CONF    int64
C_RCFG    int64
C_WTHR    int64
C_RSUR    int64
C_RALN    int64
C_TRAF    int64
V_TYPE    int64
V_YEAR    int64
P_SEX     int64
P_AGE     int64
P_PSN     int64
P_SAFE    int64
P_USER    int64
P_ISEV    int64
dtypes: int64(19)
memory usage: 628.6 MB
None


In [5]:
# convert to the correct type
df = df.astype('category')
df['C_YEAR'] = df['C_YEAR'].astype(CategoricalDtype(ordered=True))
df['C_MNTH'] = df['C_MNTH'].astype(CategoricalDtype(ordered=True))
df['C_WDAY'] = df['C_WDAY'].astype(CategoricalDtype(ordered=True))
df['C_HOUR'] = df['C_HOUR'].astype(CategoricalDtype(ordered=True))
df['C_VEHS'] = df['C_VEHS'].astype('int')
df['V_YEAR'] = df['V_YEAR'].astype(CategoricalDtype(ordered=True))
df['P_PSN'] = df['P_PSN'].astype(CategoricalDtype(ordered=True))
df['P_AGE'] = df['P_AGE'].astype('int')
df['P_ISEV'] = df['P_ISEV'].astype('int')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4336558 entries, 0 to 4336557
Data columns (total 19 columns):
C_YEAR    category
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    int32
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
V_TYPE    category
V_YEAR    category
P_SEX     category
P_AGE     int32
P_PSN     category
P_SAFE    category
P_USER    category
P_ISEV    int32
dtypes: category(16), int32(3)
memory usage: 115.8 MB
None


## Split Training and Test set 70/30 split

In [6]:
## Split Training and Test set 70/30 split, so we don't bleed information to test set
#Split between data and class
Y = df[df.columns[-1]].copy()
if binaryClass:
    Y.replace(to_replace = 1, value = 0, inplace = True)
    Y.replace(to_replace = 2, value = 1, inplace = True)
    Y.replace(to_replace = 3, value = 1, inplace = True)
else:
    Y.replace(to_replace = 1, value = 0, inplace = True)
    Y.replace(to_replace = 2, value = 1, inplace = True)
    Y.replace(to_replace = 3, value = 2, inplace = True)

X = df[df.columns[0:df.columns.size -1]].copy()

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3, stratify=Y)

In [7]:
print(Y_train.groupby(Y_train).size())
print(Y_test.groupby(Y_test).size())


P_ISEV
0    1268513
1    1745255
2      21822
Name: P_ISEV, dtype: int64
P_ISEV
0    543649
1    747967
2      9352
Name: P_ISEV, dtype: int64


In [8]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1300968 entries, 3578209 to 1478231
Data columns (total 18 columns):
C_YEAR    1300968 non-null category
C_MNTH    1300968 non-null category
C_WDAY    1300968 non-null category
C_HOUR    1300968 non-null category
C_VEHS    1300968 non-null int32
C_CONF    1300968 non-null category
C_RCFG    1300968 non-null category
C_WTHR    1300968 non-null category
C_RSUR    1300968 non-null category
C_RALN    1300968 non-null category
C_TRAF    1300968 non-null category
V_TYPE    1300968 non-null category
V_YEAR    1300968 non-null category
P_SEX     1300968 non-null category
P_AGE     1300968 non-null int32
P_PSN     1300968 non-null category
P_SAFE    1300968 non-null category
P_USER    1300968 non-null category
dtypes: category(16), int32(2)
memory usage: 39.7 MB


In [9]:
#Testing Set
df_test = X_test.copy()
df_test['P_ISEV'] = Y_test.copy()

### over sample using smote

In [10]:
if (use_smote):
# use somte to over sample the minority class
    sm = SMOTE(random_state=12, ratio = 'minority')
    x_res, y_res = sm.fit_sample(X_train, Y_train)
    print('Resampled dataset shape %s' % Counter(y_res))
    
    df_train_over = pd.DataFrame(data=x_res, columns = df_columns)
    df_train_over['P_ISEV'] = y_res

    # convert to the correct type

    #not sure why converting to category still leave fold values, lets convert to int first
    df_train_over = df_train_over.astype('int')
    df_train_over = df_train_over.astype('category')
    df_train_over['C_YEAR'] = df_train_over['C_YEAR'].astype(CategoricalDtype(ordered=True))
    df_train_over['C_MNTH'] = df_train_over['C_MNTH'].astype(CategoricalDtype(ordered=True))
    df_train_over['C_WDAY'] = df_train_over['C_WDAY'].astype(CategoricalDtype(ordered=True))
    df_train_over['C_HOUR'] = df_train_over['C_HOUR'].astype(CategoricalDtype(ordered=True))
    df_train_over['C_VEHS'] = df_train_over['C_VEHS'].astype('int')
    df_train_over['V_YEAR'] = df_train_over['V_YEAR'].astype(CategoricalDtype(ordered=True))
    df_train_over['P_PSN'] = df_train_over['P_PSN'].astype(CategoricalDtype(ordered=True))
    df_train_over['P_AGE'] = df_train_over['P_AGE'].astype('int')
    df_train_over['P_ISEV'] = df_train_over['P_ISEV'].astype('int')
    print(df_train_over.shape)
    df_train_over.head(3)

Resampled dataset shape Counter({1: 1745255, 2: 1745255, 0: 1268513})
(4759023, 19)


### under sample using smote

In [11]:
if (use_smote):
    # use somte to under sample the minority class
    sm = SMOTE(random_state=12, ratio = 'not minority')
    x_res, y_res = sm.fit_sample(X_train, Y_train)
    print('Resampled dataset shape %s' % Counter(y_res))
    
    df_train_under = pd.DataFrame(data=x_res, columns = df_columns)
    df_train_under['P_ISEV'] = y_res

    df_train_under = df_train_under.astype('int')
    df_train_under = df_train_under.astype('category')
    df_train_under['C_YEAR'] = df_train_under['C_YEAR'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_MNTH'] = df_train_under['C_MNTH'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_WDAY'] = df_train_under['C_WDAY'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_HOUR'] = df_train_under['C_HOUR'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_VEHS'] = df_train_under['C_VEHS'].astype('int')
    df_train_under['V_YEAR'] = df_train_under['V_YEAR'].astype(CategoricalDtype(ordered=True))
    df_train_under['P_PSN'] = df_train_under['P_PSN'].astype(CategoricalDtype(ordered=True))
    df_train_under['P_AGE'] = df_train_under['P_AGE'].astype('int')
    df_train_under['P_ISEV'] = df_train_under['P_ISEV'].astype('int')

    print(df_train_under.shape)
    df_train_under.head(3)

Resampled dataset shape Counter({0: 1745255, 1: 1745255, 2: 21822})
(3512332, 19)


### over sample using RamdomOverSampler

In [12]:
if (not use_smote):
    ros = RandomOverSampler(random_state=0, ratio = 'minority')
    X_resampled_ros, y_resampled_ros = ros.fit_sample(X_train, Y_train)
    print(sorted(Counter(y_resampled_ros).items()))
    
    df_train_under = pd.DataFrame(data=X_resampled_ros, columns = df_columns)
    df_train_under['P_ISEV'] = y_resampled_ros

    df_train_under = df_train_under.astype('int')
    df_train_under = df_train_under.astype('category')
    df_train_under['C_YEAR'] = df_train_under['C_YEAR'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_MNTH'] = df_train_under['C_MNTH'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_WDAY'] = df_train_under['C_WDAY'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_HOUR'] = df_train_under['C_HOUR'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_VEHS'] = df_train_under['C_VEHS'].astype('int')
    df_train_under['V_YEAR'] = df_train_under['V_YEAR'].astype(CategoricalDtype(ordered=True))
    df_train_under['P_PSN'] = df_train_under['P_PSN'].astype(CategoricalDtype(ordered=True))
    df_train_under['P_AGE'] = df_train_under['P_AGE'].astype('int')
    df_train_under['P_ISEV'] = df_train_under['P_ISEV'].astype('int')

    print(df_train_under.shape)
    df_train_under.head(3)

In [13]:
### under sample non minority classes

In [14]:
if (not use_smote):
    # under sample using Ramdom Under Sampler, much faster than smote
    rus = RandomUnderSampler(random_state=0, ratio = 'not minority')
    X_resampled_rus, y_resampled_rus = rus.fit_sample(X_train, Y_train)
    print(sorted(Counter(y_resampled_rus).items()))

    df_train_under = pd.DataFrame(data=X_resampled_rus, columns = df_columns)
    df_train_under['P_ISEV'] = y_resampled_rus

    df_train_under = df_train_under.astype('int')
    df_train_under = df_train_under.astype('category')
    df_train_under['C_YEAR'] = df_train_under['C_YEAR'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_MNTH'] = df_train_under['C_MNTH'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_WDAY'] = df_train_under['C_WDAY'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_HOUR'] = df_train_under['C_HOUR'].astype(CategoricalDtype(ordered=True))
    df_train_under['C_VEHS'] = df_train_under['C_VEHS'].astype('int')
    df_train_under['V_YEAR'] = df_train_under['V_YEAR'].astype(CategoricalDtype(ordered=True))
    df_train_under['P_PSN'] = df_train_under['P_PSN'].astype(CategoricalDtype(ordered=True))
    df_train_under['P_AGE'] = df_train_under['P_AGE'].astype('int')
    df_train_under['P_ISEV'] = df_train_under['P_ISEV'].astype('int')

    print(df_train_under.shape)
    df_train_under.head(3)

In [15]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1300968 entries, 3578209 to 1478231
Data columns (total 19 columns):
C_YEAR    1300968 non-null category
C_MNTH    1300968 non-null category
C_WDAY    1300968 non-null category
C_HOUR    1300968 non-null category
C_VEHS    1300968 non-null int32
C_CONF    1300968 non-null category
C_RCFG    1300968 non-null category
C_WTHR    1300968 non-null category
C_RSUR    1300968 non-null category
C_RALN    1300968 non-null category
C_TRAF    1300968 non-null category
V_TYPE    1300968 non-null category
V_YEAR    1300968 non-null category
P_SEX     1300968 non-null category
P_AGE     1300968 non-null int32
P_PSN     1300968 non-null category
P_SAFE    1300968 non-null category
P_USER    1300968 non-null category
P_ISEV    1300968 non-null int32
dtypes: category(16), int32(3)
memory usage: 44.7 MB


In [16]:
df_train_over.to_csv(outputfile_train_O, encoding='utf-8', index=False)
df_train_under.to_csv(outputfile_train_U, encoding='utf-8', index=False)
df_test.to_csv(outputfile_test, encoding='utf-8', index=False)