In [1]:
import numpy as np   
import pandas as pd  
import os
import gc
import seaborn as sns  # for plotting graphs
import matplotlib.pyplot as plt # for plotting graphs aswell
import glob
from datetime import datetime
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix, hstack
%matplotlib inline



In [2]:
# Lift pandas' display truncation so every row and every column is shown
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)


In [3]:
# function to downcast numerical columns to int16 or float16, to save on memory use
def dtype_conver(Dataframe):
    """Downcast the numeric columns of ``Dataframe`` in place to 16-bit dtypes.

    Floating-point columns become ``float16`` and integer columns become
    ``int16``. A column is only converted when all of its values fit in the
    target type, so large values are never wrapped around (ints) or turned
    into ``inf`` (floats). Non-numeric columns (strings, booleans, datetimes,
    pandas extension dtypes) are left untouched.

    Parameters
    ----------
    Dataframe : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    int16_limits = np.iinfo(np.int16)
    float16_max = float(np.finfo(np.float16).max)

    for col in Dataframe:
        series = Dataframe[col]
        dtype = series.dtype

        # Only plain NumPy dtypes are handled; extension dtypes are skipped.
        if not isinstance(dtype, np.dtype):
            continue

        if np.issubdtype(dtype, np.floating):
            # NaN / inf are representable in float16, so only finite values
            # need to be range-checked.
            finite = series[np.isfinite(series)]
            if finite.empty or finite.abs().max() <= float16_max:
                Dataframe[col] = series.astype(np.float16)
        elif np.issubdtype(dtype, np.integer):
            if series.empty or (series.min() >= int16_limits.min
                                and series.max() <= int16_limits.max):
                Dataframe[col] = series.astype(np.int16)

In [4]:
# Build the path to the training CSV.
# The folder defaults to the original local directory, but it can be overridden with the
# DATA_PATH environment variable so the notebook also runs on other machines.
DATA_PATH = os.environ.get(
    'DATA_PATH',
    r'C:/Users/t891199/Desktop/Big_Data_Diploma/CEBD_1260_Machine_learning/Data Files/Class_3/',
)
file_name = os.path.join(DATA_PATH, 'train.csv')


In [5]:
# Load the training CSV into a DataFrame and report its (rows, columns) shape.
# Note: the Original_Quote_Date column is time-series data.
old_train_df = pd.read_csv(filepath_or_buffer=file_name)
print(old_train_df.shape)

(260753, 299)


In [6]:
# Feature engineering: parse the quote date, then expose its calendar parts as columns
quote_dates = pd.to_datetime(old_train_df['Original_Quote_Date'])
old_train_df['Original_Quote_Date'] = quote_dates
for date_part in ('year', 'month', 'day'):
    old_train_df[date_part] = getattr(quote_dates.dt, date_part)

In [7]:
# The raw date column is no longer needed once year/month/day have been extracted
train_df = old_train_df.drop(columns=["Original_Quote_Date"])
# Count the NaN / null values in each column and tabulate them as (col, nan_cnt)
null_counts = train_df.isnull().sum()
nan_info = pd.DataFrame({'col': null_counts.index, 'nan_cnt': null_counts.values})


In [8]:
# Order the columns from most to fewest missing values and show the top 10
nan_info = nan_info.sort_values('nan_cnt', ascending=False)
nan_info.head(10)

Unnamed: 0,col,nan_cnt
160,PropertyField29,200685
124,PersonalField84,124208
169,PropertyField38,1220
167,PropertyField36,113
48,PersonalField7,113
129,PropertyField3,81
163,PropertyField32,70
165,PropertyField34,70
130,PropertyField4,63
198,GeographicField14A,0


In [9]:
# Columns that contain NaN / null values, taken from the table above.
# Numerical columns with missing values:
num_cols_with_missing = [
    'PersonalField84',
    'PropertyField29',
]



In [10]:
# Boolean-like ('Y' / 'N') columns with missing values
bool_cols_with_missing = [
    'PropertyField3',
    'PropertyField4',
    'PersonalField7',
    'PropertyField32',
    'PropertyField34',
    'PropertyField36',
    'PropertyField38',
]


In [11]:
# Fill null and NaN values with 'U' (unknown) in the boolean-type ('Y', 'N') columns.
# The filled column is assigned back to train_df: calling fillna(inplace=True) on a
# column selection can act on a temporary copy and leave train_df unchanged.
for cols in bool_cols_with_missing:
    train_df[cols] = train_df[cols].fillna('U')



In [12]:
# Fill null and NaN values with the sentinel -1 in the numerical columns.
# The filled column is assigned back to train_df: calling fillna(inplace=True) on a
# column selection can act on a temporary copy and leave train_df unchanged.
for cols in num_cols_with_missing:
    train_df[cols] = train_df[cols].fillna(-1)


In [13]:
# Target vector: the quote-conversion flag, extracted as an array
y = old_train_df.loc[:, "QuoteConversion_Flag"].values

In [14]:
# Feature frame: everything in train_df except the target column
X = train_df.drop(columns=["QuoteConversion_Flag"])


In [15]:
# Use QuoteNumber as the row index rather than as a feature.
# The guard keeps the cell idempotent: once QuoteNumber has become the index it is no
# longer a column, so re-running an unguarded set_index would raise a KeyError.
if "QuoteNumber" in X.columns:
    X = X.set_index("QuoteNumber")



In [16]:
# Split the feature columns into two complementary groups:
#   X_for_ohe - categorical columns to one-hot encode: any object-dtype column, or any
#               column with fewer than 40 distinct values
#   X_not_ohe - every remaining column (numerical with 40 or more distinct values)
# X_not_ohe is the exact complement of X_for_ohe, so a column with exactly 40 distinct
# values cannot fall out of both lists.
unique_counts = X.nunique()  # computed once instead of twice per column
X_for_ohe = [cols for cols in X.columns if unique_counts[cols] < 40 or X[cols].dtype in ['object']]
ohe_columns = set(X_for_ohe)
X_not_ohe = [cols for cols in X.columns if cols not in ohe_columns]


In [17]:
# Preview the numerical columns that are left out of the one-hot encoding
X.loc[:, X_not_ohe].head()


Unnamed: 0_level_0,SalesField8
QuoteNumber,Unnamed: 1_level_1
1,48649
2,26778
4,8751
6,43854
8,12505


In [18]:
# to keep track of our columns: how many were selected for one-hot encoding (length of X_for_ohe)?
len(X_for_ohe)

298

In [19]:
# Re-count missing values, this time only over the columns selected for one-hot encoding
ohe_null_counts = X[X_for_ohe].isnull().sum()
nan_info = pd.DataFrame({'col': ohe_null_counts.index, 'nan_cnt': ohe_null_counts.values})

In [20]:
# Order the columns from most to fewest missing values and show the top 10
nan_info = nan_info.sort_values('nan_cnt', ascending=False)
nan_info.head(10)

Unnamed: 0,col,nan_cnt
0,Field6,0
204,GeographicField18B,0
202,GeographicField17B,0
201,GeographicField17A,0
200,GeographicField16B,0
199,GeographicField16A,0
198,GeographicField15B,0
197,GeographicField15A,0
196,GeographicField14B,0
195,GeographicField14A,0


In [None]:
# These are columns that need to be picked through, they seem to have all kinds of strange data in them! 
#X_try = X[X_for_ohe].drop(['PropertyField3','PropertyField4','PropertyField32','PropertyField34','PropertyField36',
#                           'PropertyField38','PersonalField7','PersonalField4A',
#                           'PersonalField4B'], axis = 1)


In [21]:
# import OneHotEncoder and create the encoder.
# Sparse output is OneHotEncoder's default, so it is not passed explicitly: the `sparse`
# keyword was renamed to `sparse_output` in newer scikit-learn releases and later removed,
# while relying on the default works on every version.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categories = 'auto')


In [22]:
# Fit the encoder on the categorical feature columns and encode them in a single step
X_ohe = ohe.fit_transform(
    X.loc[:, X_for_ohe]
)


In [23]:
# We are pretty much done here for now. The encoder produces sparse output, so X_ohe is a
# SciPy sparse matrix; displaying it shows only a summary (shape, dtype, number of stored
# elements) rather than the full array.
X_ohe


<260753x5058 sparse matrix of type '<class 'numpy.float64'>'
	with 77704394 stored elements in Compressed Sparse Row format>

In [24]:
# (rows, encoded feature columns) of the one-hot matrix
X_ohe.shape

(260753, 5058)

In [25]:
# shape of the SalesField8 column, for comparison with the row count of X_ohe
X['SalesField8'].shape

(260753,)

In [28]:
# Hold out 20% of the rows as the test set before anything else.
# NOTE(review): X_ohe is built from X[X_for_ohe] only, so the columns in X_not_ohe are
# not part of these splits - confirm that this is intended.
X_rem, X_test, y_rem, y_test = train_test_split(
    X_ohe, y, test_size=0.2, random_state=1
)

# Split the remainder into training and validation data: 25% of the remaining 80%
# goes to validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_rem, y_rem, test_size=0.25, random_state=1
)

In [29]:
# Random forest of 100 shallow trees (maximum depth 2) with a fixed seed
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)

In [30]:
# Train the forest on the training split; the fitted estimator is the cell's output
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [31]:
# Print the fitted forest's per-feature importance scores
importances = clf.feature_importances_
print(importances)

[0.        0.0019476 0.        ... 0.        0.        0.       ]


In [32]:
# Predict labels for the validation split
y_pred = clf.predict(X=X_val)