<h4> Importing modules </h4>

In [256]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy.random as nr
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
from sklearn import linear_model
import sklearn.metrics as sklm
import math
import seaborn as sns

<h5> Importing Datasets </h5>

In [257]:
# Read the customer attributes and the BikeBuyer table from the working directory.
customers_data, buyers_data = (
    pd.read_csv(csv_name) for csv_name in ('Customers.csv', 'BikeBuyer.csv')
)

<h5> A. Working with the Customers Dataset </h5>

In [258]:
# Shape before cleaning. (A bare `customers_data.head()` used to sit here, but
# only the last expression of a cell is displayed, so it had no effect.)
print(customers_data.shape)

# Drop duplicates: keep the first record seen for each CustomerID.
# Rebinding instead of inplace=True leaves the previously bound frame untouched.
customers_data = customers_data.drop_duplicates(subset='CustomerID', keep='first')
print(customers_data.shape)

# Drop columns with missing values
features_drop = ['MiddleName', 'Suffix', 'AddressLine2', 'Title', 'PhoneNumber']
customers_data = customers_data.drop(columns=features_drop)
customers_data.info()

(16519, 23)
(16404, 23)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16404 entries, 0 to 16507
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CustomerID            16404 non-null  int64 
 1   FirstName             16404 non-null  object
 2   LastName              16404 non-null  object
 3   AddressLine1          16404 non-null  object
 4   City                  16404 non-null  object
 5   StateProvinceName     16404 non-null  object
 6   CountryRegionName     16404 non-null  object
 7   PostalCode            16404 non-null  object
 8   BirthDate             16404 non-null  object
 9   Education             16404 non-null  object
 10  Occupation            16404 non-null  object
 11  Gender                16404 non-null  object
 12  MaritalStatus         16404 non-null  object
 13  HomeOwnerFlag         16404 non-null  int64 
 14  NumberCarsOwned       16404 non-null  int64 
 15  NumberChildr

<h5> Merging with buyers_data and spending_data </h5>

In [259]:
# Keep one BikeBuyer row per CustomerID (the first occurrence wins).
buyers_data = buyers_data.drop_duplicates(subset='CustomerID', keep='first')
buyers_data.shape

(16404, 2)

In [304]:
# Full outer join of customer attributes and purchase labels on CustomerID.
full_data = customers_data.merge(buyers_data, how="outer", on="CustomerID")
full_data.shape

(16404, 19)

<h5> Encoding categorical features to dummy variables </h5>

In [261]:
# Show the distinct occupations, then map each one to an integer code.
occupation_labels = full_data['Occupation']
print(occupation_labels.unique())
enc = preprocessing.LabelEncoder()
Features = enc.fit_transform(occupation_labels)
print(Features)

['Professional' 'Management' 'Skilled Manual' 'Clerical' 'Manual']
[3 3 3 ... 0 0 0]


<h6> One-hot encoding of the dummy variables </h6>

In [262]:
# One-hot encode Occupation straight from the merged frame.
# The cell no longer derives `Features` from the previous value of `Features`,
# so it can be re-run safely, and it skips the integer-code detour that
# OneHotEncoder warns about. Categories are taken in sorted label order, which
# is the same column order the integer codes produced.
occupation_column = full_data[['Occupation']]
ohe = preprocessing.OneHotEncoder()
encoded = ohe.fit(occupation_column)
Features = encoded.transform(occupation_column).toarray()
Features[:10, :]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]])

<h6> Label encoding Categorical Features to dummy variables 
     and OneHotEncoding the dummy variables</h6>

In [263]:
def encode_string(cat_feature):
    """One-hot encode a single categorical feature.

    Parameters
    ----------
    cat_feature : array-like of shape (n_samples,)
        Category labels, e.g. a pandas Series of strings.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_categories)
        Dense indicator matrix with one column per distinct label, columns
        ordered by sorted label.
    """
    ## OneHotEncoder accepts the labels directly, so the former LabelEncoder
    ## pass (and the warning it triggered) is not needed. Both encoders order
    ## categories by sorted label, so the resulting matrix is unchanged.
    column = np.asarray(cat_feature).reshape(-1, 1)
    return preprocessing.OneHotEncoder().fit_transform(column).toarray()
    

# Encode the remaining categorical columns and append them to `Features`.
# The columns are read from full_data (not customers_data): the Occupation
# indicators, the numeric features and the labels all come from full_data, and
# an outer merge orders rows by key, so customers_data is not guaranteed to be
# in the same row order.
cat_columns = ['Education', 'Gender', 'MaritalStatus']
encoded_blocks = [encode_string(full_data[col]) for col in cat_columns]
Features = np.concatenate([Features] + encoded_blocks, axis=1)
print(Features.shape)
Features[:2, :]

(16404, 14)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.]])

<h5> Add To Numeric Features </h5>

In [264]:
# Display the column names of the merged frame.
full_data.columns

Index(['CustomerID', 'FirstName', 'LastName', 'AddressLine1', 'City',
       'StateProvinceName', 'CountryRegionName', 'PostalCode', 'BirthDate',
       'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
       'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
       'YearlyIncome', 'BikeBuyer'],
      dtype='object')

In [265]:
# Append the five numeric columns to the right of the one-hot indicators.
numeric_columns = ['HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome',
                   'TotalChildren', 'YearlyIncome']
numeric_block = full_data[numeric_columns].values
Features = np.hstack([Features, numeric_block])
print(Features[:2])
print(Features.shape)

[[0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 2.00000e+00
  1.37947e+05]
 [0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 3.00000e+00 3.00000e+00
  1.01141e+05]]
(16404, 19)


<h5> Split the data </h5>

In [266]:
## Randomly sample cases to create independent training and test data
## (60% train / 40% test); the global NumPy seed fixes the shuffle.
nr.seed(9988)
labels = np.array(full_data['BikeBuyer'])
indx = range(Features.shape[0])
print(indx)
indx = ms.train_test_split(indx, test_size = 0.4)
train_rows, test_rows = indx
x_train, x_test = Features[train_rows, :], Features[test_rows, :]
y_train, y_test = np.ravel(labels[train_rows]), np.ravel(labels[test_rows])

range(0, 16404)


<h5> Rescale Numeric Features </h5>

In [267]:
# Columns 14 onward are the numeric features appended after the 14 one-hot
# columns. The scaler is fit on the training rows only and then applied to both
# splits, overwriting the numeric columns in place.
numeric_part = slice(14, None)
scaler = preprocessing.StandardScaler()
x_train[:, numeric_part] = scaler.fit_transform(x_train[:, numeric_part])
x_test[:, numeric_part] = scaler.transform(x_test[:, numeric_part])
print(x_train.shape)
x_train[:5]

(9842, 19)


array([[ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.69410561,
         0.44859986, -0.65124974, -1.19500052, -1.16444299],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  0.69410561,
         0.44859986, -0.65124974, -1.19500052, -0.72394657],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.69410561,
        -0.42741598,  0.67895675, -0.00459616, -0.89407488],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.   

<h1> Classification </h1>

<h4> Construct a logistic regression model </h4>

In [268]:
# Fit a logistic regression with default settings; fit() returns the estimator,
# so the last line displays the fitted model.
logistic_mod = linear_model.LogisticRegression().fit(x_train, y_train)
logistic_mod



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [269]:
# Intercept on the first line, coefficient matrix below it.
print(logistic_mod.intercept_, logistic_mod.coef_, sep='\n')

[-0.40954788]
[[-0.04652456 -0.8176407   0.27237478  0.00617892  0.17606368  0.32689361
   0.03805423 -0.21337525 -0.07854742 -0.48257304 -0.49800216  0.08845428
  -0.96053475  0.55098688  0.02365078 -0.08963873  1.25148288 -0.18439566
   0.57942732]]


In [270]:
# Per-class probabilities for the held-out rows; preview the first 15.
probabilities = logistic_mod.predict_proba(x_test)
print(probabilities[:15])

[[0.94747839 0.05252161]
 [0.89763412 0.10236588]
 [0.72814961 0.27185039]
 [0.60356924 0.39643076]
 [0.20747746 0.79252254]
 [0.65922211 0.34077789]
 [0.60533297 0.39466703]
 [0.83915954 0.16084046]
 [0.74588683 0.25411317]
 [0.92147607 0.07852393]
 [0.36206642 0.63793358]
 [0.76583082 0.23416918]
 [0.32183245 0.67816755]
 [0.87556809 0.12443191]
 [0.94283256 0.05716744]]


In [271]:
def score_model(probs, threshold):
    """Convert class probabilities into hard 0/1 predictions.

    Parameters
    ----------
    probs : array-like of shape (n_samples, n_classes)
        Probability matrix; column 1 is compared against the threshold.
    threshold : float
        A row is scored 1 when its column-1 probability is strictly greater
        than this value, otherwise 0.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Integer array of 0/1 scores (integer dtype even for empty input).
    """
    # Vectorized comparison instead of a Python-level loop over the column.
    return (np.asarray(probs)[:, 1] > threshold).astype(int)
# Threshold the probabilities at 0.5 and compare the first 15 scores with the
# true labels.
scores = score_model(probabilities, 0.5)
print(scores[:15])
print(y_test[:15])

[0 0 0 0 1 0 0 0 0 0 1 0 1 0 0]
[0 1 0 0 0 0 0 0 1 0 0 0 0 1 0]


In [272]:
# Accuracy of the model's predictions on the held-out rows.
score = sklm.accuracy_score(y_test, logistic_mod.predict(x_test))
print(score)

0.784364523011277


In [273]:
# Hard class predictions for the held-out rows, displayed as the cell result.
prediction = logistic_mod.predict(x_test)
prediction

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

<h3> Importing Testing data </h3>

In [274]:
# Load the file whose rows are scored for the submission below.
test = pd.read_csv('AW_test.csv')

In [275]:
# Integer-code the Occupation column of the test file.
# NOTE(review): this encoder is fit on the test file itself, so the codes only
# match the training codes when both files contain the same set of occupations.
occupation_test = test['Occupation']
print(occupation_test.unique())
enc = preprocessing.LabelEncoder()
Testing = enc.fit_transform(occupation_test)

['Management' 'Skilled Manual' 'Manual' 'Clerical' 'Professional']


In [276]:
# One-hot encode the test Occupation column using the category list of the
# training data, so column i means the same occupation in both matrices even
# if the test file lacks some occupation. With the encoder's default
# handle_unknown='error', an occupation absent from the training data raises
# instead of silently shifting columns.
occupation_levels = sorted(full_data['Occupation'].unique())
ohe = preprocessing.OneHotEncoder(categories=[occupation_levels])
encoded = ohe.fit(test[['Occupation']])
Testing = encoded.transform(test[['Occupation']]).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [277]:
def categorical_encoding (cat_feature):
    """One-hot encode a single categorical feature.

    Produces the same result as `encode_string` defined earlier in this
    notebook.

    Parameters
    ----------
    cat_feature : array-like of shape (n_samples,)
        Category labels, e.g. a pandas Series of strings.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_categories)
        Dense indicator matrix with one column per distinct label found in
        `cat_feature`, columns ordered by sorted label.
    """
    # OneHotEncoder accepts the labels directly; the former LabelEncoder pass
    # only triggered a warning. Both order categories by sorted label, so the
    # output matrix is unchanged.
    column = np.asarray(cat_feature).reshape(-1, 1)
    return preprocessing.OneHotEncoder().fit_transform(column).toarray()

# One-hot encode the remaining categorical columns of the test file.
# Each encoder is pinned to the categories seen in the training data
# (full_data) rather than re-derived from the test file, so the column layout
# matches the matrix the model was trained on. A category that never occurred
# in training raises (handle_unknown='error' is the encoder default).
categorical_cols = ['Education', 'Gender', 'MaritalStatus']
for col in categorical_cols:
    train_levels = sorted(full_data[col].unique())
    col_encoder = preprocessing.OneHotEncoder(categories=[train_levels])
    temp = col_encoder.fit_transform(test[[col]]).toarray()
    Testing = np.concatenate([Testing, temp], axis=1)
print(Testing.shape)

(500, 14)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [278]:
# Append the same five numeric columns used for training, in the same order.
test_numeric_columns = ['HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome',
                        'TotalChildren', 'YearlyIncome']
Testing = np.hstack([Testing, test[test_numeric_columns].values])
print(Testing.shape)


(500, 19)


<h4> Scaling the Numeric Data in the testing sample </h4>

In [279]:
# Apply the scaler fitted on the training rows to the numeric columns
# (index 14 onward). The columns are overwritten in place, so running this
# cell a second time would scale already-scaled values.
test_numeric_part = slice(14, None)
Testing[:, test_numeric_part] = scaler.transform(Testing[:, test_numeric_part])
Testing.shape

(500, 19)

In [280]:
# Predict a class for every row of the encoded and scaled test matrix, then
# show how many predictions were produced.
prediction2 = logistic_mod.predict(Testing)
prediction2.shape

(500,)

In [281]:
# Pair each test CustomerID with its predicted BikeBuyer value and write the
# two columns to disk without the index.
submission = test[['CustomerID']].assign(BikeBuyer=prediction2)
submission.to_csv('submission.csv', index=False)

In [282]:
# Read the written file back and preview it to confirm what was saved.
submission = pd.read_csv('submission.csv')
submission.head()

Unnamed: 0,CustomerID,BikeBuyer
0,18988,0
1,29135,1
2,12156,0
3,13749,0
4,27780,0


<h2> Modelling </h2>

In [283]:
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [284]:
# Wrap the full feature matrix in a DataFrame to inspect column count, dtypes
# and null counts.
Features2 = pd.DataFrame(Features)
Features2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16404 entries, 0 to 16403
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       16404 non-null  float64
 1   1       16404 non-null  float64
 2   2       16404 non-null  float64
 3   3       16404 non-null  float64
 4   4       16404 non-null  float64
 5   5       16404 non-null  float64
 6   6       16404 non-null  float64
 7   7       16404 non-null  float64
 8   8       16404 non-null  float64
 9   9       16404 non-null  float64
 10  10      16404 non-null  float64
 11  11      16404 non-null  float64
 12  12      16404 non-null  float64
 13  13      16404 non-null  float64
 14  14      16404 non-null  float64
 15  15      16404 non-null  float64
 16  16      16404 non-null  float64
 17  17      16404 non-null  float64
 18  18      16404 non-null  float64
dtypes: float64(19)
memory usage: 2.4 MB


<h3> 1. Cross Validation </h3>

In [285]:
from sklearn.model_selection import KFold, cross_val_score
# 10 shuffled folds with a fixed random_state, shared by every classifier below.
k_fold=KFold(n_splits=10, shuffle=True, random_state=0)

<h4> 1.1 KNN </h4>

In [286]:
# Cross-validated accuracy of a 31-neighbour KNN on the training split.
clf = KNeighborsClassifier(n_neighbors=31)
scoring = 'accuracy'
score = cross_val_score(estimator=clf, X=x_train, y=y_train,
                        scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

[0.80304569 0.8071066  0.77439024 0.7804878  0.78861789 0.7703252
 0.78353659 0.77947154 0.80691057 0.78861789]


In [287]:
# Mean fold accuracy as a percentage, rounded to two decimals.
round(score.mean() * 100, 2)

78.83

<h4> 1.2  DecisionTree Classifier </h4>

In [288]:
# Cross-validated accuracy of a default decision tree on the training split.
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(estimator=clf, X=x_train, y=y_train,
                        scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

[0.72081218 0.71370558 0.69715447 0.70223577 0.69512195 0.71036585
 0.71544715 0.70223577 0.72764228 0.68394309]


In [289]:
# Mean fold accuracy as a percentage, rounded to two decimals.
round(score.mean() * 100, 2)

70.69

<h4> 1.3 Random Forest </h4>

In [290]:
# Cross-validated accuracy of a 13-tree random forest on the training split.
clf = RandomForestClassifier(n_estimators=13)
scoring = 'accuracy'
score = cross_val_score(estimator=clf, X=x_train, y=y_train,
                        scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

[0.75329949 0.73604061 0.72154472 0.73272358 0.7296748  0.74085366
 0.74796748 0.7449187  0.75609756 0.73069106]


In [291]:
# Mean fold accuracy as a percentage, rounded to two decimals.
round(score.mean() * 100, 2)

73.94

<h4> 1.4 Naive Bayes </h4>

In [292]:
# Cross-validated accuracy of Gaussian naive Bayes on the training split.
clf = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(estimator=clf, X=x_train, y=y_train,
                        scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

[0.75634518 0.75228426 0.7347561  0.70121951 0.73272358 0.74186992
 0.73882114 0.72764228 0.7550813  0.73780488]


In [293]:
# Mean fold accuracy as a percentage, rounded to two decimals.
round(score.mean() * 100, 2)

73.79

<h4> 1.5 SVM </h4>

In [294]:
# Cross-validated accuracy of a default support vector classifier on the
# training split.
clf = SVC()
scoring = 'accuracy'
score = cross_val_score(estimator=clf, X=x_train, y=y_train,
                        scoring=scoring, cv=k_fold, n_jobs=1)
print(score)



[0.79695431 0.79796954 0.77845528 0.76930894 0.78455285 0.77947154
 0.79065041 0.77235772 0.79471545 0.79065041]


In [295]:
# Mean fold accuracy as a percentage, rounded to two decimals.
round(score.mean() * 100, 2)

78.55