In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from sklearn.metrics import roc_auc_score, roc_curve,recall_score, classification_report
from sklearn.naive_bayes import GaussianNB

In [2]:
data=pd.read_csv("NewBlackFridayData.csv")

In [7]:
data.sample(5)

Unnamed: 0.1,Unnamed: 0,User_ID,Marital_Status,Product,F,A,B,0,1,2,...,18-25,26-35,36-45,46-50,51-55,P00025442,P00057642,P00110742,P00112142,P00265242
4509,4509,1000735,0,2,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23403,23403,1003650,0,1,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
447138,447138,1002894,0,2,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
92861,92861,1002247,1,2,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
431150,431150,1000355,0,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [4]:
data.drop("User_ID", axis=1, inplace=True)

In [5]:
Y=data.Product
X=data.drop("Product",axis=1)

In [6]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

# Decision Tree

In [6]:
tree1=DecisionTreeClassifier(max_depth=10, min_samples_leaf=25, random_state=42).fit(x_train, y_train)

In [10]:
print("accuracy:", tree1.score(x_train, y_train),
      tree1.score(x_test, y_test))

accuracy: 0.3961139138155969 0.39421109416272926


In [11]:
print("Train:",classification_report(y_test,tree1.predict(x_test)))


Train:               precision    recall  f1-score   support

           1       0.40      0.00      0.01     41725
           2       0.39      0.97      0.55     51416
           3       0.58      0.07      0.12     41254

   micro avg       0.39      0.39      0.39    134395
   macro avg       0.46      0.35      0.23    134395
weighted avg       0.45      0.39      0.25    134395



In [13]:
tree1.feature_importances_

array([0.0005829 , 0.00565072, 0.02724384, 0.02562004, 0.01838986,
       0.00320866, 0.00368114, 0.00125341, 0.00049705, 0.        ,
       0.00068404, 0.        , 0.02295349, 0.0035673 , 0.00164932,
       0.        , 0.00242521, 0.        , 0.01154146, 0.00413565,
       0.18075386, 0.16475771, 0.18373409, 0.17450324, 0.16316703])

In [14]:
feat_imp = pd.DataFrame(tree1.feature_importances_, index=X.columns,columns=["Feature_imp"])

In [15]:
feat_imp.sort_values(by="Feature_imp", ascending=False)

Unnamed: 0,Feature_imp
P00110742,0.183734
P00025442,0.180754
P00112142,0.174503
P00057642,0.164758
P00265242,0.163167
F,0.027244
A,0.02562
17,0.022953
B,0.01839
46-50,0.011541


# Random Forest

In [16]:
tree2=RandomForest(max_depth=5, min_samples_leaf=25, random_state=42).fit(x_train, y_train)

In [20]:
print("accuracy:", tree2.score(x_train, y_train),
      tree2.score(x_test, y_test))

accuracy: 0.3956873074690834 0.39380185274749807


In [21]:
print("Train:",classification_report(y_test,tree2.predict(x_test)))

Train:               precision    recall  f1-score   support

           1       0.00      0.00      0.00     41725
           2       0.39      1.00      0.56     51416
           3       1.00      0.04      0.07     41254

   micro avg       0.39      0.39      0.39    134395
   macro avg       0.46      0.35      0.21    134395
weighted avg       0.45      0.39      0.24    134395



  'precision', 'predicted', average, warn_for)


In [17]:
tree2.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.20850218, 0.19005039, 0.21193992, 0.201292  , 0.18821551])

In [18]:
feat_imp = pd.DataFrame(tree2.feature_importances_, index=X.columns,columns=["Feature_imp"])

In [19]:
feat_imp.sort_values(by="Feature_imp", ascending=False)

Unnamed: 0,Feature_imp
P00110742,0.21194
P00025442,0.208502
P00112142,0.201292
P00057642,0.19005
P00265242,0.188216
4,0.0
51-55,0.0
46-50,0.0
36-45,0.0
26-35,0.0


# Naive Bayes

In [7]:
clf = GaussianNB()
clf.fit(X, Y)
print(clf.predict(x_test))

GaussianNB(priors=None, var_smoothing=1e-09)

[2 2 2 ... 2 2 2]


In [8]:
clf.score(x_train, y_train)
clf.score(x_test, y_test)

0.38418629799941467

0.38257375646415415

In [11]:
clf.predict_proba(X)

array([[0.31108549, 0.38326835, 0.30564616],
       [0.31108548, 0.38326835, 0.30564617],
       [0.31108547, 0.38326836, 0.30564617],
       ...,
       [0.31217924, 0.3842775 , 0.30354327],
       [0.31217925, 0.3842775 , 0.30354325],
       [0.31217926, 0.3842775 , 0.30354324]])