In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
bfdata = pd.read_csv("BlackFridaySales.csv")

bfdata

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Crime_rate,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Purchase2
0,1000001,P00069042,F,0-17,10,A,8,2,0,3,,,8370,0
1,1000001,P00248942,F,0-17,10,A,6,2,0,1,6.0,14.0,15200,1
2,1000001,P00087842,F,0-17,10,A,17,2,0,12,,,1422,0
3,1000001,P00085442,F,0-17,10,A,17,2,0,12,14.0,,1057,0
4,1000002,P00285442,M,55+,16,C,13,4,0,8,,,7969,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,1000175,P00085542,F,26-35,2,B,10,1,0,5,14.0,,6919,0
1040,1000175,P00307642,F,26-35,2,B,10,1,0,5,6.0,,1772,0
1041,1000175,P00052842,F,26-35,2,B,15,1,0,10,15.0,,23341,1
1042,1000175,P00057542,F,26-35,2,B,8,1,0,3,4.0,5.0,10604,1


In [19]:
bfdata.shape
bfdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     1044 non-null   int64  
 1   Product_ID                  1044 non-null   object 
 2   Gender                      1044 non-null   object 
 3   Age                         1044 non-null   object 
 4   Occupation                  1044 non-null   int64  
 5   City_Category               1044 non-null   object 
 6   Crime_rate                  1044 non-null   int64  
 7   Stay_In_Current_City_Years  1044 non-null   int64  
 8   Marital_Status              1044 non-null   int64  
 9   Product_Category_1          1044 non-null   int64  
 10  Product_Category_2          731 non-null    float64
 11  Product_Category_3          336 non-null    float64
 12  Purchase                    1044 non-null   int64  
 13  Purchase2                   1044 

In [20]:
bfdata.isnull().sum()

User_ID                         0
Product_ID                      0
Gender                          0
Age                             0
Occupation                      0
City_Category                   0
Crime_rate                      0
Stay_In_Current_City_Years      0
Marital_Status                  0
Product_Category_1              0
Product_Category_2            313
Product_Category_3            708
Purchase                        0
Purchase2                       0
dtype: int64

Cleaning:

One of the feature is 'Crime_rate' which tells us about the crime rate in that particular city. I see that there is no noticeable relation so discarding that column.

In [21]:
bfdata.drop(columns=['Crime_rate'],inplace=True)
bfdata.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase', 'Purchase2'],
      dtype='object')

Checking null values

In [22]:
bfdata.isnull().sum()

User_ID                         0
Product_ID                      0
Gender                          0
Age                             0
Occupation                      0
City_Category                   0
Stay_In_Current_City_Years      0
Marital_Status                  0
Product_Category_1              0
Product_Category_2            313
Product_Category_3            708
Purchase                        0
Purchase2                       0
dtype: int64

I see Product_Category_2  and Product_Category_3 have significanty high number of NULL values. I will discard these columns for now.

In [23]:
bfdata.drop(columns=['Product_Category_2','Product_Category_3'],inplace=True)
bfdata.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Purchase', 'Purchase2'],
      dtype='object')

In [24]:
#Checking if there are more NULL Values
bfdata.isnull().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Purchase                      0
Purchase2                     0
dtype: int64

Creating Training/Test sets

In [25]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(bfdata, test_size=0.2, random_state=123)
print(len(train_set), len(test_set))
print(train_set.head())
print(test_set.head())

835 209
      User_ID Product_ID Gender    Age  Occupation City_Category  \
554   1000102  P00274942      M  36-45          19             C   
485   1000090  P00127942      M    55+          13             C   
345   1000058  P00119142      M  26-35           2             B   
1004  1000173  P00057642      M  26-35           0             B   
730   1000139  P00249642      F  26-35          20             C   

      Stay_In_Current_City_Years  Marital_Status  Product_Category_1  \
554                            3               0                   8   
485                            1               0                   1   
345                            3               0                   3   
1004                           1               1                   1   
730                            2               0                   3   

      Purchase  Purchase2  
554       4102          0  
485      11605          1  
345      13557          1  
1004     15351          1  
730       

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

X = bfdata[['Stay_In_Current_City_Years','Occupation','Marital_Status']]
y = bfdata[['Purchase2']]


From the dataset I see one of the column `Stay_In_Current_City_Years`. I noticed that those who have lived in the city for one year likely to spend the most.This is comprehensible since individuals who have lived in the city for more than four years are often more established and less interested in buying new goods than those who are new to the city, who tend to spend more. Similarly occupation has an effect on purchases. When we combined Purchase and Marital_Status for analysis, we came to know that Single Men spend the most during the Black Friday. It also tells that Men tend to spend less once they are married. It maybe because of the added responsibilities.
So I have taken features Stay_In_Current_City_Years,Occupation,Marital_Status for X and Purchase2 as Y.  Purchase 2 is a derived feature that I have created to distinguish the purchases which are above 10000 and below 10000.

In [27]:
lreg = LinearRegression()
print(lreg.fit(X, y))
print("Linear Regression accuracy: ", (lreg.score(X,y))*100)

Random_model = RandomForestClassifier()
Random_model.fit(X,y)
print("Random Forest accuracy: ", (Random_model.score(X,y))*100)

decision_model = DecisionTreeClassifier()
decision_model.fit(X,y)
print("Decision Tree Classifier accuracy: ", (decision_model.score(X,y))*100)


svm_model = SVC()
svm_model.fit(X,y)
print("SVM accuracy: ", (svm_model.score(X,y))*100)


LinearRegression()
Linear Regression accuracy:  0.04803725894035882
Random Forest accuracy:  67.81609195402298


  
  return f(*args, **kwargs)


Decision Tree Classifier accuracy:  67.81609195402298
SVM accuracy:  61.877394636015325


Decision Tree

In [28]:
X = bfdata[['Stay_In_Current_City_Years','Occupation','Marital_Status']]
Y = bfdata[['Purchase2']]

dtree = DecisionTreeClassifier()
dtree.fit(X,Y)

DecisionTreeClassifier()

In [29]:
from sklearn.metrics import confusion_matrix

y_pred = dtree.predict(X)
matrix = confusion_matrix(Y, y_pred)
print(matrix)

[[577  69]
 [267 131]]


As we can see there are 577 predictions where the model is positive and the instance is positive. On the other hand the True Negative count is 131. The SVM accuracy is calculated using a SVM model(SVC) and the value for svm accuracy is 61.877394636015325

In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print ("Accuracy =", accuracy_score(Y, y_pred))
print ("Precision =", precision_score(Y, y_pred, average="weighted"))
print ("Sensitivity =", recall_score(Y, y_pred, average="weighted"))
print ("F1 =", f1_score(Y, y_pred, average="weighted"))

Accuracy = 0.6781609195402298
Precision = 0.6727274336765267
Sensitivity = 0.6781609195402298
F1 = 0.6462638066614602


Earlier with a single feature Linear Regression accuracy was calculated at 0.02197. After adding 3 features which are  'Stay_In_Current_City_Years','Occupation' and 'Marital_Status' Linear Regression accuracy increased to  0.04803. We have created a decision tree and also calculated svm accuracy using a svm model