## <h2 style='text-align: center;'>Black Friday Sales: Analysis and Prediction</h2>

In [1]:
#importing library
import pandas as pd  #data processing
import numpy as np  #linear algebra
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the training split (it includes the Purchase target column).
# The path is relative to the notebook's working directory.
df_train = pd.read_csv('blackfriday_train.csv')
# Preview the first rows to sanity-check the parse.
df_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [3]:
df_train.shape

(550068, 12)

In [5]:
# Load the test split; it has the same feature columns as the training
# split but no Purchase column, as the preview below shows.
df_test = pd.read_csv('blackfriday_test.csv')
df_test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [6]:
df_test.shape

(233599, 11)

- The dataset here is a sample of the transactions made in a retail store.
- The store wants to know better the customer purchase behaviour against different products.
- Specifically, here the problem is a regression problem where we are trying to predict the dependent variable (the amount of purchase) with the help of the information contained in the other variables.
- There are seven categorical variables to analyse.
  
Let us list some points that can be addressed with the analysis.

- Understanding the customers on the basis of their purchasing habits.
- Understanding the purchasing habits according to Age group, Occupation and City_Category.
- The segmented groups of users above can then be used to model the data and to predict the purchase spend for each customer. Let's dive in by understanding the data.

In [7]:
# Stack the train and test rows into one frame so that every preprocessing
# step below is applied to both splits consistently. Test rows have no
# Purchase value, so that column is NaN for them after the concatenation.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement, and ignore_index=True rebuilds a clean 0..n-1 index
# (equivalent to the former reset_index(drop=True)).
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370.0
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422.0
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057.0
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969.0


In [8]:
## basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783667 entries, 0 to 783666
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     783667 non-null  int64  
 1   Product_ID                  783667 non-null  object 
 2   Gender                      783667 non-null  object 
 3   Age                         783667 non-null  object 
 4   Occupation                  783667 non-null  int64  
 5   City_Category               783667 non-null  object 
 6   Stay_In_Current_City_Years  783667 non-null  object 
 7   Marital_Status              783667 non-null  int64  
 8   Product_Category_1          783667 non-null  int64  
 9   Product_Category_2          537685 non-null  float64
 10  Product_Category_3          237858 non-null  float64
 11  Purchase                    550068 non-null  float64
dtypes: float64(3), int64(4), object(5)
memory usage: 71.7+ MB


In [9]:
df.tail()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
783662,1006036,P00118942,F,26-35,15,B,4+,1,8,,,
783663,1006036,P00254642,F,26-35,15,B,4+,1,5,8.0,,
783664,1006036,P00031842,F,26-35,15,B,4+,1,1,5.0,12.0,
783665,1006037,P00124742,F,46-50,1,C,4+,0,10,16.0,,
783666,1006039,P00316642,F,46-50,0,B,4+,1,4,5.0,,


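**Note:** There are null values in Product_Category_2 and Product_Category_3. Purchase is also null for the rows that come from the test set, which has no target column.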

In [None]:
df.describe()

**Note:** The mean value of Product_Category_2 is 9.8 and that of Product_Category_3 is 12.6; we could use these to fill the missing values in the two columns if that turns out to be more suitable.

- Dropping the columns that intuitively should not impact the purchase outcome, i.e. User_ID and Product_ID.

In [None]:
# Remove the two identifier columns and confirm the remaining schema.
id_columns = ['User_ID', 'Product_ID']
df = df.drop(columns=id_columns)
df.info()

In [None]:
df.head()

- Columns of type object have to be converted to numeric values before the machine learning algorithms used later can be run on them.

- First we'll handle the Gender column. There are two ways to do it: using the `get_dummies` function, or simply mapping each category to an integer (also known as label encoding).

In [None]:
# Alternative (one-hot encoding):
#   df['Gender'] = pd.get_dummies(df['Gender'], drop_first=True)
# Here the two categories are mapped directly to integers instead: 'F' -> 0, 'M' -> 1.
# Series.map yields NaN for any value that is not a key of the dict, so re-running
# this cell on the already-encoded column would turn every entry into NaN.
df['Gender'] = df['Gender'].map({'F':0,'M':1})
df.head()

- Second, we apply the same kind of modification to the Age column, which is also of type object and additionally contains a special character (+).

In [None]:
df['Age'].unique()

In [None]:
# The age brackets are ordered, so encode them as increasing integer ranks
# (the alternative would be one-hot encoding via pd.get_dummies(df['Age'])).
age_brackets = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
age_rank = {bracket: rank for rank, bracket in enumerate(age_brackets, start=1)}
df['Age'] = df['Age'].map(age_rank)

In [None]:
df['Age'].unique()

In [None]:
df.head()

- Third, we apply the same kind of modification to the City_Category column (type object), this time using the `pd.get_dummies` function.

In [None]:
# One-hot encode City_Category; drop_first=True omits the indicator column
# for the first category level, leaving one column per remaining level.
df_city = pd.get_dummies(df['City_Category'],drop_first=True)

In [None]:
df_city.head()

In [None]:
# Give the indicator columns descriptive names.
# NOTE: 'City_category-C' is spelled with a lowercase 'c' in "category", unlike
# 'City_Category-B'; the later astype(int) cell refers to it by this exact spelling.
df_city=df_city.rename(columns={'B':'City_Category-B','C':'City_category-C'})

In [None]:
df_city.head()

In [None]:
# Attach the city indicator columns next to the existing features
# (axis=1 concatenates column-wise, aligning rows on the index).
df = pd.concat([df,df_city],axis=1)
df.head()

In [None]:
# The original City_Category column is now redundant with its indicator columns.
df = df.drop(columns='City_Category')

In [None]:
df.head()

- Before dealing with the remaining object-type column, we'll handle the missing values.

In [None]:
df.isnull().sum()

In [None]:
# Let's replace missing values
df['Product_Category_2'].unique() #-- discrete features - so will handle it base on type of it's features

- Here we fill the missing values with the mode, although filling them with the mean value would also be possible.

In [None]:
df['Product_Category_2'].value_counts()

In [None]:
df['Product_Category_2'].mode()[0]

In [None]:
# Impute missing Product_Category_2 entries with the column's most frequent value.
pc2_mode = df['Product_Category_2'].mode()[0]
df['Product_Category_2'] = df['Product_Category_2'].fillna(pc2_mode)

In [None]:
df['Product_Category_2'].isnull().sum()

In [None]:
## Product_category_3 replace missing values
df['Product_Category_3'].unique()

In [None]:
df['Product_Category_3'].value_counts()

In [None]:
df['Product_Category_3'].mode()[0]

In [None]:
# Impute missing Product_Category_3 entries with the column's most frequent value.
pc3_mode = df['Product_Category_3'].mode()[0]
df['Product_Category_3'] = df['Product_Category_3'].fillna(pc3_mode)

In [None]:
df.head()

- Let's Continue - Modification in Stay_In_Current_City_Years column

In [None]:
df['Stay_In_Current_City_Years'].unique()

In [None]:
# Strip the trailing '+' from values such as '4+' so the column can be cast to int.
# '+' is a regex quantifier and the default of `regex` differs between pandas
# versions, so request a literal (non-regex) replacement explicitly.
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].str.replace('+','',regex=False)

In [None]:
df.head()

In [None]:
# Convert the cleaned strings to integers. This relies on the '+' suffix
# having been removed in the earlier cell; a value like '4+' cannot be cast to int.
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype(int)

In [None]:
# Cast both city indicator columns to plain integers in one step.
city_dummy_cols = ['City_Category-B', 'City_category-C']
df[city_dummy_cols] = df[city_dummy_cols].astype(int)

-

### Exploratory data analysis supported by data visualisations.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, ax=ax)

The key takeaways from the above plot are the positive correlation coefficients between Purchase and the following three features:

- Occupation
- Stay_In_Current_City_Years
- Marital Status

An increase in any of the above three features is likely to be associated with a higher purchase amount from the customer.

In [None]:
# Mean Purchase by Marital_Status, one facet per Stay_In_Current_City_Years value.
# An explicit `order` pins each marital-status level to the same bar position in
# every facet; seaborn warns that mapping barplot through FacetGrid without it
# can produce an incorrect plot.
marital_levels = sorted(df["Marital_Status"].unique())
g = sns.FacetGrid(df,col="Stay_In_Current_City_Years")
g.map(sns.barplot, "Marital_Status", "Purchase", order=marital_levels);

- It is difficult to conclude anything from the above visualisation, but it might be useful to analyse whether the trend differs across the different cities.

In [None]:
# Hexbin joint plot of Occupation against Purchase, with the marginal
# distribution of each variable on the sides.
sns.jointplot(x='Occupation',y='Purchase',
              data=df, kind='hex')

- The first insight is that most purchases fall between 5000 and 10000.
- The next important insight is which occupations lead to the highest number of purchases. In this case it is occupation 4, as listed in the dataset, closely followed by 0 and 7.
- One can imagine that the store could run targeted advertisements next time for people with the occupations listed above, as they are more likely to spend within the above purchase range.

In [None]:
## Visualization of Purchase with Age
# Mean Purchase per Age level, split by Gender. x and y are passed by keyword:
# seaborn 0.12+ no longer accepts them as positional arguments.
plt.figure(figsize=(8, 6))
ax = sns.barplot(x='Age', y='Purchase', hue='Gender', data=df)
# Place the legend outside the axes so it does not cover the bars.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))


- From the above plot we can conclude that the average purchase amount of men is higher than that of women.

In [None]:
df['Occupation'].unique()

In [None]:
## Visualization of Purchase with occupation
# Mean Purchase per Occupation code, split by Gender. x and y are passed by
# keyword: seaborn 0.12+ no longer accepts them as positional arguments.
plt.figure(figsize=(10,8))
ax = sns.barplot(x='Occupation', y='Purchase', hue='Gender', data=df)
# Place the legend outside the axes so it does not cover the bars.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

**Note:** As we can see, it is difficult to conclude anything from the above visualisation, but it might be useful to analyse whether the trend differs across the different categories of occupation.

- Let's look at the relationship between each value of 'Product_Category_1' and 'Purchase'.

In [None]:
# Mean Purchase per Product_Category_1 value, split by Gender. x and y are passed
# by keyword: seaborn 0.12+ no longer accepts them as positional arguments.
plt.figure(figsize=(8,6))
ax = sns.barplot(x='Product_Category_1', y='Purchase', hue='Gender', data=df)
# Place the legend outside the axes so it does not cover the bars.
ax.legend(loc='center left',bbox_to_anchor=(1.0,0.5))

In [None]:
# Mean Purchase per Product_Category_2 value, split by Gender (keyword x/y are
# required by seaborn 0.12+).
sns.barplot(x='Product_Category_2', y='Purchase', hue='Gender', data=df)

In [None]:
# Mean Purchase per Product_Category_3 value, split by Gender (keyword x/y are
# required by seaborn 0.12+).
sns.barplot(x='Product_Category_3', y='Purchase', hue='Gender', data=df)

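**Note:** From the above plots we conclude that Product_Category_1 is sold the most of all the product categories. Keep in mind, however, that the bar plots show the average purchase amount per category value rather than the number of sales, so this conclusion should be confirmed with the category counts.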

In [None]:
# Rows without a Purchase value are the ones that came from the test file.
is_unlabelled = df['Purchase'].isna()
df_test = df[is_unlabelled]

In [None]:
# Rows with a known Purchase value form the labelled training data.
df_train = df[df['Purchase'].notna()]

In [None]:
# Feature matrix: every column of the labelled data except the target.
X = df_train.drop(columns='Purchase')

In [None]:
X.head()

In [None]:
X.shape

In [None]:
y = df_train['Purchase']

In [None]:
y.shape

In [None]:
y

In [None]:
# Inspect the dtypes of the feature matrix before splitting. X_train does not
# exist yet at this point (train_test_split runs in the next cell), so calling
# X_train.info() here fails on a fresh kernel; X has the same columns.
X.info()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the labelled rows for evaluation; random_state=42 fixes the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

In [None]:
#Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only and reuse its statistics for the
# hold-out features, so no information from the hold-out set enters the scaling.
sc = StandardScaler()
# NOTE: the lowercase x_train / x_test hold the scaled arrays, while the uppercase
# X_train / X_test remain the unscaled DataFrames; the names differ only by case.
x_train = sc.fit_transform(X_train)
x_test = sc.transform(X_test)

In [None]:
## LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the standardised training matrix (x_train), not the raw X_train: the
# prediction cell feeds the model the standardised x_test, and the coefficients
# are only meaningful when training and prediction inputs share the same scale.
lm = LinearRegression()
lm.fit(x_train,y_train)

In [None]:
# Report the fitted intercept and one coefficient per feature column.
print('Intercept parameter:', lm.intercept_)
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=X_train.columns)
print(coeff_df)

In [None]:
# Predict Purchase for the hold-out rows, using the standardised features
# (x_test is the scaler's output, not the raw X_test DataFrame).
predictions = lm.predict(x_test)
print("Predicted purchases (in dollars) for new costumers:", predictions)

In [None]:
from sklearn import metrics

# Hold-out error metrics. The MSE is computed once and reused for the RMSE,
# which is easier to interpret because it is expressed in the units of y.
MSE = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', MSE)
print('RMSE:', np.sqrt(MSE))