###  Data Analysis of Black Friday

In [1]:
# Load EDA
import pandas as pd 
import numpy as np

In [2]:
# Load Data Viz
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
# Load the Black Friday dataset from the working directory
df = pd.read_csv("BlackFriday.csv")

In [None]:
# Preview the first rows
df.head()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Column names
df.columns

In [None]:
# Distinct city categories
df['City_Category'].unique()

In [None]:
# Number of rows per city category
df['City_Category'].value_counts()

In [11]:
# Same counts as a one-column DataFrame (city categories sit in the index)
cc_df = df['City_Category'].value_counts().to_frame()

In [12]:
# Move the city categories out of the index into a regular column
cc_df = cc_df.reset_index()

In [None]:
cc_df

In [None]:
# Pass the column as x=: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x=df['City_Category'])

In [None]:
df.columns

In [None]:
# Number of rows per gender
df['Gender'].value_counts()

In [None]:
# Number of rows per occupation code
df['Occupation'].value_counts()

In [None]:
# Number of rows per age bracket
df['Age'].value_counts()

In [None]:
# Purchase column (used as the regression target further down)
df['Purchase']

In [None]:
df.columns

In [None]:
# Distinct marital-status values
df['Marital_Status'].unique()

In [22]:
def count_plot(dataframe, column_name, title=None, hue=None):
    '''
    Draw a seaborn count plot for one column of a dataframe.

    Input:
        dataframe: DataFrame holding the data to plot
        column_name: column whose value counts are drawn along the x axis
        title: optional title for the graph; no title is set when None
        hue: optional column used to split every bar by category
    Output: the matplotlib Axes the count plot was drawn on
    '''
    ax = sns.countplot(data=dataframe, x=column_name, hue=hue)
    # Set the title on the Axes seaborn actually drew on rather than on
    # whatever pyplot considers the current axes.
    if title is not None:
        ax.set_title(title)
    return ax

In [None]:
# Age distribution split by gender. 'Gender' has to be passed as hue=:
# the third positional parameter of count_plot is the title, not the hue.
count_plot(df, 'Age', hue='Gender')

In [None]:
df.head()

In [None]:
# How many rows per gender and marital status
df.groupby(['Gender','Marital_Status']).size().to_frame()

In [26]:
# Same grouping, kept in a variable for further processing
mr_gender = df.groupby(['Gender','Marital_Status']).size().to_frame()

In [27]:
# Turn the Gender / Marital_Status group keys into regular columns
marital_df =mr_gender.reset_index()

In [None]:
marital_df.columns

In [29]:
# The size column is labelled 0 after to_frame(); give it a readable name
marital_df.rename(columns={0:'Counts'},inplace=True)

In [None]:
marital_df

In [None]:
df.shape

In [None]:
# Distinct age brackets
df['Age'].unique()

In [33]:
# Descriptive labels for the age brackets.
# NOTE(review): there is no entry for the '51-55' bracket that age_dict
# (defined further down) encodes, and age_map is not referenced anywhere
# else in the visible code - confirm whether it is still needed.
age_map = {"Children":"0-17","Young Adult":"18-25","Young Adult(Prime)":"26-35","Middle Adult":"36-45","Late Adult":"46-50","Early Old Age":"55+"}

In [None]:
df.head()

In [None]:
# Check for unique Products
df['Product_ID'].unique().tolist()

In [36]:
# Convert to Dict
def make_dict(col):
    """Return a dict giving each distinct value of col its 0-based order of first appearance."""
    distinct_values = col.unique()
    codes = range(len(distinct_values))
    return dict(zip(distinct_values, codes))

In [37]:
# First-appearance codes for the age brackets
# (replaced by the hand-written mapping two cells below)
age_dict = make_dict(df['Age'])

In [None]:
age_dict

In [39]:
# Hand-written mapping so the codes follow age order
# (1 = youngest bracket, 7 = oldest bracket)
age_dict = {'0-17': 1,
 '55+': 7,
 '26-35': 3,
 '46-50': 5,
 '51-55': 6,
 '36-45': 4,
 '18-25': 2}

In [None]:
age_dict

In [41]:
# First-appearance codes for the city categories (replaced in the next cell)
city_dict = make_dict(df['City_Category'])

In [42]:
# Fixed codes in alphabetical order: A=0, B=1, C=2
city_dict = {'A': 0, 'C': 2, 'B': 1}

In [None]:
# Distinct values of the years-in-current-city column
df['Stay_In_Current_City_Years'].unique()

In [None]:
# Map the open-ended '4+' bucket to 4 so the column can be cast to int later.
# The result is assigned back: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which leaves
# df unchanged when copy-on-write is enabled.
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].replace('4+',4)

In [None]:
# Distinct occupation codes
df['Occupation'].unique()

In [None]:
# Distinct marital-status values
df['Marital_Status'].unique()

In [None]:
# Missing values in Product_Category_1
df['Product_Category_1'].isnull().sum()

In [None]:
# Missing values in Product_Category_2
df['Product_Category_2'].isnull().sum()

In [None]:
# Distinct Product_Category_2 values
df['Product_Category_2'].unique()

In [None]:
# Fill missing Product_Category_2 values with the most frequent category.
# Assigned back instead of fillna(..., inplace=True) on the column selection,
# which is chained assignment and does not modify df under copy-on-write.
df['Product_Category_2'] = df['Product_Category_2'].fillna(df['Product_Category_2'].value_counts().idxmax())

In [None]:
# Fill missing Product_Category_3 values with the most frequent category.
# Assigned back instead of fillna(..., inplace=True) on the column selection,
# which is chained assignment and does not modify df under copy-on-write.
df['Product_Category_3'] = df['Product_Category_3'].fillna(df['Product_Category_3'].value_counts().idxmax())

In [None]:
# Check For Missing Values
df.isnull().sum()

In [None]:
# Column dtypes before the categorical columns are encoded
df.dtypes

In [54]:
# Take a real copy: plain assignment only aliases df, so the in-place
# encoding of df further down would silently change df_clean as well.
df_clean = df.copy()

In [55]:
# Save the data as it stands after the fillna steps; the row index is
# written too because index=False is not passed
df_clean.to_csv("Black_Friday_No_Missing_Value.csv")

In [56]:
# Encode Gender as F=0, M=1 (values missing from the mapping become NaN)
df['Gender'] = df['Gender'].map({"F":0,"M":1})

In [None]:
df.head()

In [58]:
# Encode the age brackets with the ordinal codes in age_dict
df['Age'] = df['Age'].map(age_dict)

In [None]:
age_dict

In [60]:
# Encode the city categories with the codes in city_dict
df['City_Category'] = df['City_Category'].map(city_dict)

In [None]:
df.head()

In [None]:
# Check the dtypes after encoding
df.dtypes

In [63]:
# Cast the years-in-current-city column to int
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype(int)

In [None]:
df.dtypes

In [65]:
# Save the encoded data (row index included)
df.to_csv("Black_Friday_Data_Encoded.csv")

In [66]:
### Machine Learning

In [None]:
df.columns

In [68]:
# Candidate feature columns plus the Purchase target, for the correlation heatmap
df2 = df[['Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase']]

In [None]:
# Pairwise correlations, annotated with their values
sns.heatmap(df2.corr(),annot=True)

In [70]:
# Remove User ID and Product ID


In [71]:
# Feature matrix: the df2 columns without the Purchase target
Xfeatures = df[['Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3']]

In [None]:
Xfeatures

In [73]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [74]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows influence the scaling statistics - consider fitting on
# the training rows only. The fitted scaler is also not saved anywhere in the
# visible code, although models trained on the scaled data are.
X = scaler.fit_transform(Xfeatures)

In [None]:
Xfeatures.head()

In [None]:
X

In [None]:
Xfeatures.columns

In [78]:
# Wrap the scaled array in a DataFrame with the original column names
X2 = pd.DataFrame(X,columns=Xfeatures.columns)

In [None]:
X2.head()

In [80]:
from sklearn.model_selection import train_test_split

In [81]:
# Regression target
y = df['Purchase']

In [82]:
# split into training and testing sets
# Scaled features, 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X2, y, test_size=0.2, random_state=42)

In [83]:
# split into training and testing sets
# Unscaled features, same test_size and random_state as the scaled split
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    Xfeatures, y, test_size=0.2, random_state=42)

In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Linear regression fitted on the scaled training features
lr_model = LinearRegression()
lr_model.fit(X_train,y_train)

In [86]:
from sklearn.metrics import accuracy_score,mean_squared_error,r2_score

In [87]:
# Predictions for the held-out test rows
y_pred_lr = lr_model.predict(X_test)

In [None]:
y_pred_lr

In [89]:
actual_df = pd.DataFrame(y_pred_lr,columns=['Prediction'])

In [90]:
actual_df['Actual'] = y_test

In [None]:
actual_df

In [None]:
y_test.iloc[0]

In [None]:
lr_model.predict(np.array(X_test.iloc[0]).reshape(1,-1))

In [None]:
print("Linear Regression: ")
print("RMSE:",np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print("R2 score:", r2_score(y_test, y_pred_lr))

In [95]:
import joblib

In [96]:
# Persist the linear model trained on scaled features. The with-block closes
# the file even when dump raises.
with open("lr_bf_sales_model_23_oct.pkl","wb") as lr_model_file:
    joblib.dump(lr_model,lr_model_file)

In [97]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so refitting gives the same model; the train/test splits in
# this notebook are pinned with random_state=42 as well.
dtree = DecisionTreeRegressor(random_state=42)

In [None]:
# Fit the decision tree on the scaled training features
dtree.fit(X_train,y_train)

In [99]:
# Predictions for the held-out test rows
y_pred_dt = dtree.predict(X_test)

In [None]:
# Test-set error metrics for the decision tree on scaled features
print("Decision Tree Regression: ")
print("RMSE:",np.sqrt(mean_squared_error(y_test, y_pred_dt)))
print("R2 score:", r2_score(y_test, y_pred_dt))

In [101]:
# Persist the decision tree trained on scaled features. The with-block closes
# the file even when dump raises.
with open("dt_bf_sales_model_23_oct.pkl","wb") as dt_model_file:
    joblib.dump(dtree,dt_model_file)

In [None]:
# Linear regression on the unscaled features, for comparison with lr_model
lr_model2 = LinearRegression()
lr_model2.fit(X_train2,y_train2)

In [103]:
# Predictions for the held-out (unscaled) test rows
y_pred_lr2 = lr_model2.predict(X_test2)

In [None]:
# Test-set error metrics for the linear model on unscaled features
print("Linear Regression: ")
print("RMSE:",np.sqrt(mean_squared_error(y_test2, y_pred_lr2)))
print("R2 score:", r2_score(y_test2, y_pred_lr2))

In [105]:
# Persist the linear model trained on unscaled features. The with-block
# closes the file even when dump raises.
with open("lr2_bf_sales_model_23_oct.pkl","wb") as lr_model2_file:
    joblib.dump(lr_model2,lr_model2_file)

In [None]:
# Decision tree on the unscaled features. Seeded so refitting gives the same
# model; the train/test splits in this notebook use random_state=42 as well.
dtree2 = DecisionTreeRegressor(random_state=42)
dtree2.fit(X_train2,y_train2)

In [107]:
# Predictions for the held-out (unscaled) test rows
y_pred_dt2 = dtree2.predict(X_test2)

In [None]:
# Test-set error metrics for the decision tree on unscaled features
print("Decision Tree Regression: ")
print("RMSE:",np.sqrt(mean_squared_error(y_test2, y_pred_dt2)))
print("R2 score:", r2_score(y_test2, y_pred_dt2))

In [109]:
# Persist the decision tree trained on unscaled features. The with-block
# closes the file even when dump raises.
with open("dt2_bf_sales_model_23_oct.pkl","wb") as dt2_model_file:
    joblib.dump(dtree2,dt2_model_file)