## Black Friday Dataset

### Cleaning and Preparing the Data for Model Training

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings

In [2]:
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): with no category/module filter this also hides deprecation
# notices from pandas and other libraries — confirm that is intended.
warnings.filterwarnings(action='ignore')

#### Reformatting the folder path

In [3]:
## Reformatting the folder path

def path_reformate(x):
    """Return the path string ``x`` with every backslash replaced by ``/``.

    Parameters
    ----------
    x : str
        A filesystem path, possibly using Windows-style separators.

    Returns
    -------
    str
        The same path written with forward slashes only.
    """
    return x.replace("\\", "/")


# Current working directory, normalised to forward slashes for display.
current_dir = path_reformate(os.getcwd())
current_dir

'C:/Users/mayal/Desktop/Data_Science/Git_folders/EDA_Feature_Engineering'

#### Loading the DataFrames

In [4]:
# Glob pattern selecting every CSV file in the Black_friday data folder.
# NOTE(review): this is an absolute, machine-specific location (and it has a
# doubled "/" before the pattern); consider a configurable, relative data dir.
dataset_path = 'C:/Users/mayal/Desktop/Data_Science/data_sets/Black_friday//*.csv'

def dataframe(folder_path):
    """Read every file matching a glob pattern into a dict of DataFrames.

    Parameters
    ----------
    folder_path : str
        Glob pattern (for example ``"some/dir/*.csv"``) selecting the files
        to load with ``pd.read_csv``.

    Returns
    -------
    dict
        Maps each file's base name, cut at its first ``.``, to the DataFrame
        read from that file. Entries are inserted in sorted path order, so
        iterating the dict (or its ``.values()``) is deterministic.
    """
    frames = {}
    # glob.glob yields matches in arbitrary filesystem order; sorting makes the
    # insertion order stable for callers that unpack the result positionally.
    for file in sorted(glob.glob(folder_path)):
        file = file.replace("\\", "/")
        df_name = file.split("/")[-1].split(".")[0]
        frames[df_name] = pd.read_csv(file)

    return frames

   
# The dict keeps the order in which dataframe() inserted the files, so this
# positional unpacking assumes exactly two CSVs with the test set first.
csv_frames = dataframe(dataset_path)
df_test, df_train = csv_frames.values()

In [5]:
df_train.head(2)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200


In [6]:
df_test.head(2)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,


In [7]:
# Stack train on top of test. DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0; pd.concat is the supported equivalent. The original row
# labels are kept (no ignore_index), exactly as append did.
df = pd.concat([df_train, df_test])

In [8]:
df.shape

(783667, 12)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     783667 non-null  int64  
 1   Product_ID                  783667 non-null  object 
 2   Gender                      783667 non-null  object 
 3   Age                         783667 non-null  object 
 4   Occupation                  783667 non-null  int64  
 5   City_Category               783667 non-null  object 
 6   Stay_In_Current_City_Years  783667 non-null  object 
 7   Marital_Status              783667 non-null  int64  
 8   Product_Category_1          783667 non-null  int64  
 9   Product_Category_2          537685 non-null  float64
 10  Product_Category_3          237858 non-null  float64
 11  Purchase                    550068 non-null  float64
dtypes: float64(3), int64(4), object(5)
memory usage: 77.7+ MB


In [10]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,783667.0,783667.0,783667.0,783667.0,537685.0,237858.0,550068.0
mean,1003029.0,8.0793,0.409777,5.366196,9.844506,12.668605,9263.968713
std,1727.267,6.522206,0.491793,3.87816,5.089093,4.12551,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001519.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003075.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


#### Dropping the User_ID column (axis=1 means columns, axis=0 means rows)


In [11]:
## Dropping the User_ID column (columns= already selects axis 1)

# Rebinding the result instead of using inplace=True, together with
# errors="ignore", keeps this cell safe to re-run once the column is gone.
df = df.drop(columns="User_ID", errors="ignore")

In [12]:
## Numerical Value 

df.select_dtypes(include=["int64","float64"]).columns

Index(['Occupation', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [13]:
## Categorical Feature

df.select_dtypes(include=["object"]).columns

Index(['Product_ID', 'Gender', 'Age', 'City_Category',
       'Stay_In_Current_City_Years'],
      dtype='object')

#### Handling the Categorical Features

In [14]:
pd.get_dummies(df.Gender)

Unnamed: 0,F,M
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
233594,1,0
233595,1,0
233596,1,0
233597,1,0


#### Converting the Gender column into a numerical column


In [15]:
## Converting the Gender column into a numerical column (F = 0, M = 1)
gender_codes = {'F': 0, 'M': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe it
# out. Only encode while the column still holds the raw (non-numeric) labels.
if not pd.api.types.is_numeric_dtype(df["Gender"]):
    # Approach 1: plain list comprehension (preview on the first rows).
    print([0 if sex == 'F' else 1 for sex in df["Gender"].head()])

    # Approach 2: dictionary lookup through Series.map (same preview).
    print(df["Gender"].head().map(gender_codes))

    ## Updating the Gender column with M = 1 and F = 0
    df["Gender"] = df["Gender"].map(gender_codes)

[0, 0, 0, 0, 1]
0    0
1    0
2    0
3    0
4    1
Name: Gender, dtype: int64


In [16]:
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,0,0-17,10,A,2,0,3,,,8370.0
1,P00248942,0,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,P00087842,0,0-17,10,A,2,0,12,,,1422.0
3,P00085442,0,0-17,10,A,2,0,12,14.0,,1057.0
4,P00285442,1,55+,16,C,4+,0,8,,,7969.0


#### Working with the Age feature

In [17]:
df.Age.unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [18]:
pd.get_dummies(df.Age).head()

Unnamed: 0,0-17,18-25,26-35,36-45,46-50,51-55,55+
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,0,0,0,0,0,0,1


##### Assigning ordinal values manually to the Age feature

In [19]:
# Rank the distinct Age labels 1..N following their sorted order.
age_labels = sorted(df.Age.unique())
odinal_var = dict(zip(age_labels, range(1, len(age_labels) + 1)))
odinal_var

{'0-17': 1,
 '18-25': 2,
 '26-35': 3,
 '36-45': 4,
 '46-50': 5,
 '51-55': 6,
 '55+': 7}

In [20]:
# df.Age = df.Age.map(odinal_var)
# df.Age.unique()

In [21]:
df.Age.unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

##### Label-encoding approach: assigning values to the Age feature

In [23]:
from sklearn import preprocessing

# Learn the distinct Age labels first, then overwrite the column with the
# integer code assigned to each label.
lable_encoder = preprocessing.LabelEncoder()
lable_encoder.fit(df["Age"])
df["Age"] = lable_encoder.transform(df["Age"])
df["Age"].unique()

array([0, 6, 2, 4, 5, 3, 1])