In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib as mlp
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
input_dir_entries = os.listdir("../input")  # names of everything inside the input directory
print(input_dir_entries)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the Black Friday purchase records into `df`, then print a summary of
# the frame via DataFrame.info().
black_friday_csv = "../input/BlackFriday.csv"
df = pd.read_csv(black_friday_csv)
df.info()

In [None]:
# Preview the first five rows. As the preview shows, some values are missing
# in the product-category columns.
df.head(n=5)

In [None]:
# The owner of the kernel I got this dataset from devised the following scheme, which outlines
# what this dataset could be useful for:
# - Analyzing data for research 
#     ○ Who is more likely to spend more in a black Friday sale?
#         § Men or Women.
#         § Married or Unmarried
#         § Old Residents or new residents 
#     ○ Which type of products are more likely to be sold in a sale like black Friday?
#     ○ Which type of products are common among men and which among women?
# - Prediction/classification
#     ○ Predict Age of a customer based on the rest of available data.
#     ○ Predict gender of a customer based on rest of data.
#     ○ Predict amount of purchase based on rest of data.
# - Recommendation
#     ○ Suggest products based on age.
#     ○ Suggest products based on location customers belong to.
#     ○ Suggest products based on gender.

In [None]:
# Let's start our data exploration!

# First, let's find out which columns have missing data: the result holds one
# boolean per column, True when that column contains at least one missing value.
df.isna().any(axis=0)

In [None]:
# As I noticed earlier, the columns 'Product_Category_2' and 'Product_Category_3' have at least one missing
# value. Therefore, let's take a look at their values and see which value we can fill in to replace the
# missing data.
print("Product_Category_2", df["Product_Category_2"].unique())
print("Product_Category_3", df["Product_Category_3"].unique())

# All of the values are integers. Let's arbitrarily fill the null values with 0s. (I will try to fill
# with the average of the numbers to see if it improves the prediction of any kind of class in the
# future).
# The fill is limited to the two product-category columns so that a missing value in any other
# column is never silently turned into a 0.
# The type of both columns is float (because of the NaNs). Once the gaps are filled we can change it
# to int (because we know they hold ints).
sparse_category_cols = ["Product_Category_2", "Product_Category_3"]
df[sparse_category_cols] = df[sparse_category_cols].fillna(0).astype(int)

# Let's also already drop some columns that I'm not gonna use.
# `errors="ignore"` keeps this cell re-runnable: on a second run the columns are already gone, and
# a plain drop would raise a KeyError.
df = df.drop(columns=["User_ID", "Product_ID"], errors="ignore")

In [None]:
# Let's take a look at the genders of people who bought at the black Friday in this store.
# The column is passed by name through `x=`/`data=`: seaborn 0.11 deprecates passing it positionally,
# and from 0.12 on the first positional argument is interpreted as `data` instead of `x`.
sns.countplot(x="Gender", data=df)
# Conclusion: A lot more purchases were made by males during the black Friday at this store. However,
# it can also mean that, if there were couples at the store, the men were mostly the ones who paid.

In [None]:
# Let's take a look at the age groups who bought at the black Friday in this store.
# The column is passed by name through `x=`/`data=` (passing it positionally is deprecated in
# seaborn 0.11 and misinterpreted as `data` from 0.12 on). `order` sorts the age-bracket labels so
# the bars run from the youngest to the oldest group instead of in order of first appearance.
sns.countplot(x="Age", data=df, order=sorted(df["Age"].unique()))
# Conclusion: Most of the people who bought during black Friday in this store were in the range
# from 26-35 years old.

In [None]:
# Let's take a look at each gender in each different age group's purchases.
# Columns are passed by name through `x=`/`hue=`/`data=` (passing `x` positionally is deprecated in
# seaborn 0.11 and misinterpreted as `data` from 0.12 on). `order` sorts the age-bracket labels so
# the groups run from the youngest to the oldest.
sns.countplot(x="Age", hue="Gender", data=df, order=sorted(df["Age"].unique()))
# Conclusion: It's clear: men made a lot more purchases in every age group. (This plot counts
# purchases; it does not show the amount spent.)

In [None]:
# Let's now find out which men and women were actually married and explore this new data with
# age groups.
# Build the "<Gender>_<Marital_Status>" label with vectorised string concatenation; this yields the
# same labels as formatting each row individually, without a Python-level call per row.
df["combined_G_M"] = df["Gender"].astype(str) + "_" + df["Marital_Status"].astype(str)
print(df["combined_G_M"].unique())
# Columns are passed by name through `x=`/`hue=`/`data=` (passing `x` positionally is deprecated in
# seaborn 0.11 and misinterpreted as `data` from 0.12 on). `order` sorts the age-bracket labels.
sns.countplot(x="Age", hue="combined_G_M", data=df, order=sorted(df["Age"].unique()))
# Conclusion: No bars for married men or women in the range of 0-17 years old, which makes sense, and
# married men made a lot more purchases than married women in the range 46-55+


In [None]:
# Count the rows for every ('Occupation', 'Product_Category_1') pair. After .count() each remaining
# column holds the number of non-null entries in the group. 'Product_Category_1' is then moved back
# from the index into a regular column, leaving 'Occupation' as the (repeated) index.
occupation_category_groups = df.groupby(["Occupation", "Product_Category_1"])
df_by_occupation_and_categories = (
    occupation_category_groups
    .count()
    .reset_index(level="Product_Category_1")
)
df_by_occupation_and_categories

In [None]:
# Here, I'll be taking a look at the proportion of purchases in each occupation that fall into each
# id of the product category 1. One pie chart is drawn per occupation.

# Changing the color of our graph (so that it doesn't repeat).
# Every category id gets one fixed colour, so a given id looks the same in every pie even when an
# occupation has no purchases in some of the categories.
cmap = plt.get_cmap("terrain")
category_ids = sorted(df["Product_Category_1"].unique())
num_of_product_ids_in_categ_1 = len(category_ids)
colors = [cmap(i) for i in np.linspace(0, 1, num_of_product_ids_in_categ_1)]
color_by_category = dict(zip(category_ids, colors))

# Plotting data.
plottingFor = "Occupation {}"
title = "Product_Category_1"
# Iterate over the occupation codes actually present instead of assuming they are exactly 0..n-1.
occupations = sorted(df_by_occupation_and_categories.index.unique())
for occupation in occupations:
    # .loc[[...]] always returns a DataFrame, even when the occupation has a single category row
    # (a plain [occupation] lookup would return a scalar in that case).
    occupation_rows = df_by_occupation_and_categories.loc[[occupation]]
    # Categories bought by THIS occupation; they label the legend and select the wedge colours.
    categories = occupation_rows["Product_Category_1"].values
    # After the groupby-count every column holds the per-group row count; 'Gender' is used as
    # that count column. `.values` replaces Series.get_values(), which was removed in pandas 1.0.
    purchase_counts = occupation_rows["Gender"].values

    fig, ax = plt.subplots()
    ax.pie(purchase_counts,
           autopct="%1.1f%%",
           colors=[color_by_category[category] for category in categories])
    ax.legend(categories,
              title=title,
              bbox_to_anchor=(1.2, 1))
    ax.set_title(plottingFor.format(occupation))
    plt.show()
    plt.close(fig)  # release the figure; one is created per occupation
# Conclusion: regardless of customers' occupations, the products in the category 1 with ids 1, 5, 6 are the most bought.