# 4.9_2. Data Visualisations

Contents
1.  Import Libraries
2.  Import Dataframe
3.  Bar Chart Days of Week
4.  Histogram Price Frequency
5.  Scatterplot Price
6.  Using a subset to create a Temporal Line Chart
7.  Histogram hour of day
8.  Bar chart customer profile
9.  Randomised sample for line chart
10. Line chart Dependents and Age
11. Scatterplot Income and Age
12. Export Dataframe

# 01. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Import Dataframe

In [None]:
# create path to folder
# (project root: the dataframe import and the chart exports in this notebook
# join their sub-folders onto this value with os.path.join)
path = r'C:\Users\mngun\Documents\11_2023_InstaCart_asket_Analysis'

In [None]:
# Build the location of the latest prepared pickle under the project folder,
# then load it as the working dataframe
prepared_pickle = os.path.join(path, '02_Data', 'Prepared Data', 'ords_prods_cust.pkl')
ords_prods_cust = pd.read_pickle(prepared_pickle)

In [None]:
# reviewing the imported dataframe: display its (rows, columns) dimensions
ords_prods_cust.shape

In [None]:
# reviewing the imported dataframe: preview its first rows
ords_prods_cust.head()

# 03. Bar Chart Days of Week

In [None]:
# Count the rows for each orders_day_of_week value and sort the counts by
# day index, so the bars are ordered by day rather than by frequency
dow_counts = ords_prods_cust['orders_day_of_week'].value_counts().sort_index()
dow_colors = ['#1f78b4', '#33a02c', '#e31a1c', '#ff7f00', '#6a3d9a', '#a6cee3', '#fdbf6f']

# Draw the bar chart and label it through the returned Axes
bar_dow = dow_counts.plot.bar(color=dow_colors)
bar_dow.set_title('Sum of Orders by Day of Week')
bar_dow.set_xlabel('Day of the Week')
bar_dow.set_ylabel('Number of Orders')

In [None]:
# Save the day-of-week bar chart as a PNG in the Visualisations folder
bar_dow_png = os.path.join(path, '04_Analysis', 'Visualisations', 'bar_dow_sorted.png')
bar_dow.figure.savefig(bar_dow_png)

# 04. Histogram Price Frequency

In [None]:
# Histogram of the prices column: 75 bins, sky-blue bars with black edges
price_values = ords_prods_cust['prices']
hist = price_values.plot.hist(bins=75, color='skyblue', edgecolor='black')

# Title and axis labels are set on the Axes returned by the plot call
hist.set_title('Frequency of Prices')
hist.set_xlabel('Price')
hist.set_ylabel('Frequency')

In [None]:
# Save the price histogram as a PNG in the Visualisations folder
hist_png = os.path.join(path, '04_Analysis', 'Visualisations', 'hist_prices.png')
hist.figure.savefig(hist_png)

# 05. Scatterplot Price

In [None]:
# Scatterplot with the prices column on both axes
scatterplot = sns.scatterplot(data=ords_prods_cust, x='prices', y='prices')

# Title and axis labels are set on the Axes returned by seaborn
scatterplot.set_title('Scatterplot of Prices')
scatterplot.set_xlabel('Prices')
scatterplot.set_ylabel('Prices')

In [None]:
# Save the price scatterplot as a PNG in the Visualisations folder
scatter_png = os.path.join(path, '04_Analysis', 'Visualisations', 'scatter_prices.png')
scatterplot.figure.savefig(scatter_png)

# 06. Using a subset to create a Temporal Line Chart

In [None]:
# assign a seed to the randomised distribution, so that the random
# numbers drawn afterwards (and hence the subset split) are repeatable
np.random.seed(5)

In [None]:
# Draw one random number per dataframe row and build a boolean mask that is
# True wherever the draw is at most 0.8
row_total = len(ords_prods_cust)
dev = np.random.rand(row_total) <= 0.8

In [None]:
# Split the dataframe with the random mask: rows where the mask is True go
# to the big subset, all remaining rows go to the small subset
big = ords_prods_cust.loc[dev]
small = ords_prods_cust.loc[np.logical_not(dev)]

In [None]:
# stating length of the full dataframe, for comparison with the subsets
len(ords_prods_cust)

In [None]:
# comparing the combined length of the subsets to ensure completeness:
# the sum should equal the length of the full dataframe
len(big) + len(small)

In [None]:
# Keep only the day-of-week and price columns of the small sample
dow_price_cols = ['orders_day_of_week', 'prices']
df_line = small[dow_price_cols]

In [None]:
# Line chart of prices against day of week, drawn from the sampled columns
line_2 = sns.lineplot(x='orders_day_of_week', y='prices', data=df_line)

# Title and axis labels are set on the Axes returned by seaborn
line_2.set_title('Orders by day of Week')
line_2.set_xlabel('Day of Week')
line_2.set_ylabel('Prices')

In [None]:
# Save the day-of-week price line chart as a PNG in the Visualisations folder
line_prices_png = os.path.join(path, '04_Analysis', 'Visualisations', 'line_prices.png')
line_2.figure.savefig(line_prices_png)

# 07. Histogram hour of day

In [None]:
# Histogram of order_hour_of_day: 24 bins, orange bars with black edges
order_hours = ords_prods_cust['order_hour_of_day']
hist_2 = order_hours.plot.hist(bins=24, color='#ff7f00', edgecolor='black')

# Title and axis labels are set on the Axes returned by the plot call
hist_2.set_title('Orders by Hour of Day')
hist_2.set_xlabel('Hour of Day')
hist_2.set_ylabel('Order Numbers')

Interpretation:
- The histogram plots the frequency of the hour of day at which each order in the dataframe was placed; the y-axis is shown in scientific notation.
- The majority of orders occur during the day, between 9am and 4pm, with 10am being the peak. The fewest orders are made at 3am.

In [None]:
# Save the order_hour_of_day histogram as a PNG in the Visualisations folder
hist_hour_png = os.path.join(path, '04_Analysis', 'Visualisations', 'hist_order_hour.png')
hist_2.figure.savefig(hist_hour_png)

# 08. Bar chart customer profile

In [None]:
# Count the rows for each loyalty_flag value and plot the counts as a bar
# chart, using a three-colour palette
loyalty_counts = ords_prods_cust['loyalty_flag'].value_counts()
loyalty_colors = ['#008080', '#FA8072', '#6A5ACD']
bar_loyalty = loyalty_counts.plot.bar(color=loyalty_colors)

# Title and axis labels are set on the Axes returned by the plot call
bar_loyalty.set_title('Frequency of Customer Profiles')
bar_loyalty.set_xlabel('Customer Profile')
bar_loyalty.set_ylabel('Orders')

In [None]:
# Save the loyalty bar chart as a PNG in the Visualisations folder;
# bbox_inches='tight' is passed through to savefig as in the original export
bar_loyalty_png = os.path.join(path, '04_Analysis', 'Visualisations', 'bar_loyalty.png')
bar_loyalty.figure.savefig(bar_loyalty_png, bbox_inches='tight')

# 09. Randomised sample for line chart

In [None]:
# Keep only the hour-of-day and price columns of the small random sample
hour_price_cols = ['order_hour_of_day', 'prices']
df_line_2 = small[hour_price_cols]

In [None]:
# Line chart of prices against order_hour_of_day, drawn from the sampled columns
line_hour = sns.lineplot(x='order_hour_of_day', y='prices', data=df_line_2)

# Title and axis labels are set on the Axes returned by seaborn
line_hour.set_title('Spending per hour of day')
line_hour.set_xlabel('Hour of Day')
line_hour.set_ylabel('Prices')

In [None]:
# Save the hour-of-day price line chart as a PNG in the Visualisations folder
line_hour_png = os.path.join(path, '04_Analysis', 'Visualisations', 'line_hour_price.png')
line_hour.figure.savefig(line_hour_png)

# 10. Line chart Dependents and Age

In [None]:
# Keep only the age and n_dependants columns of the small random sample
age_dependant_cols = ['age', 'n_dependants']
df_line_3 = small[age_dependant_cols]

In [None]:
# Line chart of n_dependants against age, drawn from the sampled columns
line_dependants = sns.lineplot(x='age', y='n_dependants', data=df_line_3)

# Title and axis labels are set on the Axes returned by seaborn
line_dependants.set_title('Number of Dependents and Customer Age')
line_dependants.set_xlabel('Customer Age')
line_dependants.set_ylabel('Number of dependents')

Comments:
- I would have preferred to use a scatterplot to reveal any trends or possible relationships; however, the line chart still shows that there is no real relationship between increasing age and number of dependants.

In [None]:
# Save the age/dependants line chart as a PNG in the Visualisations folder
line_dependants_png = os.path.join(path, '04_Analysis', 'Visualisations', 'line_dependants.png')
line_dependants.figure.savefig(line_dependants_png)

# 11. Scatterplot Income and Age

In [None]:
# Scatterplot of income (y-axis) against age (x-axis) over the full dataframe
scatteplot_income = sns.scatterplot(data=ords_prods_cust, x='age', y='income')

# Title and axis labels are set on the Axes returned by seaborn
scatteplot_income.set_title('Scatterplot Age and Income')
scatteplot_income.set_xlabel('Customer Age')
scatteplot_income.set_ylabel('Income')

Comments:
- For the income range 0 to 200,000 there is relative uniformity in the distribution of income amongst the age groups.
- When considering incomes of 200,000 to 400,000, we see that individuals with such incomes are concentrated in the age group 40 to 80 years.
- Individuals with income above 400,000 USD are exclusively above 40 years of age.

# 12. Export Dataframe

In [None]:
# Save the income/age scatterplot as a PNG in the Visualisations folder
scatter_income_png = os.path.join(path, '04_Analysis', 'Visualisations', 'scatter_income_age.png')
scatteplot_income.figure.savefig(scatter_income_png)