# 4.10 Coding Etiquette & Excel Reporting

### This script contains the following points:
#### 1. Import libraries and dataframe
#### 2. Create an appropriate visualization to show the distribution of profiles
#### 3. Aggregate the max, mean, and min variables on a customer-profile age group
#### 4. Aggregate the max, mean, and min variables on a customer-profile dependants_status
#### 5. Compare customer profiles with regions and departments
#### 6. Compare behaviors across regions
#### 7. Needed charts for project results

#### 1. Import libraries and dataframe

In [38]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [39]:
# Project root folder; the data and visualization paths below are built from it
path = r'C:\Users\olilo\OneDrive\Dokumente\CF\Data Analytics Immersion_Week 8-12\2023-08-28 Instacart Basket Analysis'

# Load the customer-profiling sample from the 'Prepared Data' folder
cust_prof_file = os.path.join(path, '02 Data', 'Prepared Data', 'cust_prof_dep_groups.pkl')
sample_cust_profiling = pd.read_pickle(cust_prof_file)

In [40]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [41]:
# Output check: number of rows and columns of the loaded sample
sample_cust_profiling.shape

(9721098, 17)

In [42]:
# Preview the first 20 rows of the sample
sample_cust_profiling.head(20)

Unnamed: 0,department_id,department,order_id,user_id,order_number,prices,orders_day_of_week,order_hour_of_day,age_group,income_group,dependants_number,fam_status,gender,region,dependants_status,order_hour_of_day_group,day
0,1,frozen,86918,19,9,10.6,5,16,old adults,high income,1,married,Female,3 South,married female without dependents,afternoon,Thursday
1,1,frozen,1321974,19,1,5.2,6,13,old adults,high income,1,married,Female,3 South,married female without dependents,afternoon,Friday
2,1,frozen,2293453,19,2,5.2,5,14,old adults,high income,1,married,Female,3 South,married female without dependents,afternoon,Thursday
3,1,frozen,2293453,19,2,1.1,5,14,old adults,high income,1,married,Female,3 South,married female without dependents,afternoon,Thursday
4,1,frozen,2208892,19,3,1.1,0,14,old adults,high income,1,married,Female,3 South,married female without dependents,afternoon,Saturday
5,1,frozen,1973799,19,5,1.1,6,12,old adults,high income,1,married,Female,3 South,married female without dependents,,Friday
6,1,frozen,532817,19,7,1.1,4,17,old adults,high income,1,married,Female,3 South,married female without dependents,afternoon,Wednesday
7,1,frozen,2808909,19,4,12.8,5,12,old adults,high income,1,married,Female,3 South,married female without dependents,,Thursday
8,1,frozen,532817,19,7,12.8,4,17,old adults,high income,1,married,Female,3 South,married female without dependents,afternoon,Wednesday
9,1,frozen,1321974,19,1,4.2,6,13,old adults,high income,1,married,Female,3 South,married female without dependents,afternoon,Friday


#### 2. Create an appropriate visualization to show the distribution of profiles

In [None]:
# Seven-colour palette (red through teal) shared by the charts below
color = [
    "orangered",
    "darkorange",
    "gold",
    "yellowgreen",
    "limegreen",
    "mediumseagreen",
    "teal",
]

In [None]:
# Chart of dependants_status by day
# Count the distinct orders for every (day, dependants_status) combination
day_dep = sample_cust_profiling.groupby(['day', 'dependants_status'])['order_id'].nunique().reset_index()
# Reshape to wide form: one row per day, one column per dependants_status
day_dep_id = day_dep.pivot(index='day', columns='dependants_status', values='order_id')

# Grouped bar chart: one group of bars per day, one bar per dependants_status
day_dep_id_bar = day_dep_id.plot.bar(color=color, figsize=(18, 6))

plt.xlabel("Day", fontsize=12)
# The bar height is the number of distinct orders; the statuses are identified by the legend
plt.ylabel("Number of orders", fontsize=12)
plt.title("Day-based orders and dependents status", fontsize=12)
plt.xticks(rotation=0)  # Keep the day names horizontal

# Place the legend outside the axes, to the right of the plot
plt.legend(title="Dependants status", loc='upper left', bbox_to_anchor=(1, 1))

# Adjust the layout to prevent the bottom part from getting cut off
plt.tight_layout()

In [None]:
# Save the day / dependants_status bar chart into the project's Visualizations folder
visualizations_dir = os.path.join(path, '04 Analysis', 'Visualizations')
output_path = os.path.join(visualizations_dir, 'day_dep_id_bar.png')
day_dep_id_bar.figure.savefig(output_path)

In [None]:
# Scatterplot of weekday against time-of-day group, coloured by order_id with the 'Reds' palette
# NOTE(review): both axes are categorical, so points overlap on a small grid and the
# order_id hue carries little meaning - a count heatmap may answer the question better.
scatter_hour_day = sns.scatterplot(
    data=sample_cust_profiling,
    x='order_hour_of_day_group',
    y='day',
    hue='order_id',
    palette='Reds',
)

plt.title("What Days and Times are Orders Most Often Shipped?", fontsize=12)
plt.xlabel("Order Hour of Day", fontsize=12)
plt.ylabel("Day", fontsize=12)
plt.xticks(rotation=0)  # Keep the x-axis labels horizontal

# Anchor the legend outside the axes, to the right of the plot
plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Save the scatterplot into the project's Visualizations folder
scatter_file = os.path.join(path, '04 Analysis', 'Visualizations', 'scatter_day_hour.png')
scatter_hour_day.figure.savefig(scatter_file)

#### 3. Aggregate the max, mean, and min variables on a customer-profile age group

In [None]:
# Mean, median, min and max of 'department_id' for each 'age_group'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('age_group').agg({'department_id': summary_stats})

In [None]:
# Mean, median, min and max of 'prices' for each 'age_group'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('age_group').agg({'prices': summary_stats})

In [None]:
# Mean, median, min and max of 'order_hour_of_day' for each 'age_group'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('age_group').agg({'order_hour_of_day': summary_stats})

In [None]:
# Mean, median, min and max of 'orders_day_of_week' for each 'age_group'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('age_group').agg({'orders_day_of_week': summary_stats})

In [None]:
# Mean, median, min and max of 'order_number' for each 'age_group'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('age_group').agg({'order_number': summary_stats})

#### 4. Aggregate the max, mean, and min variables on a customer-profile dependants_status

In [None]:
# Mean, median, min and max of 'department_id' for each 'dependants_status'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('dependants_status').agg({'department_id': summary_stats})

In [None]:
# Mean, median, min and max of 'prices' for each 'dependants_status'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('dependants_status').agg({'prices': summary_stats})

In [None]:
# Mean, median, min and max of 'order_hour_of_day' for each 'dependants_status'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('dependants_status').agg({'order_hour_of_day': summary_stats})

In [None]:
# Mean, median, min and max of 'orders_day_of_week' for each 'dependants_status'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('dependants_status').agg({'orders_day_of_week': summary_stats})

In [None]:
# Mean, median, min and max of 'order_number' for each 'dependants_status'
summary_stats = ['mean', 'median', 'min', 'max']
sample_cust_profiling.groupby('dependants_status').agg({'order_number': summary_stats})

#### 5. Compare customer profiles with regions and departments

In [None]:
# Bar chart of the number of rows per region
# NOTE(review): value_counts() counts DataFrame rows, and the sample holds several rows
# per user_id, so these bars are row counts rather than distinct customers - confirm
# that the "Customer Counts" title is intended.
region_counts = sample_cust_profiling['region'].value_counts()
region_colors = ["yellowgreen", "limegreen", "mediumseagreen", "teal"]
bar_regio_count = region_counts.plot.bar(color=region_colors, figsize=(12, 6))

plt.title("Customer Counts by Region", fontsize=12)
plt.xlabel("Region", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks(rotation=0)  # Keep the region names horizontal

# Tighten the layout so the bottom labels are not cut off
plt.tight_layout()

# Render the chart
plt.show()

In [None]:
# Save the region bar chart into the project's Visualizations folder
bar_regio_count_file = os.path.join(path, '04 Analysis', 'Visualizations', 'bar_regio_count.png')
bar_regio_count.figure.savefig(bar_regio_count_file)

In [None]:
# Count rows per (dependants_status, region) and spread the regions into columns
cust_dep_status_region = sample_cust_profiling.groupby(['dependants_status', 'region']).size().unstack()

# Order the statuses by their total across all regions (ascending)
cust_dep_status_region = cust_dep_status_region.loc[cust_dep_status_region.sum(axis=1).sort_values(ascending=True).index]

# Stacked horizontal bar chart: one bar per dependants_status, one colour segment per region
cust_dep_status_region_bar = cust_dep_status_region.plot.barh(stacked = True, color = ["yellowgreen", "limegreen", "mediumseagreen", "teal"], figsize=(12, 6))

# On a horizontal bar chart the x-axis carries the counts; regions are the stacked segments
plt.xlabel("Count", fontsize=12)
plt.ylabel("Dependants status", fontsize=12)
plt.title("Customer status by region", fontsize=12)

# Adjust the layout to prevent the bottom part from getting cut off
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Save the dependants_status / region chart into the project's Visualizations folder
status_region_file = os.path.join(path, '04 Analysis', 'Visualizations', 'cust_dep_status_region_bar.png')
cust_dep_status_region_bar.figure.savefig(status_region_file)

In [None]:
# Crosstab of dependants_status (rows) against region (columns)
cust_dep_status_region = pd.crosstab(
    index=sample_cust_profiling['dependants_status'],
    columns=sample_cust_profiling['region'],
    dropna=False,
)
cust_dep_status_region

#### 6. Compare behaviors across regions

In [None]:
# Crosstab of age_group (rows) against region (columns)
age_group_region = pd.crosstab(
    index=sample_cust_profiling['age_group'],
    columns=sample_cust_profiling['region'],
    dropna=False,
)
# Output check
age_group_region

In [None]:
# Stacked horizontal bar chart of the age_group / region crosstab:
# one bar per age group, one colour segment per region.
# (The chart must be drawn from age_group_region, not from the dependants_status crosstab.)
age_group_region_bar = age_group_region.plot.barh(stacked = True, color = ["yellowgreen", "limegreen", "mediumseagreen", "teal"], figsize=(12, 6))

# On a horizontal bar chart the x-axis carries the counts; regions are named in the legend
plt.xlabel("Count", fontsize=12)
plt.ylabel("Age groups", fontsize=12)
plt.title("Age group by region", fontsize=12)
plt.legend(title="Region", loc='upper left', bbox_to_anchor=(1, 1))

# Adjust the layout to prevent the bottom part from getting cut off
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Save the age group / region chart into the project's Visualizations folder
age_region_file = os.path.join(path, '04 Analysis', 'Visualizations', 'age_group_region_bar.png')
age_group_region_bar.figure.savefig(age_region_file)

In [None]:
# Crosstab of income_group (rows) against region (columns)
income_region = pd.crosstab(
    index=sample_cust_profiling['income_group'],
    columns=sample_cust_profiling['region'],
    dropna=False,
)
# Output check
income_region

In [None]:
# Grouped bar chart: one group per income group, one bar per region
income_region_bar = income_region.sort_values(by=['income_group'], ascending=True).plot.bar(
    color=["orangered", "darkorange", "gold", "yellowgreen"], figsize=(12, 6))

plt.title("Income by Region", fontsize=12)
plt.xlabel("Income Group", fontsize=12)
plt.ylabel("Count", fontsize=12)

# A single legend call: a second plt.legend() replaces the first, so only the
# outside-the-axes placement ever took effect
plt.legend(title="Region", loc='upper left', bbox_to_anchor=(1, 1))

# Adjust the layout so the legend outside the axes is not cut off
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Save the income / region chart into the project's Visualizations folder
income_region_file = os.path.join(path, '04 Analysis', 'Visualizations', 'income_region_bar.png')
income_region_bar.figure.savefig(income_region_file)

#### 7. Needed charts for project results

In [None]:
# Crosstab of department (rows) against age_group (columns)
dep_id_age_group = pd.crosstab(
    index=sample_cust_profiling['department'],
    columns=sample_cust_profiling['age_group'],
    dropna=False,
)
# Output check
dep_id_age_group

In [None]:
# Stacked horizontal bar chart: one bar per department, one colour segment per age group
dep_id_age_group_bar = dep_id_age_group.plot.barh(stacked = True, color = color, figsize=(12, 6))

# On a horizontal bar chart the x-axis carries the counts; age groups are named in the legend
plt.xlabel("Count", fontsize=12)
plt.ylabel("Department", fontsize=12)
plt.title("Product department sales in a given age group", fontsize=12)
plt.legend(title="Age group", loc='upper left', bbox_to_anchor=(1, 1))

# Adjust the layout to prevent the bottom part from getting cut off
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Save the department / age group chart into the project's Visualizations folder
dep_age_file = os.path.join(path, '04 Analysis', 'Visualizations', 'dep_id_age_group_bar.png')
dep_id_age_group_bar.figure.savefig(dep_age_file)

In [None]:
# Crosstab of department (rows) against dependants_status (columns)
depart_dep_status = pd.crosstab(
    index=sample_cust_profiling['department'],
    columns=sample_cust_profiling['dependants_status'],
    dropna=False,
)
# Output check
depart_dep_status

In [None]:
# Stacked horizontal bar chart: one bar per department, one colour segment per dependants_status
depart_dep_status_bar = depart_dep_status.plot.barh(stacked = True, color = color, figsize=(12, 6))

# On a horizontal bar chart the x-axis carries the counts; statuses are named in the legend
plt.xlabel("Count", fontsize=12)
plt.ylabel("Department", fontsize=12)
plt.title("Product department sales depending on the number of residents", fontsize=12)
plt.legend(title="Dependants status", loc='upper left', bbox_to_anchor=(1, 1))

# Adjust the layout to prevent the bottom part from getting cut off
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Save the department / dependants_status chart into the project's Visualizations folder
depart_status_file = os.path.join(path, '04 Analysis', 'Visualizations', 'depart_dep_status_bar.png')
depart_dep_status_bar.figure.savefig(depart_status_file)

In [None]:
# Department vs. dependants_status by gender: start with the female subset
# Keep only the rows whose 'gender' is 'Female'
is_female = sample_cust_profiling['gender'] == 'Female'
female_data = sample_cust_profiling[is_female]
# Output check
female_data

In [None]:
# Crosstab of department (rows) against dependants_status (columns), female rows only
depart_dep_status_female = pd.crosstab(
    index=female_data['department'],
    columns=female_data['dependants_status'],
)
# Output check
depart_dep_status_female

In [None]:
# Stacked horizontal bar chart for the female subset:
# one bar per department, one colour segment per dependants_status
depart_dep_status_female_bar = depart_dep_status_female.plot.barh(stacked = True, color = color, figsize=(12, 6))

# On a horizontal bar chart the x-axis carries the counts; statuses are named in the legend
plt.xlabel("Count", fontsize=12)
plt.ylabel("Department", fontsize=12)
plt.title("Product department sales depending on dependent status by female", fontsize=12)
plt.legend(title="Dependants status (female)", loc='upper left', bbox_to_anchor=(1, 1))

# Adjust the layout to prevent the bottom part from getting cut off
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Save the female department / dependants_status chart into the Visualizations folder
female_chart_file = os.path.join(path, '04 Analysis', 'Visualizations', 'depart_dep_status_female_bar.png')
depart_dep_status_female_bar.figure.savefig(female_chart_file)

In [None]:
# Department vs. dependants_status by gender: now the male subset
# Keep only the rows whose 'gender' is 'Male'
is_male = sample_cust_profiling['gender'] == 'Male'
male_data = sample_cust_profiling[is_male]
# Output check
male_data

In [None]:
# Crosstab of department (rows) against dependants_status (columns), male rows only
depart_dep_status_male = pd.crosstab(
    index=male_data['department'],
    columns=male_data['dependants_status'],
)
# Output check
depart_dep_status_male

In [None]:
# Stacked horizontal bar chart for the male subset:
# one bar per department, one colour segment per dependants_status
depart_dep_status_male_bar = depart_dep_status_male.plot.barh(stacked = True, color = color, figsize=(12, 6))

# On a horizontal bar chart the x-axis carries the counts; statuses are named in the legend
plt.xlabel("Count", fontsize=12)
plt.ylabel("Department", fontsize=12)
plt.title("Product department sales depending on dependent status by male", fontsize=12)
plt.legend(title="Dependants status (male)", loc='upper left', bbox_to_anchor=(1, 1))

# Adjust the layout to prevent the bottom part from getting cut off
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Save the male department / dependants_status chart into the Visualizations folder
male_chart_file = os.path.join(path, '04 Analysis', 'Visualizations', 'depart_dep_status_male_bar.png')
depart_dep_status_male_bar.figure.savefig(male_chart_file)

In [None]:
# Crosstab of department (rows) against region (columns)
depart_region = pd.crosstab(
    index=sample_cust_profiling['department'],
    columns=sample_cust_profiling['region'],
    dropna=False,
)
# Output check
depart_region

In [None]:
# Grouped bar chart: one group per department, one bar per region
depart_region_bar = depart_region.sort_values(by=['department'], ascending=True).plot.bar(
    color=["orangered", "darkorange", "gold", "yellowgreen"], figsize=(12, 6))

plt.title("The most frequently ordered types of products departments in the regions", fontsize=12)
plt.xlabel("Department", fontsize=12)
# The bar height is a count; the regions are named in the legend
plt.ylabel("Count", fontsize=12)

# A single legend call: a second plt.legend() replaces the first, so only the
# outside-the-axes placement ever took effect
plt.legend(title="Region", loc='upper left', bbox_to_anchor=(1, 1))

# Adjust the layout so the rotated department names and the outside legend are not cut off
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Save the department / region chart into the project's Visualizations folder
depart_region_file = os.path.join(path, '04 Analysis', 'Visualizations', 'depart_region_bar.png')
depart_region_bar.figure.savefig(depart_region_file)

In [None]:
# Find the department with the most rows: count rows per department, smallest first,
# so the largest bar ends up at the top of the horizontal chart
department_counts = sample_cust_profiling['department'].value_counts().sort_values(ascending=True)
depart_order_bar = department_counts.plot.barh(color=color, figsize=(12, 6))

plt.xlabel("Orders count", fontsize=12)
plt.ylabel("Department", fontsize=12)
plt.title("Frequency of orders in all departments", fontsize=12)

# Render the chart
plt.show()

In [None]:
# Save the department frequency chart into the project's Visualizations folder
depart_order_file = os.path.join(path, '04 Analysis', 'Visualizations', 'depart_order_bar.png')
depart_order_bar.figure.savefig(depart_order_file)

#### Import dataframe

In [43]:
# Project root folder (same location as used for the first dataframe)
path = r'C:\Users\olilo\OneDrive\Dokumente\CF\Data Analytics Immersion_Week 8-12\2023-08-28 Instacart Basket Analysis'

# Load the orders / products / customers / region dataframe from the 'Prepared Data' folder
region_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust_region.pkl')
ords_prods_cust_region = pd.read_pickle(region_file)

In [44]:
# Output check: preview the first 20 rows of the loaded dataframe
ords_prods_cust_region.head(20)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,price_range_loc.1,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_price,Spending_flag,median_days_order,frequency_flag,name,surname,gender,state,age,date_joined,dependants_number,fam_status,income,region,activity_flag
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,9.0,both,,Mid-range product,Regularly busy,Regularly busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
5,3367565,1,prior,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,,Mid-range product,Regularly busy,Regularly busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
6,550135,1,prior,7,1,9,20.0,196,1,1,Soda,77,7,9.0,both,,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
7,3108588,1,prior,8,1,14,14.0,196,2,1,Soda,77,7,9.0,both,,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
8,2295261,1,prior,9,1,16,0.0,196,4,1,Soda,77,7,9.0,both,,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity
9,2550362,1,prior,10,4,8,30.0,196,1,1,Soda,77,7,9.0,both,,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,3 South,high activity


In [45]:
# Number of rows and columns of the full dataframe
ords_prods_cust_region.shape

(32404859, 37)

In [46]:
# Build a boolean mask that splits the dataframe roughly 70/30:
# one uniform random number per row, True where it is at most 0.7.
# The fixed seed makes the split reproducible.
np.random.seed(4)
row_count = len(ords_prods_cust_region)
dev = np.random.rand(row_count) <= 0.7

In [47]:
# Rows where the mask is True: about 70% of the dataframe
big_data = ords_prods_cust_region.loc[dev]
# Remaining rows (mask is False): about 30% of the dataframe
small_data = ords_prods_cust_region.loc[~dev]

In [48]:
# Check the results: row count of the full dataframe
ords_prods_cust_region.shape[0]

32404859

In [49]:
# Check the results: the two parts together must have as many rows as the full dataframe
big_data.shape[0] + small_data.shape[0]

32404859

In [50]:
# Number of rows and columns of the 30% sample
small_data.shape

(9721098, 37)

In [51]:
# Reduce the 30% sample to the columns needed for the merge on 'order_id'
# NOTE(review): 'price_range_loc' is empty in the previewed rows while 'price_range_loc.1'
# holds the labels - confirm which of the two columns is wanted here.
small_1_columns = ['loyalty_flag', 'price_range_loc', 'order_id']
small_1 = small_data[small_1_columns]

In [52]:
# Reduce the 30% sample to the product name plus the 'order_id' merge key
small_2_columns = ['product_name', 'order_id']
small_2 = small_data[small_2_columns]

# Merge the two DataFrames on the `order_id` column
# `small` was never defined; the reduced sample created above is `small_1`.
# `order_id` repeats in sample_cust_profiling (several rows per order) and presumably
# in small_1 as well, so the merge multiplies rows for every repeated key. Dropping exact
# duplicate rows from the right-hand side first keeps that growth as small as possible.
# NOTE(review): if only per-order attributes are needed, reduce small_1 to one row per
# order_id before merging so the row count of sample_cust_profiling is preserved.
small_1_unique = small_1.drop_duplicates()
sample_cust_profiling_m = sample_cust_profiling.merge(small_1_unique, on='order_id', how='left')

I could not merge these two samples into one DataFrame because the computer ran out of memory. A screenshot of the error message is located in the test files folder.