In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *   

In [0]:
# Load the weekly supplement-sales CSV into a DataFrame.
# NOTE(review): this is an absolute, user-specific workspace path; consider
# moving it to a config cell so the notebook runs for other users.
SALES_CSV_PATH = "/Workspace/Users/mahla.abedi@gmail.com/Supplement_Sales_Data_Analysis/Data/Supplement_Sales_Weekly_Expanded.csv"
df = pd.read_csv(SALES_CSV_PATH)

In [0]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [0]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

In [0]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row.
df.describe(include='all').transpose()

In [0]:
# Report the frame's dimensions as a (rows, columns) tuple.
row_count, column_count = df.shape
print((row_count, column_count))


In [0]:
# Distinct values found in the Platform column.
df.loc[:, 'Platform'].unique()

In [0]:
# Show the distinct values of each categorical column, with a dashed rule
# after every column's output.
cols = ['Product Name', 'Category', 'Location', 'Platform']
separator = '-' * 50
for column_name in cols:
    print(df[column_name].unique(), separator, sep='\n')
  



In [0]:
# Parse the Date column once, then derive year / month / day columns from
# the already-converted values through the .dt accessor.
df['Date'] = pd.to_datetime(df['Date'])
date_accessor = df['Date'].dt
df['Date_year'] = date_accessor.year
df['Date_month'] = date_accessor.month
df['Date_day'] = date_accessor.day



In [0]:
# Total revenue per (category, location, year), ordered by year and then by
# descending revenue within each year.
Categorized = df.groupby(['Category', 'Location', 'Date_year'], as_index=False).agg({'Revenue': 'sum'})
Categorized = Categorized.sort_values(by=['Date_year', 'Revenue'], ascending=[True, False]).reset_index(drop=True)

# Pivot to a categories x locations table. Revenue is summed across years and
# combinations with no rows are filled with 0.
pivot_table = Categorized.pivot_table(index='Category', columns='Location', values='Revenue', aggfunc='sum', fill_value=0)
# Order the categories by their total revenue, highest first. `.loc` returns a
# new frame, so the result has to be assigned back for the chart to use it.
pivot_table = pivot_table.loc[pivot_table.sum(axis=1).sort_values(ascending=False).index]

# Grouped bar chart: one group per category, one bar per location.
fig, ax = plt.subplots(figsize=(12, 6))
locations = pivot_table.columns
Categories = pivot_table.index
n_location = len(locations)
x = np.arange(len(Categories))
# Groups sit one unit apart on the x axis. Keep the 0.2 bar width while a
# whole group fits, and shrink it otherwise so neighbouring groups never overlap.
width = min(0.2, 0.8 / max(n_location, 1))
for i, location in enumerate(locations):
    values = pivot_table[location]
    # Shift each location's bars by one bar width within the group.
    ax.bar(x + i * width, values, width, label=location)

# Axis text does not depend on the location, so it is set once after the loop.
ax.set_xlabel('Categories')
ax.set_ylabel('TotalRevenue')
ax.set_title('Revenue by Category and Location')
# Centre each tick under its group of bars.
ax.set_xticks(x + width * (n_location - 1) / 2)
ax.set_xticklabels(Categories, rotation=90, ha='right')
ax.legend(title='Location')
# Annotate every bar with its revenue, rounded to a whole number.
for container in ax.containers:
    ax.bar_label(container, fmt='%.0f', padding=3, rotation=90)
fig.tight_layout()
plt.show()

In [0]:
# Total revenue per platform, largest first, and each platform's share of the
# overall revenue in percent (two decimals).
platforms_revenue = df.groupby('Platform')['Revenue'].sum().sort_values(ascending=False)
Totalrevenue = platforms_revenue.sum()
platforms_revenuepercentage = (platforms_revenue / Totalrevenue * 100).round(2)

# Draw on the Axes created here instead of going through pyplot's implicit
# "current axes" state.
fig, ax = plt.subplots(figsize=(12, 6))
# The first element returned by pie() is the list of wedge patches; they are
# passed to the legend explicitly below.
wedges = ax.pie(platforms_revenue, labels=platforms_revenue.index, autopct='%1.1f%%', startangle=90)[0]
ax.set_title('Revenue by Platforms')
# Pair every wedge with a self-describing "platform: share%" label rather than
# a bare number matched to the wedges only by position.
legend_labels = [f'{platform}: {share:.2f}%' for platform, share in platforms_revenuepercentage.items()]
ax.legend(wedges, legend_labels, loc='upper right', bbox_to_anchor=(1.3, 1),
          fontsize=10, title='Revenue Percentage')
plt.show()



In [0]:

# Revenue for every (platform, location) pair, expressed as a percentage of
# the grand total across all pairs.
platformsperlocation_revenue = df.groupby(['Platform', 'Location'])['Revenue'].sum()
grand_total = platformsperlocation_revenue.sum()
platformsperlocation_revenuepercentage = (platformsperlocation_revenue / grand_total * 100).round(2)

# Platforms as rows, locations as columns; absent pairs become 0.
percentage_df = platformsperlocation_revenuepercentage.unstack(level='Location').fillna(0)

# Put the platform with the largest combined share first.
platform_order = percentage_df.sum(axis=1).sort_values(ascending=False).index
percentage_df = percentage_df.loc[platform_order]

locations = percentage_df.columns
platforms = percentage_df.index

x = np.arange(len(platforms))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))

# One bar series per location, each shifted right by a bar width within the
# platform's group and labelled with its percentage.
for slot, location in enumerate(locations):
    rects = ax.bar(x + width * slot, percentage_df[location], width, label=location)
    ax.bar_label(rects, padding=3, fmt='%.2f%%')

# Centre the platform names under their groups.
group_centres = x + width * (len(locations) - 1) / 2
ax.set_xticks(group_centres)
ax.set_xticklabels(platforms)

ax.set_ylabel('Revenue Percentage (%)')
ax.set_title('Revenue Percentage by Platform and Location')
ax.legend(title='Location', bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()



In [0]:
# Year-over-year revenue growth in percent. The earliest year has no
# predecessor, so its growth is NaN and is dropped before plotting.
YearlyGrowth = (
    df.groupby('Date_year')['Revenue']
    .sum()
    .pct_change()
    .mul(100)
    .dropna()
    .reset_index()
    .rename(columns={'Date_year': 'Year', 'Revenue': 'YearlyGrowth'})
)

lineplot = YearlyGrowth.plot(
    x='Year',
    y='YearlyGrowth',
    kind='line',
    figsize=(10, 6),
    title='Yearly Growth of Revenue',
    color='red',
)
# Place a tick on each plotted year.
lineplot.set_xticks(YearlyGrowth['Year'])
lineplot.set_xlabel('Year')
lineplot.set_ylabel('Yearly Growth (%)')
plt.show()

In [0]:
# Print total revenue per year, followed by its year-over-year change in
# percent. Note that `yearlygrowth` holds the yearly totals, not the growth.
yearlygrowth = df.groupby('Date_year')['Revenue'].sum()
yearly_change_pct = yearlygrowth.pct_change() * 100
print(yearlygrowth, yearly_change_pct, sep='\n')
