In [None]:
!pip install pandas 
!pip install matplotlib
!pip install spicy
!pip install statsmodels
!pip install seaborn
!pip install pingouin
!pip install pmdarima
!pip install numpy==1.26.4


In [None]:
from pathlib import Path

# Directory the kernel was started from.
current_path = Path.cwd()

# The project root is taken to be one level above the working directory;
# data paths in later cells are built relative to it.
root_directory = current_path.parent
print(root_directory)

In [None]:
import pandas as pd

# Location of the extracted sales CSV, relative to the project root.
csv_path = root_directory / "data" / "01_extracted_data" / "stores_sales_forecasting.csv"

# Pure identifier columns are dropped right after loading.
id_columns = ["Row ID", "Order ID", "Customer ID", "Product ID"]

df = pd.read_csv(csv_path, encoding="ISO-8859-1").drop(columns=id_columns)
print(df.shape)
df.head()

In [None]:
df.describe()
# Author's notes on the columns (observations, not computed in this cell):
# Sales is right-skewed (extreme values in the right tail).
# Candidates to drop: Country (single value), Category (single value), Ship Mode (no impact on sales), Segment (no impact),
# Product Name (the name itself carries nothing that would rank a product higher or set it apart in sales),
# Discount, Profit.

# City, State: do not necessarily reject the null hypothesis, but still affect mean sales.
# Sub-Category and Quantity: reject the null hypothesis (t-test).
# Order Date, delay: strong impact on sales.

In [None]:
# Missing values per column. Printed explicitly: a notebook cell only renders its
# last expression, so this result would otherwise be computed and silently discarded.
print(df.isna().sum())

# Single boolean: does the frame contain any missing value at all?
df.isnull().values.any()

In [None]:
import seaborn as sns

# Pairwise scatter/density grid of the numeric columns, coloured by Sub-Category.
sns.pairplot(data=df, hue="Sub-Category")

In [None]:
import matplotlib.pyplot as plt

# Sales distribution per Sub-Category (histogram + KDE), x-axis clipped to 0-2000.
hist_ax = sns.histplot(data=df, x="Sales", hue="Sub-Category", kde=True)
hist_ax.set_xlim(0, 2000)

In [None]:
import matplotlib.pyplot as plt

# Sales distribution per Ship Mode (histogram + KDE), x-axis clipped to 0-2000.
hist_ax = sns.histplot(data=df, x="Sales", hue="Ship Mode", kde=True)
hist_ax.set_xlim(0, 2000)

In [None]:
import matplotlib.pyplot as plt

# Sales distribution per Segment (histogram + KDE), x-axis clipped to 0-2000.
hist_ax = sns.histplot(data=df, x="Sales", hue="Segment", kde=True)
hist_ax.set_xlim(0, 2000)

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric columns only (pandas default method).
corr_matrix = df.corr(numeric_only=True)

# Heatmap of the correlation matrix.
# annot=True is needed for fmt=".2f" to have any effect: without it seaborn writes
# no values in the cells and the format string is silently ignored.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Matrice de corrélation")
plt.show()

In [None]:
from scipy.stats import pearsonr

# Linear (Pearson) correlation between order value and quantity.
sales, quantity = df["Sales"], df["Quantity"]
corr, pval = pearsonr(sales, quantity)
print(f"Corrélation : {corr:.2f}, p-value : {pval}")

In [None]:
from scipy.stats import spearmanr

# Rank-based (Spearman) correlation between order value and quantity.
sales, quantity = df["Sales"], df["Quantity"]
corr_spearman, pval_spearman = spearmanr(sales, quantity)
print(f"Spearman correlation: {corr_spearman:.2f}, p-value: {pval_spearman:.3e}")

Le test de Pearson indique qu’il y a un lien linéaire modéré.

Le test de Spearman indique qu’il y a un lien monotone modéré, pas parfaitement linéaire.

Ce lien n’est pas parfaitement linéaire, ce qui explique que le scatterplot ne montre pas une droite claire.

Le lien est modéré, pas fort, donc d’autres facteurs influencent aussi les ventes.

### Impact of State

In [None]:
# Number of rows per product name. Printed explicitly: only the last expression of a
# cell is rendered, so this result would otherwise be computed and thrown away.
print(df["Product Name"].value_counts())

# Number of non-null Sales records per State, 20 largest first.
df.groupby("State")["Sales"].count().sort_values(ascending=False).head(20)

#df.groupby("State")["Sales"].agg(['mean', 'count']).sort_values("mean", ascending=False)



#### ANOVA test

In [None]:
from scipy.stats import levene

# One array of Sales values per State.
groups = [sales.values for _, sales in df.groupby("State")["Sales"]]

# Levene's test for equality of variances across the groups.
stat, p = levene(*groups)
print(f"Levene statistic: {stat:.2f}, p-value: {p:.4g}")

# Report the outcome at the 5% significance level.
verdict = (
    "Les variances sont significativement différentes (hétéroscédasticité)."
    if p < 0.05
    else "Les variances sont homogènes (homoscédasticité)."
)
print(verdict)


In [None]:
import pingouin as pg

# Welch's ANOVA of Sales across States.
pg.welch_anova(data=df, dv="Sales", between="State")

### Effect of sub-category

#### ANOVA test

In [None]:
from scipy.stats import levene

# One array of Sales values per Sub-Category.
groups = [sales.values for _, sales in df.groupby("Sub-Category")["Sales"]]

# Levene's test for equality of variances across the groups.
stat, p = levene(*groups)
print(f"Levene statistic: {stat:.2f}, p-value: {p:.4g}")

# Report the outcome at the 5% significance level.
verdict = (
    "Les variances sont significativement différentes (hétéroscédasticité)."
    if p < 0.05
    else "Les variances sont homogènes (homoscédasticité)."
)
print(verdict)

In [None]:
import pingouin as pg

# Welch's ANOVA of Sales across Sub-Categories.
pg.welch_anova(data=df, dv="Sales", between="Sub-Category")

In [None]:
from scipy.stats import f_oneway

# One array of Sales values per Sub-Category.
groups = [sales.values for _, sales in df.groupby("Sub-Category")["Sales"]]

# Classical one-way ANOVA across the Sub-Category groups.
f_stat, p_val = f_oneway(*groups)
print(f"F-statistic: {f_stat:.2f}, p-value: {p_val:.4g}")

# Time Series Analysis

In [None]:
# Parse the order dates in place on df; the monthly grouping in later cells
# needs a datetime column.
df["Order Date"] = pd.to_datetime(df["Order Date"])

# Two-column time series frame (date, sales), ordered chronologically.
df_ts = df.loc[:, ["Order Date", "Sales"]].sort_values(by="Order Date")
df_ts


## Visualization

### By day

In [None]:
import matplotlib.pyplot as plt

# Chronologically ordered Sales, indexed by order date (one point per row of df,
# not aggregated per day).
df_ts = (
    df[["Order Date", "Sales"]]
    .sort_values("Order Date")
    .set_index("Order Date")
)
daily_ax = df_ts.plot(figsize=(20, 10))
daily_ax.set_xlabel("Year")

## Aggregate by month

In [None]:
import matplotlib.pyplot as plt

# Rebuild the chronologically ordered (date, sales) frame from df.
df_ts = df[["Order Date", "Sales"]].sort_values("Order Date")

# Total Sales per calendar month (month-end bins).
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of 'ME'.
monthly_bins = pd.Grouper(key="Order Date", freq="M")
df_ts_month = df_ts.groupby(monthly_bins)["Sales"].sum()
print(df_ts_month)

monthly_ax = df_ts_month.plot(figsize=(20, 10))
monthly_ax.set_xlabel("Year")

### Filter by chairs and bookcases

In [None]:
import matplotlib.pyplot as plt

# Keep only the Chairs and Bookcases rows, ordered chronologically.
is_selected = df["Sub-Category"].isin(["Chairs", "Bookcases"])
df_ts = df[is_selected][["Order Date", "Sales"]].sort_values("Order Date")

# Total Sales per calendar month for that subset.
# NOTE(review): this overwrites df_ts_month, so the trend, autocorrelation and
# auto_arima cells that follow operate on the Chairs/Bookcases subset, not on all
# sales - confirm this is intended.
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of 'ME'.
monthly_bins = pd.Grouper(key="Order Date", freq="M")
df_ts_month = df_ts.groupby(monthly_bins)["Sales"].sum()
print(df_ts_month)

monthly_ax = df_ts_month.plot(figsize=(20, 10))
monthly_ax.set_xlabel("Year")

### Trend check

In [None]:
# 12-month rolling mean of the monthly series, to expose the trend.
rolling_mean = df_ts_month.rolling(window=12).mean()
trend_ax = rolling_mean.plot(figsize=(20, 10))
trend_ax.set_xlabel("Year")
# Author's reading: on average, sales increase every year, independently of
# Christmas or one-off peaks.

### Seasonality check

In [None]:
from pandas.plotting import autocorrelation_plot

# Autocorrelation of the monthly series at every lag, used here to look for seasonality.
autocorrelation_plot(series=df_ts_month)
plt.show()

### PACF and ACF

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
ts = df_ts_month

# plot_acf / plot_pacf open a figure of their own unless an Axes is passed in, so a
# bare plt.figure(figsize=...) before the call is left empty and its size is ignored.
# Creating the Axes explicitly makes the figsize and the title apply to the real plot.

# Autocorrelation (ACF)
fig_acf, ax_acf = plt.subplots(figsize=(12, 5))
plot_acf(ts, lags=40, ax=ax_acf)
ax_acf.set_title("ACF - Autocorrelation Function")
plt.show()

# Partial autocorrelation (PACF)
fig_pacf, ax_pacf = plt.subplots(figsize=(12, 5))
# 'ywm' is a Yule-Walker variant (original author's note: more stable).
plot_pacf(ts, lags=22, method='ywm', ax=ax_pacf)
ax_pacf.set_title("PACF - Partial Autocorrelation Function")
plt.show()

### ARIMA parameters

In [None]:
import pmdarima as pm

# Automatic (seasonal) ARIMA order selection on the monthly series;
# m=12 sets a seasonal period of twelve observations.
model = pm.auto_arima(
    df_ts_month,
    seasonal=True,
    m=12,
    trace=False,
)
fit_summary = model.summary()
print(fit_summary)