In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from imblearn.under_sampling import NearMiss
from scipy.stats import wilcoxon

In [None]:
# Load the raw Online Retail transactions from the CSV export.
csv_path = '../content/Online-Retail.csv'
df = pd.read_csv(csv_path)

In [None]:
# Display the frame as loaded, before any cleaning.
df

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Number of missing values in each column (isnull is an alias of isna).
df.isnull().sum()

In [None]:
  # Missing values per column, as an absolute count and as a share of all rows.
  missing_count = df.isnull().sum()
  missing_percentage = missing_count / len(df) * 100
  missing_data = pd.DataFrame({
      'Missing Count': missing_count,
      'Missing Percentage': missing_percentage,
  })

  plt.figure(figsize=(10, 6))
  sns.barplot(x=missing_data.index, y=missing_data['Missing Count'], color='#7EC0EE')

  # Write each column's percentage just above the top of its bar.
  bar_values = zip(missing_data['Missing Count'], missing_data['Missing Percentage'])
  for bar_pos, (count, pct) in enumerate(bar_values):
      plt.text(bar_pos, count + 1, f'{pct:.2f}%', ha='center', va='bottom', fontsize=10)

  plt.title('Missing Values per Feature')
  plt.xlabel('Columns')
  plt.ylabel('Count of Missing Values')
  plt.xticks(rotation=90)
  plt.show()

In [None]:
# Column dtypes and non-null counts before any type conversion.
df.info()

In [None]:
# UnitPrice is read as text with a decimal comma; swap it for a dot and cast
# to float. Casting to str first keeps this cell re-runnable: once the column
# is already float the `.str` accessor would otherwise raise.
df['UnitPrice'] = (
    df['UnitPrice']
    .astype(str)
    .str.replace(',', '.', regex=False)  # literal replacement, not a regex
    .astype(float)
)

In [None]:
# Re-inspect dtypes now that UnitPrice has been cast to float.
df.info()

In [None]:
# Summary statistics of UnitPrice before negative prices are replaced.
df['UnitPrice'].describe()

In [None]:
# Boxplot of UnitPrice before negative prices are replaced.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, y='UnitPrice', ax=ax)
ax.set_title("First Boxplot of the UnitPrice column")
plt.show()

In [None]:
# Replace negative prices with the median of the non-negative prices.
non_negative_prices = df.loc[df['UnitPrice'] >= 0, 'UnitPrice']
median = non_negative_prices.median()
df['UnitPrice'] = df['UnitPrice'].mask(df['UnitPrice'] < 0, median)

In [None]:
# Boxplot of UnitPrice after negative prices were replaced by the median.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, y='UnitPrice', ax=ax)
ax.set_title("Second Boxplot of the UnitPrice column")
plt.show()

In [None]:
# Summary statistics of UnitPrice after the negative values were replaced.
df['UnitPrice'].describe()

In [None]:
# Quartiles of UnitPrice and the 1.5 * IQR outlier limits derived from them.
Q1_UP, Q2_UP, Q3_UP = df['UnitPrice'].quantile([0.25, 0.5, 0.75])

iqr_UP = Q3_UP - Q1_UP
lower_limit_UP = Q1_UP - 1.5 * iqr_UP
upper_limit_UP = Q3_UP + 1.5 * iqr_UP

print("Feature: UnitPrice")
for label, value in (
    ("Q1", Q1_UP),
    ("Q2 (Mediana)", Q2_UP),
    ("Q3", Q3_UP),
    ("Limite Inferior", lower_limit_UP),
    ("Limite Superior", upper_limit_UP),
):
    print(f"- {label}: {value:.2f}")

In [None]:
# Central tendency and dispersion of UnitPrice.
unit_price = df['UnitPrice']

mean_UP, median_UP = unit_price.mean(), unit_price.median()
mode_UP = unit_price.mode()[0]  # first mode when several values tie
var_UP, std_UP = unit_price.var(), unit_price.std()

print("Feature: UnitPrice")
for label, value in (
    ("mean", mean_UP),
    ("median", median_UP),
    ("mode", mode_UP),
    ("var", var_UP),
    ("std", std_UP),
):
    print(f"- {label}: {value:.6f}")

In [None]:
# Summary statistics of Quantity before non-positive values are replaced.
df['Quantity'].describe()

In [None]:
# Boxplot of Quantity before non-positive values are replaced.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, y='Quantity', ax=ax)
ax.set_title("First Boxplot of the Quantity column")
plt.show()

In [None]:
# Replace zero or negative quantities with the median of the positive ones.
positive_quantities = df.loc[df['Quantity'] > 0, 'Quantity']
median = positive_quantities.median()


def _replace_non_positive(quantity):
    """Return the positive-quantity median for values <= 0, else the value itself."""
    return median if quantity <= 0 else quantity


df['Quantity'] = df['Quantity'].apply(_replace_non_positive)

In [None]:
# Boxplot of Quantity after non-positive values were replaced by the median.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, y='Quantity', ax=ax)
ax.set_title("Second Boxplot of the Quantity column")
plt.show()

In [None]:
# Summary statistics of Quantity after the non-positive values were replaced.
df['Quantity'].describe()

In [None]:
# Quartiles of Quantity and the 1.5 * IQR outlier limits derived from them.
Q1_Q, Q2_Q, Q3_Q = df['Quantity'].quantile([0.25, 0.5, 0.75])

iqr_Q = Q3_Q - Q1_Q
lower_limit_Q = Q1_Q - 1.5 * iqr_Q
upper_limit_Q = Q3_Q + 1.5 * iqr_Q

print("Feature: Quantity")
for label, value in (
    ("Q1", Q1_Q),
    ("Q2 (Mediana)", Q2_Q),
    ("Q3", Q3_Q),
    ("Limite Inferior", lower_limit_Q),
    ("Limite Superior", upper_limit_Q),
):
    print(f"- {label}: {value:.2f}")

In [None]:
# Central tendency and dispersion of Quantity.
quantity = df['Quantity']

mean_Q, median_Q = quantity.mean(), quantity.median()
mode_Q = quantity.mode()[0]  # first mode when several values tie
var_Q, std_Q = quantity.var(), quantity.std()

print("Feature: Quantity")
for label, value in (
    ("mean", mean_Q),
    ("median", median_Q),
    ("mode", mode_Q),
    ("var", var_Q),
    ("std", std_Q),
):
    print(f"- {label}: {value:.6f}")

In [None]:
# Quantity against UnitPrice on linear axes.
fig, ax = plt.subplots()
ax.scatter(df['Quantity'], df['UnitPrice'], color='skyblue', edgecolor='black', alpha=0.5)
ax.set_title('Scatter Plot of Quantity vs UnitPrice')
ax.set_xlabel('Quantity')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Same scatter with both axes on a log scale.
fig, ax = plt.subplots()
ax.scatter(df['Quantity'], df['UnitPrice'], color='skyblue', edgecolor='black', alpha=0.5)
ax.set_title('Scatter Plot of Quantity vs UnitPrice')
ax.set_xlabel('Quantity')
ax.set_ylabel('Price')
ax.set_xscale('log')
ax.set_yscale('log')
plt.show()

In [None]:
# Summary of the Country column.
df['Country'].describe()

In [None]:
# Ten most frequent countries as a horizontal bar chart.
top_countries = df['Country'].value_counts().head(10)
top_countries.plot(kind='barh', color='skyblue', edgecolor='black')
plt.title('Bar Plot of Country')
# With kind='barh' the counts run along x and the countries along y.
plt.xlabel('Frequency')
plt.ylabel('Country')
plt.show()

In [None]:
# Summary of InvoiceDate before it is parsed to datetime.
df['InvoiceDate'].describe()

In [None]:
# Parse the timestamp text (day/month/year hour:minute) once, derive string
# calendar features from it, then drop the source column.
invoice_ts = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%Y %H:%M')

df['Month'] = invoice_ts.dt.strftime('%m')  # zero-padded month number
df['Week'] = invoice_ts.dt.strftime('%A')   # full weekday name, despite the column name
df['Time'] = invoice_ts.dt.strftime('%H')   # zero-padded hour, 24h clock

df = df.drop(columns='InvoiceDate')

In [None]:
# Revenue per weekday, computed without a temporary TotalPrice column on df.
line_revenue = df['Quantity'] * df['UnitPrice']
daywise = (
    line_revenue
    .groupby(df['Week'])
    .sum()
    .rename('TotalPrice')
    .reset_index()
)

# Order the weekdays Sunday to Saturday instead of alphabetically.
weekday_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
daywise['Week'] = pd.Categorical(daywise['Week'], categories=weekday_order, ordered=True)
daywise = daywise.sort_values('Week')

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(daywise['Week'], daywise['TotalPrice'], marker='o', color='skyblue', linewidth=1.2, markersize=6)
ax.set_title('Total Sales by Day')
ax.set_xlabel('Days of the week')
ax.set_ylabel('Total Sales')
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Orders per hour of the day.
# `Time` holds hour strings at this point; plotting it directly makes seaborn
# treat it as a categorical axis (ordered by first appearance, `bins` ignored).
# Converting to integers gives a chronological axis, and discrete=True draws
# one bar per hour centred on its value.
sns.set_style("whitegrid")
order_hours = df['Time'].astype(int)

plt.figure(figsize=(10, 6))
sns.histplot(x=order_hours, discrete=True, color='lightcoral', kde=False)
plt.xlabel('Hours throughout the day', fontsize=12)
plt.ylabel('Count of Orders', fontsize=12)
plt.title('Most customers buy products between 12:00 to 15:00', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True)
plt.show()

In [None]:
# Collapse the twelve months into six two-month periods.
labels = [
    'January-February', 'March-April', 'May-June',
    'July-August', 'September-October', 'November-December',
]

# Right-closed bins (0, 2], (2, 4], ..., (10, 12] over the month number.
month_edges = list(range(0, 13, 2))
month_numbers = df['Month'].astype(int)
df['Month'] = pd.cut(month_numbers, bins=month_edges, labels=labels, right=True)

In [None]:
# Share of rows falling in each two-month period.
month_counts = df['Month'].value_counts()
month_counts.plot.pie(autopct='%1.0f%%', title='Months')

In [None]:
# Group the hour of day into four six-hour shifts.
# Left-closed bins: [0, 6), [6, 12), [12, 18), [18, 24).
labels = ['Midnight', 'Morning', 'Afternoon', 'Night']
shift_edges = list(range(0, 25, 6))
hour_of_day = df['Time'].astype(int)
df['Time'] = pd.cut(hour_of_day, bins=shift_edges, labels=labels, right=False)

In [None]:
# Share of rows falling in each shift.
shift_counts = df['Time'].value_counts()
shift_counts.plot.pie(autopct='%1.0f%%', title='Shifts')

In [None]:
# Pairwise relationships between the two numeric columns, on a 1000-row
# sample. The sample is seeded so the figure is the same on every run.
df_numeric = df[['Quantity', 'UnitPrice']]
sns.pairplot(df_numeric.sample(1000, random_state=42))
plt.show()

In [None]:
def categorize_price(price):
    """Classify a unit price against the UnitPrice quartiles.

    Reads the module-level Q1_UP and Q3_UP values.

    Returns "Low" when price <= Q1_UP, "High" when price >= Q3_UP, and
    "Average" otherwise. The "Low" test runs first, so it wins if the two
    quartiles coincide.
    """
    if price <= Q1_UP:
        return "Low"
    elif price >= Q3_UP:
        return "High"
    else:
        return "Average"

# Label every row as Low / Average / High according to its unit price.
df["PriceCategory"] = df["UnitPrice"].map(categorize_price)

In [None]:
# Class balance of the price categories before resampling.
category_counts = df['PriceCategory'].value_counts()
category_counts.plot.pie(autopct='%1.0f%%', title='PriceCategory')

In [None]:
# One-hot encode the categorical features; the original columns are replaced
# by their indicator columns.
categorical_columns = ['Country', 'Month', 'Week', 'Time']
df = pd.get_dummies(data=df, columns=categorical_columns)
df

In [None]:
# Features for the resampler: everything except the class label and the
# identifier / free-text columns.
non_feature_columns = ['PriceCategory', 'InvoiceNo', 'CustomerID', 'Description', 'StockCode']
x_temp = df.drop(columns=non_feature_columns)

In [None]:
# Class label used only to balance the data in the resampling step.
y_temp = df['PriceCategory']

In [None]:
# Under-sample every price category to the same number of rows.
samples_per_class = 5000
target_counts = {category: samples_per_class for category in ('Low', 'Average', 'High')}

nm = NearMiss(sampling_strategy=target_counts)
x_temp_res, y_temp_res = nm.fit_resample(x_temp, y_temp)

In [None]:
# Class balance of the price categories after resampling.
resampled_counts = y_temp_res.value_counts()
resampled_counts.plot.pie(autopct='%1.0f%%', title='PriceCategory')

In [None]:
# Rebuild the working frame from the resampled features only; the class
# labels (y_temp_res) are not carried over.
# NOTE(review): x_temp_res may already be a DataFrame here — confirm whether
# this wrapper is still needed.
df = pd.DataFrame(x_temp_res, columns=x_temp.columns)

In [None]:
# (rows, columns) of the resampled frame.
df.shape

In [None]:
# Regression inputs: every resampled column except the target.
x = df.drop(columns='UnitPrice')
x

In [None]:
# Convert the feature frame to a plain NumPy array for positional indexing.
x = x.to_numpy()

In [None]:
# Regression target: the unit price of each resampled row.
y = df['UnitPrice']
y

In [None]:
# Hyper-parameter grid for the k-nearest-neighbours regressor.
# NOTE(review): with the regressor's default p=2, 'minkowski' should coincide
# with 'euclidean' — confirm whether the duplicate is intended.
param_grid_KNN = dict(
    n_neighbors=[3, 5, 7, 9, 11, 15],
    metric=['euclidean', 'manhattan', 'minkowski', 'cosine'],
)

In [None]:
# Hyper-parameter grid for the decision tree: max_depth 10, 20, ..., 100.
param_grid_DT = {'max_depth': list(range(10, 101, 10))}

In [None]:
# Hyper-parameter grid for the random forest:
# n_estimators 100..500 in steps of 100, max_depth 10..50 in steps of 10.
param_grid_RF = dict(
    n_estimators=list(range(100, 501, 100)),
    max_depth=list(range(10, 51, 10)),
)

In [None]:
# 10-fold outer cross-validation; shuffling is seeded so the folds repeat across runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Min-max scaler instance shared by the evaluation loop, where it is re-fitted per fold.
S = MinMaxScaler()

In [None]:
# One independent list of per-fold test MSEs for each model family.
test_scores_KNN, test_scores_DT, test_scores_RF, test_scores_LR = ([] for _ in range(4))

In [None]:
def _select_best_params(build_model, param_grid, x_fit, y_fit, x_valid, y_valid):
    """Return the first parameter combination with the lowest validation MSE.

    build_model      -- callable turning a parameter dict into an unfitted regressor
    param_grid       -- dict of parameter lists, expanded with ParameterGrid
    x_fit, y_fit     -- data every candidate is fitted on
    x_valid, y_valid -- held-out data every candidate is scored on

    Returns None if the grid yields no candidate.
    """
    best_params, best_mse = None, float("inf")
    for params in ParameterGrid(param_grid):
        model = build_model(params)
        model.fit(x_fit, y_fit)
        mse = mean_squared_error(y_valid, model.predict(x_valid))
        # Strict "<" keeps the earliest candidate when several tie.
        if mse < best_mse:
            best_params, best_mse = params, mse
    return best_params


def _fit_and_score(model, x_fit, y_fit, x_eval, y_eval):
    """Fit `model` on (x_fit, y_fit) and return its MSE on (x_eval, y_eval)."""
    model.fit(x_fit, y_fit)
    return mean_squared_error(y_eval, model.predict(x_eval))


# The fold indices from KFold are positions, so index the target as an array:
# `y[train_index]` on a pandas Series would look rows up by label instead.
y_values = np.asarray(y)

# Nested evaluation: each outer fold is a test set; hyper-parameters are
# chosen on an inner 80/20 train/validation split of the outer training part.
for i, (train_index, test_index) in enumerate(kfold.split(x)):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]

    # Seeded so the inner split, and the chosen parameters, repeat across runs.
    x_train_D, x_val, y_train_D, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42
    )

    # Outer scaling: fitted on the full outer-training part only. The shared
    # scaler is re-fitted right after, so both transforms must be taken now.
    x_train_S = S.fit_transform(x_train)
    x_test_S = S.transform(x_test)

    # Inner scaling: fitted on the inner-training part only.
    x_train_D_S = S.fit_transform(x_train_D)
    x_val_S = S.transform(x_val)

    best_params_KNN = _select_best_params(
        lambda p: KNeighborsRegressor(n_neighbors=p['n_neighbors'], metric=p['metric']),
        param_grid_KNN, x_train_D_S, y_train_D, x_val_S, y_val,
    )
    best_params_DT = _select_best_params(
        lambda p: DecisionTreeRegressor(max_depth=p['max_depth'], random_state=42),
        param_grid_DT, x_train_D_S, y_train_D, x_val_S, y_val,
    )
    best_params_RF = _select_best_params(
        lambda p: RandomForestRegressor(
            n_estimators=p['n_estimators'], max_depth=p['max_depth'], random_state=42
        ),
        param_grid_RF, x_train_D_S, y_train_D, x_val_S, y_val,
    )

    print(f"Fold {i+1}:")

    # Refit each model with its selected parameters on the whole outer
    # training part and score it on the untouched outer test fold.
    knn = KNeighborsRegressor(n_neighbors=best_params_KNN['n_neighbors'], metric=best_params_KNN['metric'])
    knn_mse = _fit_and_score(knn, x_train_S, y_train, x_test_S, y_test)
    test_scores_KNN.append(knn_mse)

    print(f"\tKNeighborsRegressor:")
    print(f"\t- Melhor n_neighbors: {best_params_KNN['n_neighbors']}")
    print(f"\t- Melhor metric: {best_params_KNN['metric']}")
    print(f"\t- Erro médio quadrático no teste: {knn_mse:.2f}")

    dt = DecisionTreeRegressor(max_depth=best_params_DT['max_depth'], random_state=42)
    dt_mse = _fit_and_score(dt, x_train_S, y_train, x_test_S, y_test)
    test_scores_DT.append(dt_mse)

    print(f"\n\tDecisionTreeRegressor:")
    print(f"\t- Melhor max_depth: {best_params_DT['max_depth']}")
    print(f"\t- Erro médio quadrático no teste: {dt_mse:.2f}")

    rf = RandomForestRegressor(n_estimators=best_params_RF['n_estimators'], max_depth=best_params_RF['max_depth'], random_state=42)
    rf_mse = _fit_and_score(rf, x_train_S, y_train, x_test_S, y_test)
    test_scores_RF.append(rf_mse)

    print(f"\n\tRandomForestRegressor:")
    print(f"\t- Melhor n_estimators: {best_params_RF['n_estimators']}")
    print(f"\t- Melhor max_depth: {best_params_RF['max_depth']}")
    print(f"\t- Erro médio quadrático no teste: {rf_mse:.2f}")

    # Linear regression has no hyper-parameters to tune here.
    lr = LinearRegression()
    lr_mse = _fit_and_score(lr, x_train_S, y_train, x_test_S, y_test)
    test_scores_LR.append(lr_mse)

    print(f"\n\tLinearRegression:")
    print(f"\t- Erro médio quadrático no teste: {lr_mse:.2f}")

    print("----------------------------------------")

In [None]:
# Pairwise Wilcoxon signed-rank tests on the per-fold test MSEs.
# (Kept under its original name, although it is a list of (name, scores) pairs.)
test_scores_dict = [
    ('KNeighborsRegressor', test_scores_KNN),
    ('DecisionTreeRegressor', test_scores_DT),
    ('LinearRegression', test_scores_LR),
    ('RandomForestRegressor', test_scores_RF),
]

# NOTE(review): six pairwise tests share the 0.05 threshold with no
# multiple-comparison correction — confirm this is intended.
for i in range(len(test_scores_dict)):
    model1, test_scores_1 = test_scores_dict[i]

    for j in range(i + 1, len(test_scores_dict)):
        model2, test_scores_2 = test_scores_dict[j]

        print(f"\n{model1} e {model2}:")

        # scipy's wilcoxon returns a rank-sum statistic (W), not a z-score.
        w_stat, p = wilcoxon(test_scores_1, test_scores_2)

        print(f"\t- Estatística W: {w_stat}")
        # Four decimals: with two, a significant p-value prints as 0.00 or 0.05.
        print(f"\t- Valor-p: {p:.4f}")

        if p < 0.05:
            print("Diferença significativa (rejeitar H0).")

            # Lower mean MSE across the folds wins.
            mean1 = np.mean(test_scores_1)
            mean2 = np.mean(test_scores_2)

            best_model = model1 if mean1 < mean2 else model2
            print(f"{best_model} é o melhor modelo.")
        else:
            print("Sem diferença significativa (não rejeitar H0).")

        print("----------------------------------------")