In [1]:
import pandas as pd
import numpy as np

# Đọc dữ liệu
data = pd.read_csv('data\\Features data set.csv')

# Hiển thị thông tin dữ liệu
print(data.info())
print(data.head())

# Chuyển đổi cột ngày tháng về dạng datetime
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

# Tính tổng doanh thu bằng cách cộng tất cả các MarkDown
data['Revenue'] = data[['Sale1', 'Sale2', 'Sale3', 'Sale4', 'Sale5']].sum(axis=1)

# Loại bỏ giá trị ngoại lai bằng cách sử dụng IQR
Q1 = data['Revenue'].quantile(0.25)
Q3 = data['Revenue'].quantile(0.75)
IQR = Q3 - Q1
data = data[~((data['Revenue'] < (Q1 - 1.5 * IQR)) | (data['Revenue'] > (Q3 + 1.5 * IQR)))]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         8190 non-null   int64  
 1   Date          8190 non-null   object 
 2   Temperature   8190 non-null   float64
 3   Fuel_Price    8190 non-null   float64
 4   Sale1         4032 non-null   float64
 5   Sale2         2921 non-null   float64
 6   Sale3         3613 non-null   float64
 7   Sale4         3464 non-null   float64
 8   Sale5         4050 non-null   float64
 9   CPI           7605 non-null   float64
 10  Unemployment  7605 non-null   float64
 11  IsHoliday     8190 non-null   bool   
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 712.0+ KB
None
   Store        Date  Temperature  Fuel_Price  Sale1  Sale2  Sale3  Sale4  \
0      1    5/2/2010        42.31       2.572    NaN    NaN    NaN    NaN   
1      1   12/2/2010        38.51       2.548    NaN    NaN

In [2]:
# Tổng hợp doanh thu hàng tuần
weekly_data = data.resample('W-Mon', on='Date').sum()

In [3]:
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.graph_objects as go

# Phân tích chuỗi thời gian
decomposition = seasonal_decompose(weekly_data['Revenue'], model='additive', period=52)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# Hiển thị các thành phần
fig = go.Figure()
fig.add_trace(go.Scatter(x=weekly_data.index, y=weekly_data['Revenue'], mode='lines', name='Original'))
fig.add_trace(go.Scatter(x=trend.index, y=trend, mode='lines', name='Trend'))
fig.add_trace(go.Scatter(x=seasonal.index, y=seasonal, mode='lines', name='Seasonality'))
fig.add_trace(go.Scatter(x=residual.index, y=residual, mode='lines', name='Residuals (Irregularity)'))
fig.update_layout(title='Time Series Decomposition', xaxis_title='Date', yaxis_title='Revenue')
fig.show()


In [4]:
# Hồi quy đơn biến
X_single = (weekly_data.index - weekly_data.index[0]).days.values.reshape(-1, 1)
y_single = weekly_data['Revenue'].values

# Hồi quy đa biến
X_multi = weekly_data.drop(columns=['Revenue'])
y_multi = weekly_data['Revenue']

# Xử lý giá trị thiếu
X_multi.fillna(X_multi.mean(), inplace=True)


In [5]:
from sklearn.model_selection import train_test_split

# Hồi quy đơn biến
X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(X_single, y_single, test_size=0.2, shuffle=False)

# Hồi quy đa biến
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y_multi, test_size=0.2, shuffle=False)


In [6]:
from sklearn.preprocessing import StandardScaler
import joblib

# Chuẩn hóa dữ liệu đơn biến
scaler_single = StandardScaler()
X_train_single_scaled = scaler_single.fit_transform(X_train_single)
X_test_single_scaled = scaler_single.transform(X_test_single)

# Chuẩn hóa dữ liệu đa biến
scaler_multi = StandardScaler()
X_train_multi_scaled = scaler_multi.fit_transform(X_train_multi)
X_test_multi_scaled = scaler_multi.transform(X_test_multi)

# Lưu scaler
joblib.dump(scaler_single, 'scaler_single.joblib')
joblib.dump(scaler_multi, 'scaler_multi.joblib')


['scaler_multi.joblib']

In [7]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Khởi tạo các mô hình
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(alpha=0.1),
    'Ridge': Ridge(alpha=1),
    'Decision Tree': DecisionTreeRegressor(max_depth=10),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, learning_rate=0.1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5)
}

# Hàm tính AIC
def calculate_aic(n, mse, num_params):
    aic = n * np.log(mse) + 2 * num_params
    return aic

# Huấn luyện và đánh giá các mô hình
def evaluate_models(X_train, X_test, y_train, y_test, model_type):
    results = {}
    predictions = {}
    for name, model in models.items():
        print(f'Training {name}...')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        num_params = len(model.coef_) if hasattr(model, 'coef_') else model.get_params().get('n_estimators', 1)
        aic = calculate_aic(len(y_test), mean_squared_error(y_test, y_pred), num_params)
        
        predictions[name] = y_pred
        results[name] = {'MAE': mae, 'MSE': mse, 'R^2': r2, 'AIC': aic}
        print(f'{name}: MAE = {mae}, MSE = {mse}, R^2 = {r2}, AIC = {aic}')
        
        # Lưu mô hình
        joblib.dump(model, f'{model_type}_{name}.joblib')
    
    return results, predictions

# Hồi quy đơn biến
results_single, predictions_single = evaluate_models(X_train_single_scaled, X_test_single_scaled, y_train_single, y_test_single, 'single')

# Hồi quy đa biến
results_multi, predictions_multi = evaluate_models(X_train_multi_scaled, X_test_multi_scaled, y_train_multi, y_test_multi, 'multi')


Training Linear Regression...
Linear Regression: MAE = 179716.9984259735, MSE = 48351007565.13791, R^2 = -0.8732955257741997, AIC = 912.2648572118658
Training Lasso...
Lasso: MAE = 179716.8554032791, MSE = 48350941604.20495, R^2 = -0.8732929702055194, AIC = 912.2648067360576
Training Ridge...
Ridge: MAE = 178010.66738230467, MSE = 47570880938.25447, R^2 = -0.8430705564659744, AIC = 911.6630066418821
Training Decision Tree...
Decision Tree: MAE = 156710.87135135132, MSE = 43614237534.425316, R^2 = -0.6897756664785868, AIC = 908.4500308691723
Training Random Forest...
Random Forest: MAE = 132355.5593324325, MSE = 32335728719.42557, R^2 = -0.25280483247723695, AIC = 1095.379228501311
Training AdaBoost...
AdaBoost: MAE = 130040.13752056404, MSE = 26779471003.36002, R^2 = -0.037535011976976396, AIC = 988.403352672918
Training Gradient Boosting...
Gradient Boosting: MAE = 156262.68519402543, MSE = 43435166755.81643, R^2 = -0.6828378071607017, AIC = 1106.2978040226244
Training XGBoost...
XGBo

In [8]:
import plotly.graph_objects as go

# Hiển thị kết quả
def display_results(results, predictions, X_test, y_test):
    results_df = pd.DataFrame(results).T
    print(results_df)
    
    # Vẽ biểu đồ dự đoán so với thực tế
    for name, y_pred in predictions.items():
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=X_test, y=y_test, mode='lines', name='Actual'))
        fig.add_trace(go.Scatter(x=X_test, y=y_pred, mode='lines', name='Predicted'))
        fig.update_layout(title=f'{name} Prediction vs Actual', xaxis_title='Date', yaxis_title='Revenue')
        fig.show()
    
    # Trực quan hóa kết quả so sánh
    for metric in ['MAE', 'MSE', 'R^2', 'AIC']:
        fig = go.Figure()
        fig.add_trace(go.Bar(x=results_df.index, y=results_df[metric], name=metric))
        fig.update_layout(
            title=f'Comparison of Models based on {metric}',
            xaxis_title='Model',
            yaxis_title=metric,
            barmode='group'
        )
        fig.show()

# Hiển thị kết quả hồi quy đơn biến
display_results(results_single, predictions_single, X_test_single.flatten(), y_test_single)

# Hiển thị kết quả hồi quy đa biến
display_results(results_multi, predictions_multi, X_test_multi.index, y_test_multi)


                             MAE           MSE       R^2          AIC
Linear Regression  179716.998426  4.835101e+10 -0.873296   912.264857
Lasso              179716.855403  4.835094e+10 -0.873293   912.264807
Ridge              178010.667382  4.757088e+10 -0.843071   911.663007
Decision Tree      156710.871351  4.361424e+10 -0.689776   908.450031
Random Forest      132355.559332  3.233573e+10 -0.252805  1095.379229
AdaBoost           130040.137521  2.677947e+10 -0.037535   988.403353
Gradient Boosting  156262.685194  4.343517e+10 -0.682838  1106.297804
XGBoost            146972.094189  3.934015e+10 -0.524182  1102.633922


                            MAE           MSE       R^2          AIC
Linear Regression  4.745596e-10  4.975060e-19  1.000000 -1537.353133
Lasso              3.110479e+03  2.530149e+07  0.999020   652.715834
Ridge              2.182407e+03  6.916602e+06  0.999732   604.729100
Decision Tree      8.573112e+04  1.223094e+10  0.526128   861.407685
Random Forest      5.521584e+04  6.071004e+09  0.764787  1033.491222
AdaBoost           7.463660e+04  8.957566e+09  0.652951   947.883282
Gradient Boosting  6.164491e+04  7.809643e+09  0.697426  1042.809128
XGBoost            5.890669e+04  7.418315e+09  0.712587  1040.907056
