# Zadanie

1. Zbuduj regresyjny model bazowy.

2. Dokonaj optymalizacji regresji. Celem jest wybór najlepszego modelu,
dostrojenie jego hiperparametrów, ocena jakości predykcji oraz analiza wyników.

W pierwszym etapie należy przetestować różne modele regresyjne na przygotowanym zbiorze
danych, porównując ich skuteczność na podstawie wybranych metryk regresyjnych (np. MAE,
RMSE, R²). Po identyfikacji najlepszego modelu należy przeprowadzić dostrajanie jego
hiperparametrów.

Po zakończeniu optymalizacji należy przeanalizować jakość przewidywań, a także – w miarę
możliwości – zweryfikować założenia modelu i rozkład reszt. W kolejnym kroku należy
przeprowadzić analizę istotności cech, jeśli model na to pozwala.

3. Sformułuj wnioski i obserwacje dotyczące działania modelu i jego
dopasowania do danych.


In [9]:
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings


import load_data as dataLoader

# Notebook-wide display configuration.
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Render all plotly express figures with the dark theme.
px.defaults.template = "plotly_dark"

In [None]:
# Load the raw inputs through the project-local `load_data` module:
# the currency table via `load_data_pl()` and the league metadata via
# `load_league_info_pl()`.
df_currency = dataLoader.load_data_pl()
df_leagues = dataLoader.load_league_info_pl()

# Print each frame's `.shape` as a quick sanity check after loading.
print("Currency Data Shape:", df_currency.shape)
print("Leagues Data Shape:", df_leagues.shape)
# NOTE(review): `df_players` is read here but is not assigned anywhere in the
# visible notebook before this line, so on a fresh kernel (Restart & Run All)
# this raises NameError. The recorded output shows a players shape, so a loader
# call was presumably removed from this cell — restore it (loader name must be
# confirmed against the `load_data` module).
print("Players Data Shape:", df_players.shape)


Currency Data Shape: (234156, 7)
Leagues Data Shape: (11, 5)
Players Data Shape: (5815, 3)


In [11]:
# Build an economic index from a basket of key currencies.
# Each basket member is priced in Chaos Orbs; the index is the row-wise mean
# of the prices that are actually available for a given (Date, League).
currency_basket = [
    "Redeemer's Exalted Orb",
    "Crusader's Exalted Orb",
    "Hunter's Exalted Orb",
    "Warlord's Exalted Orb",
    "Blessing of Chayula",
]

# Keep only rows that quote a basket currency against Chaos Orbs.
df_prices = df_currency.filter(
    (pl.col('Pay') == 'Chaos Orb') & (pl.col('Get').is_in(currency_basket))
)

# Reshape to one row per (Date, League) with one price column per currency.
df_pivot = df_prices.pivot(
    values='Value',
    index=['Date', 'League'],
    columns='Get'
).sort('Date')

# Guarantee that every basket column exists, even when a currency never
# appears in the filtered data, so the index expression below cannot fail.
missing_currencies = [c for c in currency_basket if c not in df_pivot.columns]
if missing_currencies:
    df_pivot = df_pivot.with_columns(
        [pl.lit(None, dtype=pl.Float64).alias(c) for c in missing_currencies]
    )

# A missing quote is not a price of zero: averaging nulls as 0 would drag the
# index down on days when a currency simply was not listed. `mean_horizontal`
# skips nulls, so the index is the mean of the quotes that exist.
df_pivot = df_pivot.with_columns(
    pl.mean_horizontal([pl.col(c) for c in currency_basket]).alias('economic_index')
)

# Drop rows without a usable index: a null index (no quotes at all) fails the
# predicate and is filtered out together with non-positive values.
df_features = df_pivot.filter(pl.col('economic_index') > 0)

# The 'League' column keeps its name on purpose: the league-metadata join in
# the feature-engineering step keys on 'League'. (The previous
# `df_features.rename(...)` call discarded its result and had no effect.)

print("Feature DataFrame after creating index:")
print(df_features.head())

Feature DataFrame after creating index:
shape: (5, 8)
┌────────────┬──────────┬────────────┬────────────┬────────────┬───────────┬───────────┬───────────┐
│ Date       ┆ League   ┆ Hunter's   ┆ Warlord's  ┆ Crusader's ┆ Redeemer' ┆ Blessing  ┆ economic_ │
│ ---        ┆ ---      ┆ Exalted    ┆ Exalted    ┆ Exalted    ┆ s Exalted ┆ of        ┆ index     │
│ date       ┆ str      ┆ Orb        ┆ Orb        ┆ Orb        ┆ Orb       ┆ Chayula   ┆ ---       │
│            ┆          ┆ ---        ┆ ---        ┆ ---        ┆ ---       ┆ ---       ┆ f64       │
│            ┆          ┆ f64        ┆ f64        ┆ f64        ┆ f64       ┆ f64       ┆           │
╞════════════╪══════════╪════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╡
│ 2022-08-20 ┆ Kalandra ┆ 76.77758   ┆ 49.34123   ┆ 49.32625   ┆ 65.0      ┆ 109.3     ┆ 69.949012 │
│ 2022-08-21 ┆ Kalandra ┆ 89.27368   ┆ 44.87255   ┆ 35.09      ┆ 49.19549  ┆ 95.0      ┆ 62.686344 │
│ 2022-08-22 ┆ Kalandra ┆ 120.0      

In [12]:
# Feature engineering
#
# Every step writes to a new, stage-suffixed name instead of overwriting its
# input, so the cell can be re-run safely: re-joining `df_features` onto itself
# or re-aggregating an already aggregated `df_players` would otherwise add
# duplicate columns or fail on the missing 'DateTime' column.

# Attach each league's release date so the league's age can be derived.
df_features_dated = df_features.join(
    df_leagues.select(['League', 'Release Date']), on='League', how='left'
)
print(df_features_dated.head())

# Whole days elapsed between the observation date and the league release.
df_features_aged = df_features_dated.with_columns(
    (pl.col('Date') - pl.col('Release Date')).dt.total_days().alias('days_since_start')
)

# Collapse the player-count samples to one mean value per calendar day.
df_players_daily = (
    df_players
    .with_columns(pl.col('DateTime').dt.date().alias('Date'))
    .group_by('Date')
    .agg(pl.mean('Average Players').alias('avg_players'))
)

# Left join keeps every price row; days without player data get nulls, which
# are forward-filled in Date order. Rows that are still incomplete afterwards
# are dropped.
# NOTE(review): the forward fill applies to every column and follows Date order
# only, so gaps are also filled across league boundaries — confirm this is
# intended.
df_final = (
    df_features_aged
    .join(df_players_daily, on='Date', how='left')
    .sort('Date')
    .fill_null(strategy='forward')
    .drop_nulls()
)


print("Final DataFrame with all features:")
print(df_final.head())

shape: (5, 9)
┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ Date      ┆ League   ┆ Hunter's  ┆ Warlord's ┆ … ┆ Redeemer' ┆ Blessing  ┆ economic_ ┆ Release   │
│ ---       ┆ ---      ┆ Exalted   ┆ Exalted   ┆   ┆ s Exalted ┆ of        ┆ index     ┆ Date      │
│ date      ┆ str      ┆ Orb       ┆ Orb       ┆   ┆ Orb       ┆ Chayula   ┆ ---       ┆ ---       │
│           ┆          ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ f64       ┆ datetime[ │
│           ┆          ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆           ┆ μs]       │
╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2022-08-2 ┆ Kalandra ┆ 76.77758  ┆ 49.34123  ┆ … ┆ 65.0      ┆ 109.3     ┆ 69.949012 ┆ 2022-08-1 │
│ 0         ┆          ┆           ┆           ┆   ┆           ┆           ┆           ┆ 9         │
│           ┆          ┆           ┆           ┆   ┆           ┆           ┆ 

In [13]:
# Build and evaluate the baseline model (ordinary least squares).

features = ['days_since_start', 'avg_players']
target = 'economic_index'

X = df_final.select(features)
y = df_final.select(target)

# Hold out the last 20% of rows as the test set. `shuffle=False` keeps the row
# order of `df_final`, so the split is a contiguous head/tail split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Fit on a flat 1-D target, the same way the model-comparison cell does, so
# predictions come back as a 1-D array as well.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train.to_numpy().ravel())

y_pred_linear = linear_model.predict(X_test)

# Hold-out metrics; RMSE is the square root of the mean squared error.
mae_linear = mean_absolute_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)

# Full metric names restored so the labels are self-explanatory and match the
# recorded cell output.
print("--- Baseline Model (Linear Regression) ---")
print(f"Mean Absolute Error (MAE): {mae_linear:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_linear:.2f}")
print(f"R-squared (R²): {r2_linear:.2f}")

--- Baseline Model (Linear Regression) ---
Mean Absolute Error (MAE): 41.37
Root Mean Squared Error (RMSE): 60.38
R-squared (R²): 0.14


In [14]:
# Candidate regressors keyed by the display name used in the report below.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
}

# Fit every candidate on the training split and score it on the hold-out
# split; the fitted estimator is stored next to its metrics for later reuse.
results = {}
for name, estimator in models.items():
    estimator.fit(X_train, y_train.to_numpy().ravel())
    predictions = estimator.predict(X_test)

    results[name] = {
        'MAE': mean_absolute_error(y_test, predictions),
        'RMSE': np.sqrt(mean_squared_error(y_test, predictions)),
        'R²': r2_score(y_test, predictions),
        'model': estimator,
    }

# Report the hold-out metrics, one section per model.
for name, metrics in results.items():
    print(f"--- {name} ---")
    print(f"MAE: {metrics['MAE']:.2f}")
    print(f"RMSE: {metrics['RMSE']:.2f}")
    print(f"R²: {metrics['R²']:.2f}\n")

--- Linear Regression ---
MAE: 41.37
RMSE: 60.38
R²: 0.14

--- Random Forest ---
MAE: 56.46
RMSE: 76.27
R²: -0.38

--- XGBoost ---
MAE: 62.97
RMSE: 80.76
R²: -0.54



In [15]:
# Select the best model from the comparison results instead of hard-coding a
# name: the model with the lowest hold-out RMSE wins. All models are scored on
# the same test set, so this is also the model with the highest R².
best_model_name = min(results, key=lambda model_name: results[model_name]['RMSE'])
best_model = results[best_model_name]['model']

# Assemble the plotting frame: test features plus actual values, predictions
# and the matching dates.
plot_df = X_test.with_columns(y_test.to_series().alias('Actual'))
plot_df = plot_df.with_columns(
    # ravel() guards against estimators that return a column vector.
    pl.Series(name="Predicted", values=np.ravel(best_model.predict(X_test)))
)

# The split was not shuffled, so the test rows are exactly the rows of
# `df_final` that follow the training rows.
dates_test = df_final.get_column('Date').slice(len(X_train))
plot_df = plot_df.with_columns(dates_test)


# Actual vs. predicted index over the test period.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=plot_df['Date'], y=plot_df['Actual'],
    mode='lines', name='Wartości Rzeczywiste (Actual)'
))
fig.add_trace(go.Scatter(
    x=plot_df['Date'], y=plot_df['Predicted'],
    mode='lines', name='Wartości Przewidywane (Predicted)', line=dict(dash='dash')
))

fig.update_layout(
    title=f"Porównanie Wartości Rzeczywistych i Przewidywanych dla modelu {best_model_name}",
    xaxis_title="Data",
    yaxis_title="Indeks Ekonomiczny",
    legend_title="Legenda",
    legend=dict(
        orientation="h",  # horizontal legend
        yanchor="bottom", # anchor the legend's bottom edge at `y`
        y=-0.5,           # negative value places the legend below the plot area
        xanchor="right",  # anchor the legend's right edge at `x`
        x=1               # align with the right edge of the plot area
    )
)
fig.show()

In [16]:
# Residual = actual value minus predicted value, one per test row.
residuals = plot_df.get_column('Actual') - plot_df.get_column('Predicted')

# Scatter the residuals against the date; the dashed red line marks zero error.
residual_dates = plot_df.get_column('Date')
fig_residuals = px.scatter(
    x=residual_dates,
    y=residuals,
    title="Analiza Reszt (Błędów Modelu) w Czasie",
    labels={'x': 'Data', 'y': 'Reszta (Actual - Predicted)'},
)
fig_residuals.add_hline(y=0, line_dash="dash", line_color="red")
fig_residuals.show()