In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV, LassoCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
tree_data = pd.read_csv('/Users/timurchiks/Desktop/flight_price_predictor/data/processed/prepared_tree.csv')
linear_data = pd.read_csv('/Users/timurchiks/Desktop/flight_price_predictor/data/processed/prepared_linear.csv')

In [3]:
tree_data

Unnamed: 0,Price,From,To,is_holiday,avialine,duration,days_until_flight,part_of_day
0,25113,0,2,False,1,200,36,1
1,23599,0,2,False,1,200,43,1
2,23599,0,2,False,1,210,50,2
3,23599,0,2,False,1,205,59,1
4,26779,0,2,False,1,215,61,0
...,...,...,...,...,...,...,...,...
2109,15664,1,3,False,3,100,75,3
2110,19146,1,3,False,3,100,75,2
2111,15664,1,3,False,3,110,75,3
2112,19146,1,3,False,3,110,75,0


In [4]:
linear_data

Unnamed: 0,Price,is_holiday,duration,days_until_flight,From_Алматы,From_Астана,From_Шымкент,To_Алматы,To_Астана,To_Атырау,To_Шымкент,avialine_Air Astana,avialine_FlyArystan,avialine_Qazaq Air,avialine_SCAT,part_of_day_вечер,part_of_day_день,part_of_day_ночь,part_of_day_утро
0,25113,False,4.193800,-0.122540,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False
1,23599,False,4.193800,0.203485,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False
2,23599,False,4.658654,0.529510,True,False,False,False,False,True,False,False,True,False,False,False,False,True,False
3,23599,False,4.426227,0.948685,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False
4,26779,False,4.891081,1.041835,True,False,False,False,False,True,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109,15664,False,-0.454739,1.693885,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True
2110,19146,False,-0.454739,1.693885,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False
2111,15664,False,0.010115,1.693885,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True
2112,19146,False,0.010115,1.693885,False,True,False,False,False,False,True,False,False,False,True,True,False,False,False


In [5]:
x_tree = tree_data.drop(columns=['Price'])
y_tree = tree_data['Price']

x_tree_train, x_tree_test, y_tree_train, y_tree_test = train_test_split(x_tree, y_tree, test_size=0.2, random_state = 52)

In [11]:
dt = DecisionTreeRegressor()

dt.fit(x_tree_train, y_tree_train)
dt_test_pred = dt.predict(x_tree_test)
dt_train_pred = dt.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, dt_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, dt_test_pred))
print("test R² Score:", r2_score(y_tree_test, dt_test_pred))
print("train R² Score:", r2_score(y_tree_train, dt_train_pred))

Mean Squared Error (MSE): 8157613.278434463
Mean Absolute Error (MAE): 1557.622537431048
test R² Score: 0.8685234184242263
train R² Score: 0.9968013202382501


In [12]:
gb = GradientBoostingRegressor()

gb.fit(x_tree_train, y_tree_train)
gb_test_pred = gb.predict(x_tree_test)
gb_train_pred = gb.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, gb_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, gb_test_pred))
print("test R² Score:", r2_score(y_tree_test, gb_test_pred))
print("train R² Score:", r2_score(y_tree_train, dt_train_pred))

Mean Squared Error (MSE): 7707254.103075317
Mean Absolute Error (MAE): 2080.3372563903245
test R² Score: 0.8757818753817334
train R² Score: 0.9968013202382501


In [13]:
rf = RandomForestRegressor()

rf.fit(x_tree_train, y_tree_train)
rf_test_pred = rf.predict(x_tree_test)
rf_train_pred = rf.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, rf_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, rf_test_pred))
print("test R² Score:", r2_score(y_tree_test, rf_test_pred))
print("train R² Score:", r2_score(y_tree_train, rf_train_pred))

Mean Squared Error (MSE): 4593715.186165505
Mean Absolute Error (MAE): 1376.8237359187963
test R² Score: 0.9259629074344073
train R² Score: 0.9873131186373905


In [14]:
x_linear = linear_data.drop(columns=['Price'])
y_linear = linear_data['Price']

x_linear_train, x_linear_test, y_linear_train, y_linear_test = train_test_split(x_linear, y_linear, test_size=0.2, random_state = 52)

In [15]:
lm = LinearRegression()

lm.fit(x_linear_train, y_linear_train)
lm_test_pred = lm.predict(x_linear_test)
lm_train_pred = lm.predict(x_linear_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_linear_test, lm_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_linear_test, lm_test_pred))
print("test R² Score:", r2_score(y_linear_test, lm_test_pred))
print("train R² Score:", r2_score(y_linear_train, lm_train_pred))

Mean Squared Error (MSE): 21460051.641888633
Mean Absolute Error (MAE): 3371.287483426499
test R² Score: 0.6541274838592784
train R² Score: 0.6716721841343282


In [19]:
alphas = np.logspace(-4, 4, 100)  # 100 альф от 10^-4 до 10^4
ridge_cv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', cv=5)
ridge_cv.fit(x_linear_train, y_linear_train)

print("Лучшая alpha для Ridge:", ridge_cv.alpha_)

# === LassoCV ===
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000)
lasso_cv.fit(x_linear_train, y_linear_train)

print("Лучшая alpha для Lasso:", lasso_cv.alpha_)

Лучшая alpha для Ridge: 0.3593813663804629
Лучшая alpha для Lasso: 1.592282793341094


In [17]:
ridge = Ridge(alpha=0.35)

ridge.fit(x_linear_train, y_linear_train)
ridge_test_pred = ridge.predict(x_linear_test)
ridge_train_pred = ridge.predict(x_linear_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_linear_test, ridge_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_linear_test, ridge_test_pred))
print("test R² Score:", r2_score(y_linear_test, ridge_test_pred))
print("train R² Score:", r2_score(y_linear_train, ridge_train_pred))

Mean Squared Error (MSE): 21449486.04415954
Mean Absolute Error (MAE): 3372.617931574656
test R² Score: 0.6542977700231738
train R² Score: 0.6716595497460743


In [21]:
lasso = Lasso(alpha=1.59)

lasso.fit(x_linear_train, y_linear_train)
lasso_test_pred = lasso.predict(x_linear_test)
lasso_train_pred = lasso.predict(x_linear_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_linear_test, lasso_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_linear_test, lasso_test_pred))
print("test R² Score:", r2_score(y_linear_test, lasso_test_pred))
print("train R² Score:", r2_score(y_linear_train, lasso_train_pred))

Mean Squared Error (MSE): 21446331.435405083
Mean Absolute Error (MAE): 3371.939805149932
test R² Score: 0.6543486129794513
train R² Score: 0.6716574853598121
