In [None]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV, LassoCV

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
tree_data = pd.read_csv('/Users/timurchiks/Desktop/flight_price_predictor/data/processed/prepared_tree.csv')
linear_data = pd.read_csv('/Users/timurchiks/Desktop/flight_price_predictor/data/processed/prepared_linear.csv')

In [3]:
tree_data

Unnamed: 0,Price,From,To,is_holiday,avialine,duration,days_until_flight,part_of_day
0,25113,0,2,False,1,200,36,1
1,23599,0,2,False,1,200,43,1
2,23599,0,2,False,1,210,50,2
3,23599,0,2,False,1,205,59,1
4,26779,0,2,False,1,215,61,0
...,...,...,...,...,...,...,...,...
2109,15664,1,3,False,3,100,75,3
2110,19146,1,3,False,3,100,75,2
2111,15664,1,3,False,3,110,75,3
2112,19146,1,3,False,3,110,75,0


In [4]:
linear_data

Unnamed: 0,Price,is_holiday,duration,days_until_flight,From_Алматы,From_Астана,From_Шымкент,To_Алматы,To_Астана,To_Атырау,To_Шымкент,avialine_Air Astana,avialine_FlyArystan,avialine_Qazaq Air,avialine_SCAT,part_of_day_вечер,part_of_day_день,part_of_day_ночь,part_of_day_утро
0,25113,False,4.193800,-0.122540,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False
1,23599,False,4.193800,0.203485,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False
2,23599,False,4.658654,0.529510,True,False,False,False,False,True,False,False,True,False,False,False,False,True,False
3,23599,False,4.426227,0.948685,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False
4,26779,False,4.891081,1.041835,True,False,False,False,False,True,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109,15664,False,-0.454739,1.693885,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True
2110,19146,False,-0.454739,1.693885,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False
2111,15664,False,0.010115,1.693885,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True
2112,19146,False,0.010115,1.693885,False,True,False,False,False,False,True,False,False,False,True,True,False,False,False


In [5]:
x_tree = tree_data.drop(columns=['Price'])
y_tree = tree_data['Price']

In [None]:
x_tree_train, x_tree_test, y_tree_train, y_tree_test = train_test_split(x_tree, y_tree, test_size=0.2, random_state = 52)

In [7]:
x_kross_tree = x_tree.values
y_kross_tree = y_tree.values

kf = KFold(n_splits=5, shuffle=True, random_state=42)

Decision Tree

In [32]:
dt_params = {
    'max_depth' : [3, 7, 12, 16, 20],
    'min_samples_split' : [2, 7, 12, 16, 20],
    'min_samples_leaf' : [1, 2, 5, 7, 10],
    'max_features' : ['sqrt', 'log2'],
    'max_leaf_nodes' : [10, 25, 50, 75, 100]
}

dt_grid = GridSearchCV(DecisionTreeRegressor(), dt_params, cv=5)
dt_grid.fit(x_tree_train, y_tree_train)

print(dt_grid.best_params_)

dt = dt_grid.best_estimator_

dt_test_pred = dt.predict(x_tree_test)
dt_train_pred = dt.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, dt_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, dt_test_pred))
print("test R² Score:", r2_score(y_tree_test, dt_test_pred))
print("train R² Score:", r2_score(y_tree_train, dt_train_pred))

{'max_depth': 20, 'max_features': 'log2', 'max_leaf_nodes': 100, 'min_samples_leaf': 1, 'min_samples_split': 16}
Mean Squared Error (MSE): 16498631.449801367
Mean Absolute Error (MAE): 2803.5718603386217
test R² Score: 0.7340908928064893
train R² Score: 0.8111642365148625


  _data = np.array(data, dtype=dtype, copy=copy,


In [33]:
counter = 1
for train_index, val_index in kf.split(x_kross_tree):
    x_dt_train, x_dt_val = x_kross_tree[train_index], x_kross_tree[val_index]
    y_dt_train, y_dt_val = y_kross_tree[train_index], y_kross_tree[val_index]

    dt.fit(x_dt_train, y_dt_train)
    dt_test_pred = dt.predict(x_dt_val)
    dt_train_pred = dt.predict(x_dt_train)

    print(f'------------{counter}------------')
    counter += 1

    print("Mean Squared Error (MSE):", mean_squared_error(y_dt_val, dt_test_pred))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_dt_val, dt_test_pred))
    print("test R² Score:", r2_score(y_dt_val, dt_test_pred))
    print("train R² Score:", r2_score(y_dt_train, dt_train_pred))

------------1------------
Mean Squared Error (MSE): 11920236.428407295
Mean Absolute Error (MAE): 2421.285560452876
test R² Score: 0.8398798293985539
train R² Score: 0.8746694596405613
------------2------------
Mean Squared Error (MSE): 12853832.52394816
Mean Absolute Error (MAE): 2546.737099161866
test R² Score: 0.8148601731209946
train R² Score: 0.8526910729110343
------------3------------
Mean Squared Error (MSE): 15119518.568728868
Mean Absolute Error (MAE): 2764.911333453745
test R² Score: 0.7704187599137745
train R² Score: 0.8319196216746306
------------4------------
Mean Squared Error (MSE): 11606238.82687
Mean Absolute Error (MAE): 2428.8383990905277
test R² Score: 0.8371863740818816
train R² Score: 0.8643081991351574
------------5------------
Mean Squared Error (MSE): 14227997.333015716
Mean Absolute Error (MAE): 2629.3534466656893
test R² Score: 0.8149803108267218
train R² Score: 0.8575669174059857


Gradient Boosting

In [10]:
param_grid_gb = {
    'n_estimators': [100, 200],            
    'learning_rate': [0.05, 0.1, 0.2],     
    'max_depth': [3, 5, 7],                
    'min_samples_split': [2, 5, 10],       
    'min_samples_leaf': [1, 2, 4],         
    'subsample': [0.8, 1.0],               
    'max_features': ['sqrt', 'log2']       
}

gb_grid = GridSearchCV(GradientBoostingRegressor(), param_grid_gb, scoring='neg_mean_squared_error', cv=5)
gb_grid.fit(x_tree_train, y_tree_train)

print(gb_grid.best_params_)

gb = gb_grid.best_estimator_

gb_test_pred = gb.predict(x_tree_test)
gb_train_pred = gb.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, gb_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, gb_test_pred))
print("test R² Score:", r2_score(y_tree_test, gb_test_pred))
print("train R² Score:", r2_score(y_tree_train, gb_train_pred))

{'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error (MSE): 4616643.193216563
Mean Absolute Error (MAE): 1551.662420788919
test R² Score: 0.9255933758218476
train R² Score: 0.9746534654047005


  _data = np.array(data, dtype=dtype, copy=copy,


In [11]:
counter = 1
for train_index, val_index in kf.split(x_kross_tree):
    x_gb_train, x_gb_val = x_kross_tree[train_index], x_kross_tree[val_index]
    y_gb_train, y_gb_val = y_kross_tree[train_index], y_kross_tree[val_index]

    gb.fit(x_gb_train, y_gb_train)
    gb_test_pred = gb.predict(x_gb_val)
    gb_train_pred = gb.predict(x_gb_train)

    print(f'------------{counter}------------')
    counter += 1

    print("Mean Squared Error (MSE):", mean_squared_error(y_gb_val, gb_test_pred))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_gb_val, gb_test_pred))
    print("test R² Score:", r2_score(y_gb_val, gb_test_pred))
    print("train R² Score:", r2_score(y_gb_train, gb_train_pred))

------------1------------
Mean Squared Error (MSE): 6180888.206219731
Mean Absolute Error (MAE): 1751.643308101715
test R² Score: 0.9169743922452883
train R² Score: 0.9732133962432573
------------2------------
Mean Squared Error (MSE): 6008982.147325124
Mean Absolute Error (MAE): 1683.4912205338214
test R² Score: 0.9134497892047302
train R² Score: 0.9747840830977869
------------3------------
Mean Squared Error (MSE): 5569407.939963846
Mean Absolute Error (MAE): 1697.2826896961487
test R² Score: 0.9154317265069858
train R² Score: 0.9743762987022919
------------4------------
Mean Squared Error (MSE): 5360912.140650041
Mean Absolute Error (MAE): 1635.7525184653693
test R² Score: 0.9247965204862941
train R² Score: 0.972909388046107
------------5------------
Mean Squared Error (MSE): 6576540.363150061
Mean Absolute Error (MAE): 1703.8832314576828
test R² Score: 0.9144792183083973
train R² Score: 0.9731233797142538


In [18]:
param_grid_rf = {     
    'max_depth': [10, 12, 15],         
    'min_samples_split': [4, 7, 10],         
    'min_samples_leaf': [1, 2, 4],           
    'max_features': ['sqrt', 'log2', 0.5],   
    'bootstrap': [True, False]               
}

rf_grid = GridSearchCV(RandomForestRegressor(), param_grid_rf, scoring='neg_mean_squared_error', cv=5)
rf_grid.fit(x_tree_train, y_tree_train)

print(rf_grid.best_params_)

rf = rf_grid.best_estimator_

rf_test_pred = rf.predict(x_tree_test)
rf_train_pred = rf.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, rf_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, rf_test_pred))
print("test R² Score:", r2_score(y_tree_test, rf_test_pred))
print("train R² Score:", r2_score(y_tree_train, rf_train_pred))

{'bootstrap': False, 'max_depth': 15, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 4}
Mean Squared Error (MSE): 5742454.909897856
Mean Absolute Error (MAE): 1660.328694142721
test R² Score: 0.9074486230669566
train R² Score: 0.9765379313941891


  _data = np.array(data, dtype=dtype, copy=copy,


In [19]:
counter = 1
for train_index, val_index in kf.split(x_kross_tree):
    x_rf_train, x_rf_val = x_kross_tree[train_index], x_kross_tree[val_index]
    y_rf_train, y_rf_val = y_kross_tree[train_index], y_kross_tree[val_index]

    rf.fit(x_rf_train, y_rf_train)
    rf_test_pred = rf.predict(x_rf_val)
    rf_train_pred = rf.predict(x_rf_train)

    print(f'------------{counter}------------')
    counter += 1

    print("Mean Squared Error (MSE):", mean_squared_error(y_rf_val, rf_test_pred))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_rf_val, rf_test_pred))
    print("test R² Score:", r2_score(y_rf_val, rf_test_pred))
    print("train R² Score:", r2_score(y_rf_train, rf_train_pred))

------------1------------
Mean Squared Error (MSE): 7228288.721099462
Mean Absolute Error (MAE): 1881.361008343152
test R² Score: 0.902905044700225
train R² Score: 0.9743950492799621
------------2------------
Mean Squared Error (MSE): 6563065.032038078
Mean Absolute Error (MAE): 1837.3529576726291
test R² Score: 0.9054690714568326
train R² Score: 0.9757819879911451
------------3------------
Mean Squared Error (MSE): 5850205.542997209
Mean Absolute Error (MAE): 1736.903006887104
test R² Score: 0.9111679755400091
train R² Score: 0.9751461196001413
------------4------------
Mean Squared Error (MSE): 6723275.728726969
Mean Absolute Error (MAE): 1855.86516815315
test R² Score: 0.9056851305776099
train R² Score: 0.9730445215285238
------------5------------
Mean Squared Error (MSE): 6521541.266139862
Mean Absolute Error (MAE): 1710.8489717043085
test R² Score: 0.9151944219730779
train R² Score: 0.9747441436812626


In [20]:
x_linear = linear_data.drop(columns=['Price'])
y_linear = linear_data['Price']

In [21]:
x_linear_train, x_linear_test, y_linear_train, y_linear_test = train_test_split(x_linear, y_linear, test_size=0.2, random_state=52)

In [23]:
x_kross_linear = x_linear.values
y_kross_linear = y_linear.values

kf = KFold(n_splits=5, shuffle=True, random_state=42)

Linear Regression

In [22]:
lm = LinearRegression()

lm.fit(x_linear_train, y_linear_train)
lm_test_pred = lm.predict(x_linear_test)
lm_train_pred = lm.predict(x_linear_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_linear_test, lm_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_linear_test, lm_test_pred))
print("test R² Score:", r2_score(y_linear_test, lm_test_pred))
print("train R² Score:", r2_score(y_linear_train, lm_train_pred))

Mean Squared Error (MSE): 21460051.641888633
Mean Absolute Error (MAE): 3371.287483426499
test R² Score: 0.6541274838592784
train R² Score: 0.6716721841343282


In [24]:
counter = 1
for train_index, val_index in kf.split(x_kross_linear):
    x_lm_train, x_lm_val = x_kross_linear[train_index], x_kross_linear[val_index]
    y_lm_train, y_lm_val = y_kross_linear[train_index], y_kross_linear[val_index]

    lm.fit(x_lm_train, y_lm_train)
    lm_test_pred = lm.predict(x_lm_val)
    lm_train_pred = lm.predict(x_lm_train)

    print(f'------------{counter}------------')
    counter += 1

    print("Mean Squared Error (MSE):", mean_squared_error(y_lm_val, lm_test_pred))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_lm_val, lm_test_pred))
    print("test R² Score:", r2_score(y_lm_val, lm_test_pred))
    print("train R² Score:", r2_score(y_lm_train, lm_train_pred))

------------1------------
Mean Squared Error (MSE): 23749453.591607563
Mean Absolute Error (MAE): 3682.145390070922
test R² Score: 0.6809822872542275
train R² Score: 0.6652666885677307
------------2------------
Mean Squared Error (MSE): 23541024.28605201
Mean Absolute Error (MAE): 3572.825059101655
test R² Score: 0.6609274974795283
train R² Score: 0.6701929807284888
------------3------------
Mean Squared Error (MSE): 22783000.888888888
Mean Absolute Error (MAE): 3457.933806146572
test R² Score: 0.6540531649086474
train R² Score: 0.6718152847593097
------------4------------
Mean Squared Error (MSE): 23264794.609929077
Mean Absolute Error (MAE): 3517.026004728132
test R² Score: 0.6736388400078821
train R² Score: 0.6676180887700874
------------5------------
Mean Squared Error (MSE): 26945498.670616113
Mean Absolute Error (MAE): 3719.196682464455
test R² Score: 0.6496029854399983
train R² Score: 0.673187598924337


In [25]:
alphas = np.logspace(-4, 4, 100)

ridge_cv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', cv=5)
ridge_cv.fit(x_linear_train, y_linear_train)

print("Лучшая alpha для Ridge:", ridge_cv.alpha_)

lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000)
lasso_cv.fit(x_linear_train, y_linear_train)

print("Лучшая alpha для Lasso:", lasso_cv.alpha_)

Лучшая alpha для Ridge: 0.3593813663804629
Лучшая alpha для Lasso: 1.592282793341094


Ridge

In [26]:
ridge = Ridge(alpha=0.35)

ridge.fit(x_linear_train, y_linear_train)
ridge_test_pred = ridge.predict(x_linear_test)
ridge_train_pred = ridge.predict(x_linear_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_linear_test, ridge_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_linear_test, ridge_test_pred))
print("test R² Score:", r2_score(y_linear_test, ridge_test_pred))
print("train R² Score:", r2_score(y_linear_train, ridge_train_pred))

Mean Squared Error (MSE): 21449486.04415954
Mean Absolute Error (MAE): 3372.617931574656
test R² Score: 0.6542977700231738
train R² Score: 0.6716595497460743


In [28]:
counter = 1
for train_index, val_index in kf.split(x_kross_linear):
    x_r_train, x_r_val = x_kross_linear[train_index], x_kross_linear[val_index]
    y_r_train, y_r_val = y_kross_linear[train_index], y_kross_linear[val_index]

    ridge.fit(x_r_train, y_r_train)
    r_test_pred = ridge.predict(x_r_val)
    r_train_pred = ridge.predict(x_r_train)

    print(f'------------{counter}------------')
    counter += 1

    print("Mean Squared Error (MSE):", mean_squared_error(y_r_val, r_test_pred))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_r_val, r_test_pred))
    print("test R² Score:", r2_score(y_r_val, r_test_pred))
    print("train R² Score:", r2_score(y_r_train, r_train_pred))

------------1------------
Mean Squared Error (MSE): 23759020.00380379
Mean Absolute Error (MAE): 3676.4877317378555
test R² Score: 0.6808537851425367
train R² Score: 0.6653029385111398
------------2------------
Mean Squared Error (MSE): 23548873.656414915
Mean Absolute Error (MAE): 3581.290198842435
test R² Score: 0.6608144392871651
train R² Score: 0.670456559524367
------------3------------
Mean Squared Error (MSE): 22763139.408185724
Mean Absolute Error (MAE): 3455.122123418642
test R² Score: 0.6543547501310243
train R² Score: 0.6718154317722135
------------4------------
Mean Squared Error (MSE): 23265745.249596834
Mean Absolute Error (MAE): 3519.08503925951
test R² Score: 0.6736255043275159
train R² Score: 0.6676740177671425
------------5------------
Mean Squared Error (MSE): 26940602.255921472
Mean Absolute Error (MAE): 3726.9203220534987
test R² Score: 0.6496666580077999
train R² Score: 0.6731519730094511


Lasso

In [27]:
lasso = Lasso(alpha=1.59)

lasso.fit(x_linear_train, y_linear_train)
lasso_test_pred = lasso.predict(x_linear_test)
lasso_train_pred = lasso.predict(x_linear_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_linear_test, lasso_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_linear_test, lasso_test_pred))
print("test R² Score:", r2_score(y_linear_test, lasso_test_pred))
print("train R² Score:", r2_score(y_linear_train, lasso_train_pred))

Mean Squared Error (MSE): 21446331.435405083
Mean Absolute Error (MAE): 3371.939805149932
test R² Score: 0.6543486129794513
train R² Score: 0.6716574853598121


In [29]:
counter = 1
for train_index, val_index in kf.split(x_kross_linear):
    x_l_train, x_l_val = x_kross_linear[train_index], x_kross_linear[val_index]
    y_l_train, y_l_val = y_kross_linear[train_index], y_kross_linear[val_index]

    lasso.fit(x_l_train, y_l_train)
    l_test_pred = lasso.predict(x_l_val)
    l_train_pred = lasso.predict(x_l_train)

    print(f'------------{counter}------------')
    counter += 1

    print("Mean Squared Error (MSE):", mean_squared_error(y_l_val, l_test_pred))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_l_val, l_test_pred))
    print("test R² Score:", r2_score(y_l_val, l_test_pred))
    print("train R² Score:", r2_score(y_l_train, l_train_pred))

------------1------------
Mean Squared Error (MSE): 23763509.18516197
Mean Absolute Error (MAE): 3676.5501193318
test R² Score: 0.6807934836133473
train R² Score: 0.6652986086424586
------------2------------
Mean Squared Error (MSE): 23560400.757562872
Mean Absolute Error (MAE): 3583.066305410829
test R² Score: 0.6606484089995468
train R² Score: 0.6704512810104812
------------3------------
Mean Squared Error (MSE): 22756734.553723928
Mean Absolute Error (MAE): 3454.99113183899
test R² Score: 0.6544520041820199
train R² Score: 0.6718144733160598
------------4------------
Mean Squared Error (MSE): 23262702.012254585
Mean Absolute Error (MAE): 3517.207667178569
test R² Score: 0.6736681952038293
train R² Score: 0.6676706695393058
------------5------------
Mean Squared Error (MSE): 26939736.615275048
Mean Absolute Error (MAE): 3726.15129883802
test R² Score: 0.6496779147264784
train R² Score: 0.6731415145247235


Save Models

In [30]:
from joblib import dump

In [34]:
dump(dt, 'dt_v1.joblib')
dump(gb, 'gb_v1.joblib')
dump(rf, 'rf_v1.joblib')
dump(lm, 'lm_v1.joblib')
dump(ridge, 'ridge_v1.joblib')
dump(lasso, 'lasso_v1.joblib')

['lasso_v1.joblib']

In [35]:
import pickle

In [36]:
with open('dt_v1.pkl', 'wb') as f:
    pickle.dump(dt, f)

with open('gb_v1.pkl', 'wb') as f:
    pickle.dump(gb, f)

with open('rf_v1.pkl', 'wb') as f:
    pickle.dump(rf, f)

with open('lm_v1.pkl', 'wb') as f:
    pickle.dump(lm, f)

with open('ridge_v1.pkl', 'wb') as f:
    pickle.dump(ridge, f)

with open('lasso_v1.pkl', 'wb') as f:
    pickle.dump(lasso, f)
