In [21]:
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV, LassoCV

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [22]:
# Load the two prepared datasets (one encoded for tree models, one for linear models).
#
# The linear CSV carries an 'Unnamed: 0' column holding 0, 1, 2, ... -- it looks
# like a row index that was written out when the file was saved. Left in place
# it would be fed to the models as a feature, because the feature matrix is
# built by dropping only 'Price'. It is therefore removed right after loading;
# errors='ignore' makes this a no-op for a file that has no such column
# (tree_data is assumed to have been saved the same way -- TODO confirm).
tree_data = pd.read_csv(
    's3://220103263-bucket/csv_files/processed/prepared_tree.csv'
).drop(columns=['Unnamed: 0'], errors='ignore')
linear_data = pd.read_csv(
    's3://220103263-bucket/csv_files/processed/prepared_linear.csv'
).drop(columns=['Unnamed: 0'], errors='ignore')

In [23]:
# Display the loaded frame to inspect its columns and values.
linear_data

Unnamed: 0,Price,is_holiday,duration,days_until_flight,From_Алматы,From_Астана,From_Шымкент,To_Алматы,To_Астана,To_Атырау,To_Шымкент,avialine_Air Astana,avialine_FlyArystan,avialine_Qazaq Air,avialine_SCAT,part_of_day_вечер,part_of_day_день,part_of_day_ночь,part_of_day_утро
0,10.131181,0,4.193800,-0.122540,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
1,10.069002,0,4.193800,0.203485,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
2,10.069002,0,4.658654,0.529510,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
3,10.069002,0,4.426227,0.948685,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
4,10.195411,0,4.891081,1.041835,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109,9.659184,0,-0.454739,1.693885,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1
2110,9.859901,0,-0.454739,1.693885,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
2111,9.659184,0,0.010115,1.693885,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1
2112,9.859901,0,0.010115,1.693885,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0


In [24]:
# Target is the 'Price' column; the features are every remaining column of tree_data.
y_tree = tree_data.loc[:, 'Price']
x_tree = tree_data.drop('Price', axis=1)

In [25]:
# Plain NumPy arrays of the features/target, so the integer row positions
# produced by kf.split() can index them directly.
x_kross_tree = x_tree.to_numpy()
y_kross_tree = y_tree.to_numpy()

# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [26]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the same split is produced on every run.
x_tree_train, x_tree_test, y_tree_train, y_tree_test = train_test_split(
    x_tree,
    y_tree,
    test_size=0.2,
    random_state=52,
)

In [27]:
# x_kross_tree, y_kross_tree and kf are already defined in the cell that
# precedes the train/test split, and x_tree / y_tree are not modified in
# between. The verbatim re-definition that used to live in this cell was
# redundant and has been removed.

In [28]:
# Hyper-parameter grid for the decision tree: 5 * 5 * 5 * 2 * 5 = 1250 combinations.
dt_params = {
    'max_depth' : [3, 7, 12, 16, 20],
    'min_samples_split' : [2, 7, 12, 16, 20],
    'min_samples_leaf' : [1, 2, 5, 7, 10],
    'max_features' : ['sqrt', 'log2'],
    'max_leaf_nodes' : [10, 25, 50, 75, 100]
}

# With max_features='sqrt'/'log2' the tree samples candidate features at each
# split, so the estimator is seeded: without a fixed random_state the selected
# parameters and every metric below change from run to run.
# n_jobs=-1 runs the 1250 x 5 fits in parallel; it does not affect the results.
dt_grid = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    dt_params,
    cv=5,
    n_jobs=-1,
)
dt_grid.fit(x_tree_train, y_tree_train)

print(dt_grid.best_params_)

# Best configuration, already refit by GridSearchCV on the whole training split.
dt = dt_grid.best_estimator_

# Predict on both splits so the train/test gap (over-fitting) is visible.
dt_test_pred = dt.predict(x_tree_test)
dt_train_pred = dt.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, dt_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, dt_test_pred))
print("test R² Score:", r2_score(y_tree_test, dt_test_pred))
print("train R² Score:", r2_score(y_tree_train, dt_train_pred))

{'max_depth': 20, 'max_features': 'log2', 'max_leaf_nodes': 100, 'min_samples_leaf': 2, 'min_samples_split': 20}
Mean Squared Error (MSE): 0.02534416027982933
Mean Absolute Error (MAE): 0.1273167245914586
test R² Score: 0.8200646725838557
train R² Score: 0.8668839512102502


In [29]:
# 5-fold cross-validation of the tuned decision tree over the full dataset.
#
# Each fold fits a fresh clone (same hyper-parameters, no fitted state) instead
# of calling dt.fit(): refitting `dt` in place would overwrite the model that
# the grid search fitted on the training split, leaving `dt` as "the model of
# the last fold". Fold-local prediction names are used for the same reason --
# the hold-out predictions dt_test_pred / dt_train_pred computed in the
# previous cell stay intact.
for fold_no, (train_index, val_index) in enumerate(kf.split(x_kross_tree), start=1):
    x_dt_train, x_dt_val = x_kross_tree[train_index], x_kross_tree[val_index]
    y_dt_train, y_dt_val = y_kross_tree[train_index], y_kross_tree[val_index]

    fold_dt = clone(dt).fit(x_dt_train, y_dt_train)
    dt_val_pred = fold_dt.predict(x_dt_val)
    dt_fold_train_pred = fold_dt.predict(x_dt_train)

    print(f'------------{fold_no}------------')

    print("Mean Squared Error (MSE):", mean_squared_error(y_dt_val, dt_val_pred))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_dt_val, dt_val_pred))
    print("test R² Score:", r2_score(y_dt_val, dt_val_pred))
    print("train R² Score:", r2_score(y_dt_train, dt_fold_train_pred))

------------1------------
Mean Squared Error (MSE): 0.028373005249666537
Mean Absolute Error (MAE): 0.1285079061468828
test R² Score: 0.8165872528488501
train R² Score: 0.8492897027103663
------------2------------
Mean Squared Error (MSE): 0.02582995095842564
Mean Absolute Error (MAE): 0.12333678534420735
test R² Score: 0.8297784460271594
train R² Score: 0.8812522282421359
------------3------------
Mean Squared Error (MSE): 0.026984321330351013
Mean Absolute Error (MAE): 0.12636878519024902
test R² Score: 0.8099276392339402
train R² Score: 0.8611830829666837
------------4------------
Mean Squared Error (MSE): 0.02247051563209039
Mean Absolute Error (MAE): 0.11365452114402042
test R² Score: 0.8511373374397194
train R² Score: 0.8725036161339511
------------5------------
Mean Squared Error (MSE): 0.026188816708847474
Mean Absolute Error (MAE): 0.12405009445266735
test R² Score: 0.8351193488129812
train R² Score: 0.8566645884518158


In [32]:
# Hyper-parameter grid for gradient boosting: 2 * 3 * 3 * 3 * 3 * 2 * 2 = 648 combinations.
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}

# subsample < 1.0 and max_features='sqrt'/'log2' both make the fit stochastic,
# so the estimator is seeded to keep the search and the metrics reproducible.
# n_jobs=-1 runs the 648 x 5 fits in parallel; it does not affect the results.
gb_grid = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid_gb,
    cv=5,
    n_jobs=-1,
)
gb_grid.fit(x_tree_train, y_tree_train)

print(gb_grid.best_params_)

# Best configuration, already refit by GridSearchCV on the whole training split.
gb = gb_grid.best_estimator_

# Predict on both splits so the train/test gap (over-fitting) is visible.
gb_test_pred = gb.predict(x_tree_test)
gb_train_pred = gb.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, gb_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, gb_test_pred))
print("test R² Score:", r2_score(y_tree_test, gb_test_pred))
print("train R² Score:", r2_score(y_tree_train, gb_train_pred))

{'learning_rate': 0.2, 'max_depth': 7, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 1.0}
Mean Squared Error (MSE): 0.009935120892038837
Mean Absolute Error (MAE): 0.0749406369442922
test R² Score: 0.9294638602782691
train R² Score: 0.9864343301499183


In [33]:


# Random forest with default hyper-parameters (no tuning).
# The forest draws bootstrap samples and feature subsets at random, so it is
# seeded; otherwise the metrics below differ on every run.
rf = RandomForestRegressor(random_state=42)

rf.fit(x_tree_train, y_tree_train)

# Predict on both splits so the train/test gap (over-fitting) is visible.
rf_test_pred = rf.predict(x_tree_test)
rf_train_pred = rf.predict(x_tree_train)

print("Mean Squared Error (MSE):", mean_squared_error(y_tree_test, rf_test_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_tree_test, rf_test_pred))
print("test R² Score:", r2_score(y_tree_test, rf_test_pred))
print("train R² Score:", r2_score(y_tree_train, rf_train_pred))

Mean Squared Error (MSE): 0.009375869454944076
Mean Absolute Error (MAE): 0.0685369391498913
test R² Score: 0.9334343643048587
train R² Score: 0.9888104879589865
