In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the already-cleaned flight data.
data = pd.read_excel('vietjet_flight_data.xlsx')

# Quick sanity check of the first rows.
print(data.head())

# Derive time-based features from the departure timestamp.
# NOTE(review): this relies on 'f_time_from' being loaded as a datetime column;
# the .dt accessor raises otherwise.
departure_time = data['f_time_from']
data = data.assign(
    hour_from=departure_time.dt.hour,
    day_of_week=departure_time.dt.dayofweek,  # 0: Monday, 6: Sunday
    month_from=departure_time.dt.month,
)

features = ['hour_from', 'day_of_week', 'month_from', 'ticket_type', 'distance']
target_f_price = 'f_price'
target_fees = 'fees'

# Separate the feature matrix X from the two target vectors.
X = data[features]
y_f_price = data[target_f_price]
y_fees = data[target_fees]

# Split into training and test sets. All three arrays go through ONE call so the
# same row partition is applied to the features and to both targets. Calling
# train_test_split once per target overwrites X_train/X_test and only stays
# aligned as long as both calls keep identical test_size and random_state.
(
    X_train, X_test,
    y_train_f_price, y_test_f_price,
    y_train_fees, y_test_fees,
) = train_test_split(X, y_f_price, y_fees, test_size=0.2, random_state=42)

print(X_train.head())

              id code code_name f_code         f_time_from  \
0  vj16169846100   vj   Vietjet  VJ122 2021-04-01 04:50:00   
1  vj16169846231   vj   Vietjet  VJ176 2021-04-01 05:25:00   
2  vj16169846302   vj   Vietjet  VJ120 2021-04-01 06:00:00   
3  vj16169846383   vj   Vietjet  VJ132 2021-04-01 06:05:00   
4  vj16169846454   vj   Vietjet  VJ134 2021-04-01 07:00:00   

            f_time_to  f_price    fees  total_price    from  \
0 2021-04-01 07:10:00   199000  650900       849900  TP HCM   
1 2021-04-01 07:35:00   199000  650900       849900  TP HCM   
2 2021-04-01 08:10:00   199000  650900       849900  TP HCM   
3 2021-04-01 08:05:00   199000  650900       849900  TP HCM   
4 2021-04-01 09:10:00   409000  661900      1070900  TP HCM   

         airport_from      to     airport_to type airport_code_from  \
0  Tân Sơn Nhất (SGN)  Hà Nội  Nội Bài (HAN)  Eco               SGN   
1  Tân Sơn Nhất (SGN)  Hà Nội  Nội Bài (HAN)  Eco               SGN   
2  Tân Sơn Nhất (SGN)  Hà Nội  Nội 

In [2]:
def _fit_and_report(X_fit, y_fit, X_eval, y_eval, label, n_estimators=100, random_state=42):
    """Fit a random forest regressor for one target, score it, and print the scores.

    Parameters
    ----------
    X_fit, y_fit : training features and training target values.
    X_eval, y_eval : held-out features and the matching true target values.
    label : target name inserted into the printed metric lines.
    n_estimators, random_state : forwarded to ``RandomForestRegressor``.

    Returns
    -------
    tuple
        ``(model, predictions, mae, mse)`` where ``model`` is the fitted
        regressor, ``predictions`` are its outputs for ``X_eval``, and
        ``mae`` / ``mse`` are the mean absolute and mean squared errors of
        those predictions against ``y_eval``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_fit, y_fit)

    predictions = model.predict(X_eval)

    mae = mean_absolute_error(y_eval, predictions)
    mse = mean_squared_error(y_eval, predictions)

    print(f'Mean Absolute Error for {label}: {mae:.2f}')
    print(f'Mean Squared Error for {label}: {mse:.2f}')
    return model, predictions, mae, mse


# Random forest for the ticket price ('f_price'): fit, predict, evaluate.
rf_f_price, y_pred_f_price, mae_f_price, mse_f_price = _fit_and_report(
    X_train, y_train_f_price, X_test, y_test_f_price, 'f_price'
)

# Random forest for the fees ('fees'): fit, predict, evaluate.
rf_fees, y_pred_fees, mae_fees, mse_fees = _fit_and_report(
    X_train, y_train_fees, X_test, y_test_fees, 'fees'
)

Mean Absolute Error for f_price: 180685.42
Mean Squared Error for f_price: 118410330213.15
Mean Absolute Error for fees: 13036.97
Mean Squared Error for fees: 876000703.19


In [3]:
import joblib

# Persist both fitted regressors to disk.
price_model_path = 'rf_f_price_model.pkl'
fees_model_path = 'rf_fees_model.pkl'

joblib.dump(rf_f_price, price_model_path)
# Kept as the cell's final expression so the notebook still displays the
# list of written filenames returned by joblib.dump.
joblib.dump(rf_fees, fees_model_path)

['rf_fees_model.pkl']