In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Preprocess data - log transformation and create ratios between features
def preprocess_data(file_path):
    """Load a flight CSV, log-transform selected columns and add ratio features.

    Parameters
    ----------
    file_path : str or path-like
        Location of a CSV file readable by ``pd.read_csv``. The file must
        contain the columns ``ActualFlightTime``, ``ActualTotalFuel``,
        ``FlownPassengers``, ``BagsCount`` and ``FlightBagsWeight``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame in which the five columns above are replaced by
        ``log(1 + x)`` and three columns are appended, in this order:
        ``flight_time_fuel_ratio``, ``bags_weight_ratio`` and
        ``bags_passengers_ratio``. Each ratio is computed from the
        already log-transformed columns; non-finite results (NaN or +/-inf,
        e.g. from a zero denominator) are replaced with 0.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    df = pd.read_csv(file_path)

    log_columns = (
        'ActualFlightTime',
        'ActualTotalFuel',
        'FlownPassengers',
        'BagsCount',
        'FlightBagsWeight',
    )
    for column in log_columns:
        # log1p(x) == log(1 + x) but keeps precision when x is close to zero
        df[column] = np.log1p(df[column])

    # new feature name -> (numerator column, denominator column)
    ratio_features = {
        'flight_time_fuel_ratio': ('ActualFlightTime', 'ActualTotalFuel'),
        'bags_weight_ratio': ('FlightBagsWeight', 'BagsCount'),
        'bags_passengers_ratio': ('BagsCount', 'FlownPassengers'),
    }
    for feature, (numerator, denominator) in ratio_features.items():
        ratio = df[numerator] / df[denominator]
        # A zero denominator yields NaN/inf; map those to 0 so later steps get finite input
        df[feature] = np.where(np.isfinite(ratio), ratio, 0)

    return df

In [None]:
# Load the preprocessed training set and separate the target (ActualTOW) from the features
train_df = preprocess_data('input_refactor/training_refactor.csv')
y_train = train_df['ActualTOW']
x_train = train_df.drop(columns='ActualTOW')

In [None]:
# Display histogram of train data
# The trailing semicolon stops the notebook from echoing the returned array of Axes objects
train_df.hist(figsize=(15, 12));

In [None]:
# Display heatmap with correlation between features in train data
plt.figure(figsize=(12, 8))
sns.heatmap(train_df.corr(), annot=True, cmap='coolwarm')
# Title the figure so it stands alone; the trailing semicolon suppresses the repr output
plt.title('Correlation between features (training data)');

In [None]:
# Standardise the training features; the fitted scaler is kept so that the
# same transformation can be applied to other data later
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)

In [None]:
# Create model of regression using Random Forest
# A fixed seed makes the forest (and every score/prediction derived from it)
# reproducible across Restart Kernel -> Run All
forest = RandomForestRegressor(random_state=42)
# An empty grid means GridSearchCV evaluates a single candidate - the default
# hyperparameters - with 5-fold cross-validation scored by negative MSE
param_grid = {}
grid_search = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(x_train_scaled, y_train)

# This takes too long to run
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 4, 8],
#     'min_samples_split': [2, 4]
# }
# 
# grid_search = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5,
#                            scoring='neg_mean_squared_error',
#                            return_train_score=True)
# 
# grid_search.fit(x_train_scaled, y_train)

# Second option to save time - results were not good enough compared to the default settings
# param_dist = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 4, 8],
#     'min_samples_split': [2, 4]
# }
# 
# random_search = RandomizedSearchCV(estimator=forest, param_distributions=param_dist, cv=5,
#                                    scoring='neg_mean_squared_error', n_iter=10,
#                                    return_train_score=True)
# 
# random_search.fit(x_train_scaled, y_train)

In [None]:
# Select best model from grid search
# best_estimator_ is the candidate with the best cross-validated score, refitted by
# GridSearchCV on the full training data
best_model = grid_search.best_estimator_
# best_model = random_search.best_estimator_

In [None]:
# Check the score of model
# For a regressor, score() returns the R^2 coefficient of determination. It is evaluated
# here on the same data the model was fitted on, so it measures fit, not generalisation.
best_model.score(x_train_scaled, y_train)

In [None]:
# Check the RMSE of model
# Training-set RMSE: computed on the rows the forest was fitted on, so it is optimistic
rmse = np.sqrt(mean_squared_error(y_train, best_model.predict(x_train_scaled)))
print('RMSE:', rmse)

# Cross-validated RMSE: best_score_ is the mean negative MSE over the held-out folds
# of the grid search, which is a fairer estimate of the error on unseen data
cv_rmse = np.sqrt(-grid_search.best_score_)
print('CV RMSE:', cv_rmse)

In [None]:
# Preprocess validation data with the same function (log transforms and ratio features)
# that was used for the training data
val_df = preprocess_data('input_refactor/validation_refactor.csv')

In [None]:
# Scale the validation features with the scaler fitted on the training data.
# Select exactly the training feature columns, in training order, so this cell keeps
# working when val_df carries extra columns - for example when it is re-run after
# the PredictedActualTOW column has been added to val_df.
x_val_scaled = scaler.transform(val_df[x_train.columns])

In [None]:
# Predict the target for the validation rows with the selected model
predictions = best_model.predict(x_val_scaled)

In [None]:
# Destination of the prediction file
output_directory = 'output'
output_path = os.path.join(output_directory, 'output.csv')

# Make sure the destination directory exists before writing into it
os.makedirs(output_directory, exist_ok=True)

# Attach the predictions to the validation frame and write everything to csv
# NOTE(review): val_df holds the preprocessed (log-transformed) feature values, so
# those are what end up in the file - confirm this is intended
val_df['PredictedActualTOW'] = predictions
val_df.to_csv(output_path, index=False)

print('Data saved to', output_path)