In [368]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [369]:
df = pd.read_csv('split/training_split.csv')
df_val = pd.read_csv('split/validation_split.csv')

In [370]:
df.info()
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29731 entries, 0 to 29730
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   DepartureDate     29731 non-null  int64  
 1   DepartureYear     29731 non-null  int64  
 2   DepartureMonth    29731 non-null  int64  
 3   DepartureDay      29731 non-null  int64  
 4   FlightNumber      29731 non-null  int64  
 5   DepartureAirport  29731 non-null  int64  
 6   ArrivalAirport    29731 non-null  int64  
 7   Route             29731 non-null  int64  
 8   ActualFlightTime  29731 non-null  int64  
 9   ActualTotalFuel   29731 non-null  int64  
 10  ActualTOW         29731 non-null  float64
 11  FlownPassengers   29731 non-null  float64
 12  BagsCount         29731 non-null  float64
 13  FlightBagsWeight  29731 non-null  float64
dtypes: float64(4), int64(10)
memory usage: 3.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1878 entries, 0 to 1877
Data columns (

In [371]:
x = df.drop('ActualTOW', axis=1)  # Dataframe without the target variable
y = df['ActualTOW']  # Target variable

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
train_df = x_train.join(y_train)

In [None]:
train_df.hist(figsize=(15, 12))

In [372]:
train_df['ActualFlightTime'] = np.log(train_df['ActualFlightTime'] + 1)
train_df['ActualTotalFuel'] = np.log(train_df['ActualTotalFuel'] + 1)
train_df['FlownPassengers'] = np.log(train_df['FlownPassengers'] + 1)
train_df['BagsCount'] = np.log(train_df['BagsCount'] + 1)
train_df['FlightBagsWeight'] = np.log(train_df['FlightBagsWeight'] + 1)

In [None]:
train_df.hist(figsize=(15, 12))

In [None]:
plt.figure(figsize=(15, 12))
sns.heatmap(train_df.corr(), annot=True, cmap='coolwarm')

In [373]:
train_df['flight_time_fuel_ratio'] = train_df['ActualFlightTime'] / train_df['ActualTotalFuel']
train_df['bags_weight_ratio'] = train_df['FlightBagsWeight'] / train_df['BagsCount'] 
train_df['bags_passengers_ratio'] = train_df['BagsCount'] / train_df['FlownPassengers']

In [374]:
train_df['flight_time_fuel_ratio'] = np.where(np.isfinite(train_df['flight_time_fuel_ratio']), train_df['flight_time_fuel_ratio'], 0)
train_df['bags_weight_ratio'] = np.where(np.isfinite(train_df['bags_weight_ratio']), train_df['bags_weight_ratio'], 0)
train_df['bags_passengers_ratio'] = np.where(np.isfinite(train_df['bags_passengers_ratio']), train_df['bags_passengers_ratio'], 0)

In [None]:
plt.figure(figsize=(15, 12))
sns.heatmap(train_df.corr(), annot=True, cmap='coolwarm')

In [384]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train, y_train = train_df.drop('ActualTOW', axis=1), train_df['ActualTOW']
x_train_s = scaler.fit_transform(x_train)
reg = LinearRegression()
reg.fit(x_train, y_train)


In [397]:
test_df = x_test.join(y_test)
test_df['ActualFlightTime'] = np.log(test_df['ActualFlightTime'] + 1)
test_df['ActualTotalFuel'] = np.log(test_df['ActualTotalFuel'] + 1)
test_df['FlownPassengers'] = np.log(test_df['FlownPassengers'] + 1)
test_df['BagsCount'] = np.log(test_df['BagsCount'] + 1)
test_df['FlightBagsWeight'] = np.log(test_df['FlightBagsWeight'] + 1)

test_df['flight_time_fuel_ratio'] = test_df['ActualFlightTime'] / test_df['ActualTotalFuel']
test_df['bags_weight_ratio'] = test_df['FlightBagsWeight'] / test_df['BagsCount']
test_df['bags_passengers_ratio'] = test_df['BagsCount'] / test_df['FlownPassengers']

test_df['flight_time_fuel_ratio'] = np.where(np.isfinite(test_df['flight_time_fuel_ratio']), test_df['flight_time_fuel_ratio'], 0)
test_df['bags_weight_ratio'] = np.where(np.isfinite(test_df['bags_weight_ratio']), test_df['bags_weight_ratio'], 0)
test_df['bags_passengers_ratio'] = np.where(np.isfinite(test_df['bags_passengers_ratio']), test_df['bags_passengers_ratio'], 0)


x_test, y_test = test_df.drop('ActualTOW', axis=1), test_df['ActualTOW']


In [398]:
x_train_s = scaler.fit_transform(x_train)


In [399]:
reg.score(x_test, y_test)

-153.14782557624747

In [400]:
scaler = StandardScaler()
x_train, y_train = train_df.drop('ActualTOW', axis=1), train_df['ActualTOW']
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.fit_transform(x_test)

In [401]:
forest = RandomForestRegressor()

forest.fit(x_train_s, y_train)

In [405]:
x_train_s

array([[-0.92268475,  0.        ,  0.        , ..., -0.31886641,
         0.16736275,  0.40542597],
       [ 0.4651937 ,  0.        ,  0.        , ...,  0.89502379,
         0.03315169,  0.8066324 ],
       [ 1.62175907,  0.        ,  0.        , ..., -1.07838094,
         0.64168012, -0.52543607],
       ...,
       [ 0.00256755,  0.        ,  0.        , ...,  0.63682977,
         0.16736275,  0.44887871],
       [-0.92268475,  0.        ,  0.        , ...,  0.70158411,
         0.23998882,  0.23869449],
       [ 1.15913292,  0.        ,  0.        , ...,  1.03138257,
        -0.03017679,  0.93499903]])

In [407]:
forest.score(x_test_s, y_test)

0.13448591213322136

In [None]:
forest = RandomForestRegressor()

param_grid = {
    'n_estimators': [100, 200, 300],  # Different numbers of estimators to try
    'max_depth': [None, 4, 8],  # Different maximum depths to try
    'min_samples_split': [2, 4]  # Different minimum samples split to try
}

grid_search = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(x_train_s, y_train)


In [None]:
best_model = grid_search.best_estimator_

In [None]:
best_model.score(x_test_s, y_test)

In [None]:
model.predict(df_val)

In [None]:
# Create the output directory if it doesn't exist
output_directory = 'output'
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
    print('Output directory created successfully.')

# Save data to CSV file
output_path = os.path.join(output_directory, 'output.csv')
df_val.to_csv(output_path, index=False)

print('Data saved to', output_path)