In [260]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [261]:
df = pd.read_csv('split/training_split.csv')
df_val = pd.read_csv('split/validation_split.csv')

In [262]:
x = df.drop("ActualTOW", axis=1)
y = df["ActualTOW"]
x_test, x_train, y_test, y_train = train_test_split(x, y, test_size=0.2, random_state=42)
train_data = x_train.join(y_train)

In [266]:
scaler = StandardScaler()
x_train, y_train = train_data.drop("ActualTOW", axis=1), train_data["ActualTOW"]
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.fit_transform(x_test)

In [None]:
train_data.hist(figsize=(15, 12))

In [None]:
numeric_columns = train_data.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = train_data[numeric_columns].corr()

plt.figure(figsize=(15, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

In [None]:
train_data.hist(figsize=(15, 12))

In [None]:
plt.figure(figsize=(15, 8))
sns.scatterplot(x='BagsCount', y='ActualTotalFuel', data=train_data, hue='ActualTOW', palette='coolwarm')

In [None]:
model = RandomForestRegressor(n_estimators=100)

param_grid = {
    'n_estimators': [100, 200, 300],  # Different numbers of estimators to try
    'max_depth': [None, 4, 8],  # Different maximum depths to try
    'min_samples_split': [2, 4]  # Different minimum samples split to try
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(x_train_s, y_train)


In [None]:
best_model = grid_search.best_estimator_

In [None]:
best_model.score(x_test_s, y_test)

In [267]:
model = RandomForestRegressor(n_estimators=100)
model.fit(x_train_s, y_train)
# y_pred = model.predict(x_train_s)
# mse = mean_squared_error(Y_train, y_pred)
# print("MSE: " % mse)

In [268]:
model.score(x_test_s, y_test)

0.11181377529387082

In [270]:
model.predict(df_val)



array([54472.37, 70934.57, 54462.41, ..., 18502.11, 18502.11, 18502.11])

In [271]:
# Create the output directory if it doesn't exist
output_directory = 'output'
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
    print("Output directory created successfully.")

# Save data to CSV file
output_path = os.path.join(output_directory, 'output.csv')
df_val.to_csv(output_path, index=False)

print("Data saved to", output_path)

Data saved to output\output.csv
