In [None]:
import os
import pandas as pd

# current_dir = os.path.dirname()
current_directory = os.getcwd()

df = pd.DataFrame(pd.read_csv(os.path.join(current_directory, 'Ice Cream.csv')))
df.head()
current_directory

In [243]:
from sklearn.model_selection import train_test_split

X = df['Temperature']
y = df['Revenue']

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2, random_state=42)

In [244]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# X_train = pd.DataFrame(X_train)
scaler = StandardScaler()

X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [245]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
pred = model.predict(X_test_scaled)

In [246]:
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

mse, rmse, r2

(1100.0180417242352, 33.16651989166538, 0.9647661985994558)

In [247]:
from sklearn.model_selection import KFold, cross_val_score

model_cross_validated = RandomForestRegressor(n_estimators=100, random_state=42)
model_cross_validated.fit(X_train_scaled, y_train)
y_pred_cross_validated = model_cross_validated.predict(X_test_scaled)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_mse = cross_val_score(model_cross_validated, X_train_scaled, y_train, cv=kf, scoring='neg_mean_squared_error')
cross_val_r2 = cross_val_score(model_cross_validated, X_train_scaled, y_train, cv=kf, scoring='r2')

mean_cross_val_mse = -cross_val_mse.mean()
std_cross_val_mse = cross_val_mse.std()
mean_cross_val_r2 = cross_val_r2.mean()
std_cross_val_r2 = cross_val_r2.std()

model_cross_validated.fit(X_train_scaled, y_train)
y_pred_cross_validated = model_cross_validated.predict(X_test_scaled)

mse_rf = mean_squared_error(y_test, y_pred_cross_validated)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_cross_validated)

print(f'Cross-Validation MSE: {mean_cross_val_mse:.4f} ± {std_cross_val_mse:.4f}')
print(f'Cross-Validation R²: {mean_cross_val_r2:.4f} ± {std_cross_val_r2:.4f}')
print(f'Test MSE: {mse_rf:.4f}')
print(f'Test RMSE: {rmse_rf:.4f}')
print(f'Test R²: {r2_rf:.4f}')

Cross-Validation MSE: 1089.6782 ± 287.1400
Cross-Validation R²: 0.9599 ± 0.0123
Test MSE: 1100.0180
Test RMSE: 33.1665
Test R²: 0.9648


In [None]:
import matplotlib.pyplot as plt
y_train_predicted = model_cross_validated.predict(X_train_scaled)

plt.scatter(y_train, y_train_predicted, color='green', alpha=0.4, edgecolors='g', label='Training Data')
plt.plot(y_train_predicted, y_train_predicted, linestyle='--', color='b', label='Training Prediction Trend')
plt.legend()

In [None]:
plt.scatter(y_test, y_pred_cross_validated, color='r', alpha=0.2, edgecolors='b', label='test input')
plt.plot(y_test, y_pred_cross_validated, linestyle='--', color='b', alpha=0.4, label='Test Prediction Trend')
plt.legend()
