In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from tabulate import tabulate
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

West_Bengal_Kolkata = '/kaggle/input/historical-weather-data-for-indian-cities/hyderabad.csv'
Karnatka='/kaggle/input/historical-weather-data-for-indian-cities/bengaluru.csv'
Rajasthan='/kaggle/input/historical-weather-data-for-indian-cities/jaipur.csv'
Maharashtra='/kaggle/input/historical-weather-data-for-indian-cities/pune.csv'
Uttar_Pardesh='/kaggle/input/historical-weather-data-for-indian-cities/kanpur.csv'


df = pd.read_csv(West_Bengal_Kolkata)



In [2]:
pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [3]:
df.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,moonrise,moonset,...,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph
0,2009-01-01 00:00:00,28,15,0.0,8.7,6,1,31,09:57 AM,09:58 PM,...,21,9,0,83,0.0,1013,16,10,150,6
1,2009-01-01 01:00:00,28,15,0.0,8.7,6,1,31,09:57 AM,09:58 PM,...,20,9,0,85,0.0,1013,16,10,148,5
2,2009-01-01 02:00:00,28,15,0.0,8.7,6,1,31,09:57 AM,09:58 PM,...,20,8,0,86,0.0,1013,15,10,147,5
3,2009-01-01 03:00:00,28,15,0.0,8.7,6,1,31,09:57 AM,09:58 PM,...,19,8,0,88,0.0,1013,15,10,145,5
4,2009-01-01 04:00:00,28,15,0.0,8.7,6,1,31,09:57 AM,09:58 PM,...,21,7,0,80,0.0,1014,16,10,148,5


In [4]:
target_variable = 'maxtempC'


X = df.drop(columns=[target_variable]).values 
y = df[target_variable].values  

date_column_index = np.where(X[0] == 'date_time')[0]
if date_column_index.size > 0:
    X = np.delete(X, date_column_index, axis=1)

non_numeric_cols = [col for col in range(X.shape[1]) if not np.issubdtype(X.dtype, np.number)]
X[:, non_numeric_cols] = np.apply_along_axis(pd.to_numeric, 0, X[:, non_numeric_cols], errors='coerce')


imputer = SimpleImputer(strategy='mean') 
X = imputer.fit_transform(X)


numerical_features = [col for col in range(X.shape[1]) if np.issubdtype(X.dtype, np.number)]
scaler = StandardScaler()
X[:, numerical_features] = scaler.fit_transform(X[:, numerical_features])

model = RandomForestRegressor() 
rfe = RFE(model, n_features_to_select=10)  
X_rfe = rfe.fit_transform(X, y)


X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, random_state=42)

In [5]:
X_rfe.shape

(96432, 10)

In [6]:
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2), 
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1) 


model.compile(optimizer='adam', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


epochs = 100


epoch_bar = tqdm(total=epochs, desc='Epochs', unit='epoch', position=0)

for epoch in range(epochs):
    history = model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.1, callbacks=[early_stopping], verbose=0)

   
    mse = model.evaluate(X_test, y_test, verbose=0)

    epoch_bar.set_postfix_str(f'Epoch {epoch + 1}/{epochs} - loss: {history.history["loss"][0]:.4f} - val_loss: {history.history["val_loss"][0]:.4f} - Test MSE: {mse:.4f}')
    epoch_bar.update(1)

epoch_bar.close()

predictions = model.predict(X_test)

results = []
for i in range(10):
    actual_value = y_test[i]
    predicted_value = predictions[i][0]
    results.append([f'Sample {i + 1}', f'{actual_value:.2f}', f'{predicted_value:.2f}'])


table = tabulate(results, headers=['Sample', 'Actual', 'Predicted'], tablefmt='pretty')

print(table)

backpropagation_mse = model.evaluate(X_test, y_test, verbose=0)
print(f'backpropagation Test MSE: {backpropagation_mse:.4f}')

backpropagation_mae = mean_absolute_error(y_test, predictions)
print(f'backpropagation Test MAE: {backpropagation_mae:.4f}')


backpropagation_rmse = math.sqrt(backpropagation_mse)
print(f'backpropagation Test RMSE: {backpropagation_rmse:.4f}')


Epochs: 100%|██████████| 100/100 [11:12<00:00,  6.72s/epoch, Epoch 100/100 - loss: 1.3816 - val_loss: 1.5199 - Test MSE: 1.4795]

 39/603 [>.............................] - ETA: 0s 




+-----------+--------+-----------+
|  Sample   | Actual | Predicted |
+-----------+--------+-----------+
| Sample 1  | 28.00  |   29.13   |
| Sample 2  | 23.00  |   25.98   |
| Sample 3  | 36.00  |   36.27   |
| Sample 4  | 32.00  |   31.31   |
| Sample 5  | 32.00  |   31.88   |
| Sample 6  | 29.00  |   28.63   |
| Sample 7  | 33.00  |   32.23   |
| Sample 8  | 29.00  |   27.65   |
| Sample 9  | 35.00  |   35.97   |
| Sample 10 | 28.00  |   28.47   |
+-----------+--------+-----------+
backpropagation Test MSE: 1.4795
backpropagation Test MAE: 0.9168
backpropagation Test RMSE: 1.2164


In [7]:
model = keras.Sequential([
    layers.LSTM(128, activation='relu', input_shape=(X_train.shape[1], 1)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])


model.compile(optimizer='adam', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


epochs = 100

epoch_bar = tqdm(total=epochs, desc='Epochs', unit='epoch', position=0)

for epoch in range(epochs):
    history = model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.1, callbacks=[early_stopping], verbose=0)

    mse = model.evaluate(X_test, y_test, verbose=0)

    epoch_bar.set_postfix_str(f'Epoch {epoch + 1}/{epochs} - loss: {history.history["loss"][0]:.4f} - val_loss: {history.history["val_loss"][0]:.4f} - Test MSE: {mse:.4f}')
    epoch_bar.update(1)


epoch_bar.close()

predictions = model.predict(X_test)

results = []
for i in range(10):
    actual_value = y_test[i]
    predicted_value = predictions[i][0]
    results.append([f'Sample {i + 1}', f'{actual_value:.2f}', f'{predicted_value:.2f}'])

table = tabulate(results, headers=['Sample', 'Actual', 'Predicted'], tablefmt='pretty')

print(table)


LSTM_mse = model.evaluate(X_test, y_test, verbose=0)
print(f'LSTM Test MSE: {LSTM_mse:.4f}')

LSTM_mae = mean_absolute_error(y_test, predictions)
print(f'LSTM Test MAE: {LSTM_mae:.4f}')


LSTM_rmse = math.sqrt(LSTM_mse)
print(f'LSTM Test RMSE: {LSTM_rmse:.4f}')

Epochs: 100%|██████████| 100/100 [56:34<00:00, 33.95s/epoch, Epoch 100/100 - loss: 0.5711 - val_loss: 0.8729 - Test MSE: 0.8751]


+-----------+--------+-----------+
|  Sample   | Actual | Predicted |
+-----------+--------+-----------+
| Sample 1  | 28.00  |   28.93   |
| Sample 2  | 23.00  |   23.13   |
| Sample 3  | 36.00  |   35.26   |
| Sample 4  | 32.00  |   31.83   |
| Sample 5  | 32.00  |   32.68   |
| Sample 6  | 29.00  |   28.69   |
| Sample 7  | 33.00  |   32.22   |
| Sample 8  | 29.00  |   28.46   |
| Sample 9  | 35.00  |   35.46   |
| Sample 10 | 28.00  |   28.03   |
+-----------+--------+-----------+
LSTM Test MSE: 0.8751
LSTM Test MAE: 0.6938
LSTM Test RMSE: 0.9355


In [11]:


tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)

tree_predictions = tree_model.predict(X_test)

Decision_tree_mse = mean_squared_error(y_test, tree_predictions)

def assign_zone(temperature):
    red_zone_min = 35  
    red_zone_max = 40  
    orange_zone_min = 25  # Minimum temperature for orange zone
    orange_zone_max = 30  # Maximum temperature for orange zone
    green_zone_min = 20
    green_zone_max = 25
    yellow_zone_min = 20  # Minimum temperature for yellow zone
    yellow_zone_max = 30  # Maximum temperature for yellow zone
    
    if red_zone_min <= temperature <= red_zone_max:
        return 'Red'
    elif orange_zone_min <= temperature <= orange_zone_max:
        return 'Orange'
    elif green_zone_min <= temperature <= green_zone_max:
        return 'Green'
    elif yellow_zone_min <= temperature <= yellow_zone_max:
        return 'Yellow'
    else:
        return 'Unknown'

# Create a list to store results with zones
tree_results_with_zones = []
for i in range(10):
    actual_value = y_test[i]
    predicted_value = tree_predictions[i]
    zone = assign_zone(predicted_value)  # Assign zone based on predicted temperature
    tree_results_with_zones.append([f'Sample {i + 1}', f'{actual_value:.2f}', f'{predicted_value:.2f}', zone])

# Generate a table with zones included
tree_table_with_zones = tabulate(tree_results_with_zones, headers=['Sample', 'Actual', 'Predicted', 'Zone'], tablefmt='pretty')

# Print the table with zones
print("Decision Tree Results with Zones:")
print(tree_table_with_zones)

# Calculate and display other metrics
print(f'Decision Tree Test MSE: {Decision_tree_mse:.4f}')



Decision Tree Results with Zones:
+-----------+--------+-----------+---------+
|  Sample   | Actual | Predicted |  Zone   |
+-----------+--------+-----------+---------+
| Sample 1  | 28.00  |   28.00   | Orange  |
| Sample 2  | 23.00  |   23.00   |  Green  |
| Sample 3  | 36.00  |   36.00   |   Red   |
| Sample 4  | 32.00  |   32.00   | Unknown |
| Sample 5  | 32.00  |   32.00   | Unknown |
| Sample 6  | 29.00  |   29.00   | Orange  |
| Sample 7  | 33.00  |   33.00   | Unknown |
| Sample 8  | 29.00  |   28.00   | Orange  |
| Sample 9  | 35.00  |   36.00   |   Red   |
| Sample 10 | 28.00  |   28.00   | Orange  |
+-----------+--------+-----------+---------+
Decision Tree Test MSE: 0.9329


In [16]:
# Standardize the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the SVM regression model
svm_model = SVR(kernel='rbf')  # You can try different kernels such as 'linear', 'poly', or 'sigmoid'
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
svm_predictions = svm_model.predict(X_test_scaled)

# Calculate MSE for the SVM model
svm_mse = mean_squared_error(y_test, svm_predictions)
def assign_zone(temperature):
    red_zone_min = 35  
    red_zone_max = 40  
    orange_zone_min = 25  # Minimum temperature for orange zone
    orange_zone_max = 30  # Maximum temperature for orange zone
    green_zone_min = 20
    green_zone_max = 25
    yellow_zone_min = 20  # Minimum temperature for yellow zone
    yellow_zone_max = 30  # Maximum temperature for yellow zone
    
    if red_zone_min <= temperature <= red_zone_max:
        return 'Red'
    elif orange_zone_min <= temperature <= orange_zone_max:
        return 'Orange'
    elif green_zone_min <= temperature <= green_zone_max:
        return 'Green'
    elif yellow_zone_min <= temperature <= yellow_zone_max:
        return 'Yellow'
    else:
        return 'Unknown'
# Store results in a list
svm_results_with_zones = []
for i in range(10):
    actual_value = y_test[i]
    zone = assign_zone(predicted_value)
    predicted_value = svm_predictions[i]
    svm_results_with_zones.append([f'Sample {i + 1}', f'{actual_value:.2f}', f'{predicted_value:.2f}',zone])

# Create a table for SVM results
svm_table_with_zones = tabulate(svm_results_with_zones, headers=['Sample', 'Actual', 'Predicted','Zone'], tablefmt='pretty')

# Print the SVM results table
print("SVM Results with zones:")
print(svm_table_with_zones)

# Print the MSE for the SVM
print(f'SVM Test MSE: {svm_mse:.4f}')

# Calculate MAE and Print the MAE
svm_mae = mean_absolute_error(y_test, svm_predictions)
print(f'svm Test MAE: {svm_mae:.4f}')

# Calculate and Print the RMSE
svm_rmse = math.sqrt(svm_mse)
print(f'svm Test RMSE: {svm_rmse:.4f}')


SVM Results with zones:
+-----------+--------+-----------+---------+
|  Sample   | Actual | Predicted |  Zone   |
+-----------+--------+-----------+---------+
| Sample 1  | 28.00  |   28.83   | Orange  |
| Sample 2  | 23.00  |   24.81   | Orange  |
| Sample 3  | 36.00  |   37.39   |  Green  |
| Sample 4  | 32.00  |   30.61   |   Red   |
| Sample 5  | 32.00  |   32.11   | Unknown |
| Sample 6  | 29.00  |   28.45   | Unknown |
| Sample 7  | 33.00  |   33.07   | Orange  |
| Sample 8  | 29.00  |   26.85   | Unknown |
| Sample 9  | 35.00  |   36.82   | Orange  |
| Sample 10 | 28.00  |   27.66   |   Red   |
+-----------+--------+-----------+---------+
SVM Test MSE: 1.6350
svm Test MAE: 0.9488
svm Test RMSE: 1.2787


In [19]:
# Create and train the XGBoost regression model
xgb_model = XGBRegressor(objective='reg:squarederror')  # 'reg:squarederror' for regression
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
xgb_predictions = xgb_model.predict(X_test)

# Calculate MSE for the XGBoost model
xgb_mse = mean_squared_error(y_test, xgb_predictions)
def assign_zone(temperature):
    red_zone_min = 35  
    red_zone_max = 40  
    orange_zone_min = 25  # Minimum temperature for orange zone
    orange_zone_max = 30  # Maximum temperature for orange zone
    green_zone_min = 20
    green_zone_max = 25
    yellow_zone_min = 20  # Minimum temperature for yellow zone
    yellow_zone_max = 30  # Maximum temperature for yellow zone
    
    if red_zone_min <= temperature <= red_zone_max:
        return 'Red'
    elif orange_zone_min <= temperature <= orange_zone_max:
        return 'Orange'
    elif green_zone_min <= temperature <= green_zone_max:
        return 'Green'
    elif yellow_zone_min <= temperature <= yellow_zone_max:
        return 'Yellow'
    else:
        return 'Unknown'
# Store results in a list
xgb_results_with_zones = []
for i in range(10):
    actual_value = y_test[i]
    predicted_value = xgb_predictions[i]
    xgb_results_with_zones.append([f'Sample {i + 1}', f'{actual_value:.2f}', f'{predicted_value:.2f}',zone])

# Create a table for XGBoost results
xgb_table_with_zones = tabulate(xgb_results_with_zones, headers=['Sample', 'Actual', 'Predicted','Zones'], tablefmt='pretty')

# Print the XGBoost results table
print("XGBoost Results:")
print(xgb_table_with_zones)

# Print the MSE for XGBoost
print(f'XGBoost Test MSE: {xgb_mse:.4f}')

# Calculate MAE and Print the MAE
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
print(f'XGBoost Test MAE: {xgb_mae:.4f}')

# Calculate and Print the RMSE
xgb_rmse = math.sqrt(xgb_mse)
print(f'XGBoost Test RMSE: {xgb_rmse:.4f}')


XGBoost Results:
+-----------+--------+-----------+-------+
|  Sample   | Actual | Predicted | Zones |
+-----------+--------+-----------+-------+
| Sample 1  | 28.00  |   28.29   |  Red  |
| Sample 2  | 23.00  |   24.33   |  Red  |
| Sample 3  | 36.00  |   37.36   |  Red  |
| Sample 4  | 32.00  |   31.07   |  Red  |
| Sample 5  | 32.00  |   32.01   |  Red  |
| Sample 6  | 29.00  |   28.52   |  Red  |
| Sample 7  | 33.00  |   32.68   |  Red  |
| Sample 8  | 29.00  |   29.16   |  Red  |
| Sample 9  | 35.00  |   36.22   |  Red  |
| Sample 10 | 28.00  |   27.55   |  Red  |
+-----------+--------+-----------+-------+
XGBoost Test MSE: 1.0750
XGBoost Test MAE: 0.7836
XGBoost Test RMSE: 1.0368


In [21]:
# Create and train the CatBoost regression model
catboost_model = CatBoostRegressor(iterations=500, depth=10, learning_rate=0.05, loss_function='RMSE')
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=20, verbose=50)

# Make predictions on the test set
catboost_predictions = catboost_model.predict(X_test)

# Calculate MSE for the CatBoost model
catboost_mse = mean_squared_error(y_test, catboost_predictions)
def assign_zone(temperature):
    red_zone_min = 35  
    red_zone_max = 40  
    orange_zone_min = 25  # Minimum temperature for orange zone
    orange_zone_max = 30  # Maximum temperature for orange zone
    green_zone_min = 20
    green_zone_max = 25
    yellow_zone_min = 20  # Minimum temperature for yellow zone
    yellow_zone_max = 30  # Maximum temperature for yellow zone
    
    if red_zone_min <= temperature <= red_zone_max:
        return 'Red'
    elif orange_zone_min <= temperature <= orange_zone_max:
        return 'Orange'
    elif green_zone_min <= temperature <= green_zone_max:
        return 'Green'
    elif yellow_zone_min <= temperature <= yellow_zone_max:
        return 'Yellow'
    else:
        return 'Unknown'
# Store results in a list
catboost_results_with_zones = []
for i in range(10):
    actual_value = y_test[i]
    predicted_value = catboost_predictions[i]
    catboost_results_with_zones.append([f'Sample {i + 1}', f'{actual_value:.2f}', f'{predicted_value:.2f}',zone])

# Create a table for CatBoost results
catboost_table_with_zones = tabulate(catboost_results_with_zones, headers=['Sample', 'Actual', 'Predicted','Zones'], tablefmt='pretty')

# Print the CatBoost results table
print("CatBoost Results:")
print(catboost_table_with_zones)

# Print the MSE for CatBoost
print(f'CatBoost Test MSE: {catboost_mse:.4f}')

# Calculate MAE and Print the MAE
catboost_mae = mean_absolute_error(y_test, catboost_predictions)
print(f'CatBoost Test MAE: {catboost_mae:.4f}')

# Calculate and Print the RMSE
catboost_rmse = math.sqrt(catboost_mse)
print(f'CatBoost Test RMSE: {catboost_rmse:.4f}')

0:	learn: 4.2755126	test: 4.2821550	best: 4.2821550 (0)	total: 22.7ms	remaining: 11.3s
50:	learn: 1.4530577	test: 1.4566424	best: 1.4566424 (50)	total: 1.1s	remaining: 9.72s
100:	learn: 1.3087988	test: 1.3159632	best: 1.3159632 (100)	total: 2.18s	remaining: 8.6s
150:	learn: 1.2450193	test: 1.2562964	best: 1.2562964 (150)	total: 3.25s	remaining: 7.51s
200:	learn: 1.1933513	test: 1.2086451	best: 1.2086451 (200)	total: 4.32s	remaining: 6.43s
250:	learn: 1.1501396	test: 1.1697984	best: 1.1697984 (250)	total: 5.39s	remaining: 5.34s
300:	learn: 1.1112564	test: 1.1360061	best: 1.1360061 (300)	total: 6.69s	remaining: 4.42s
350:	learn: 1.0776650	test: 1.1074343	best: 1.1074343 (350)	total: 7.84s	remaining: 3.33s
400:	learn: 1.0470723	test: 1.0808461	best: 1.0808461 (400)	total: 8.97s	remaining: 2.21s
450:	learn: 1.0187890	test: 1.0567636	best: 1.0567636 (450)	total: 10s	remaining: 1.09s
499:	learn: 0.9931974	test: 1.0345271	best: 1.0345271 (499)	total: 11s	remaining: 0us

bestTest = 1.034527082

In [22]:
# Create and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Make predictions on the test set
linear_predictions = linear_model.predict(X_test)

# Calculate MSE for the Linear Regression model
linear_mse = mean_squared_error(y_test, linear_predictions)
def assign_zone(temperature):
    red_zone_min = 35  
    red_zone_max = 40  
    orange_zone_min = 25  # Minimum temperature for orange zone
    orange_zone_max = 30  # Maximum temperature for orange zone
    green_zone_min = 20
    green_zone_max = 25
    yellow_zone_min = 20  # Minimum temperature for yellow zone
    yellow_zone_max = 30  # Maximum temperature for yellow zone
    
    if red_zone_min <= temperature <= red_zone_max:
        return 'Red'
    elif orange_zone_min <= temperature <= orange_zone_max:
        return 'Orange'
    elif green_zone_min <= temperature <= green_zone_max:
        return 'Green'
    elif yellow_zone_min <= temperature <= yellow_zone_max:
        return 'Yellow'
    else:
        return 'Unknown'
# Store results in a list
linear_results_with_zones = []
for i in range(10):
    actual_value = y_test[i]
    predicted_value = linear_predictions[i]
    linear_results_with_zones.append([f'Sample {i + 1}', f'{actual_value:.2f}', f'{predicted_value:.2f}',zone])

# Create a table for Linear Regression results
linear_table_with_zones = tabulate(linear_results_with_zones, headers=['Sample', 'Actual', 'Predicted','Zones'], tablefmt='pretty')

# Print the Linear Regression results table
print("Linear Regression Results:")
print(linear_table_with_zones)

# Print the MSE for Linear Regression
print(f'Linear Regression Test MSE: {linear_mse:.4f}')

# Calculate MAE and Print the MAE
linear_mae = mean_absolute_error(y_test, linear_predictions)
print(f'Linear Regression Test MAE: {linear_mae:.4f}')

# Calculate and Print the RMSE
linear_rmse = math.sqrt(linear_mse)
print(f'Linear Regression Test RMSE: {linear_rmse:.4f}')


Linear Regression Results:
+-----------+--------+-----------+-------+
|  Sample   | Actual | Predicted | Zones |
+-----------+--------+-----------+-------+
| Sample 1  | 28.00  |   29.33   |  Red  |
| Sample 2  | 23.00  |   22.78   |  Red  |
| Sample 3  | 36.00  |   37.34   |  Red  |
| Sample 4  | 32.00  |   31.12   |  Red  |
| Sample 5  | 32.00  |   32.74   |  Red  |
| Sample 6  | 29.00  |   29.56   |  Red  |
| Sample 7  | 33.00  |   33.45   |  Red  |
| Sample 8  | 29.00  |   25.83   |  Red  |
| Sample 9  | 35.00  |   37.81   |  Red  |
| Sample 10 | 28.00  |   26.45   |  Red  |
+-----------+--------+-----------+-------+
Linear Regression Test MSE: 2.8482
Linear Regression Test MAE: 1.3280
Linear Regression Test RMSE: 1.6877


In [23]:
# Assuming X and y are your features and target variable as NumPy arrays
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Initialize and train your model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    #print(f'Mean Squared Error: {mse}')

# After the loop, you can choose to train the final model on the entire dataset for submission
final_model = RandomForestRegressor(n_estimators=100, random_state=42)
final_model.fit(X, y)

# Make predictions on new data (if applicable)
new_data_predictions = final_model.predict(X_train)

# Make predictions on the entire dataset using the final model
y_pred_final = final_model.predict(X)

# Assuming y and y_pred_final are NumPy arrays or pandas Series
actual_predicted_table = pd.DataFrame({
    'Actual_Value': y,
    'Predicted_Value': y_pred_final
})

# Display the table
print(actual_predicted_table)

# Calculate the Mean Squared Error
kfold_mse = mean_squared_error(y, y_pred_final)
print(f'Kfold Test MSE: {kfold_mse:.4f}')

# Calculate Mean Absolute Error
kfold_mae = mean_absolute_error(y, y_pred_final)
print(f'Kfold Test MAE: {kfold_mae:.4f}')

# Calculate and Print the RMSE
kfold_rmse = math.sqrt(kfold_mse)
print(f'Kfold Test RMSE: {kfold_rmse:.4f}')

       Actual_Value  Predicted_Value
0                28            27.97
1                28            28.00
2                28            28.01
3                28            28.02
4                28            28.03
...             ...              ...
96427            26            25.92
96428            26            25.94
96429            26            25.94
96430            26            25.92
96431            26            25.95

[96432 rows x 2 columns]
Kfold Test MSE: 0.0487
Kfold Test MAE: 0.1333
Kfold Test RMSE: 0.2207
