In [None]:
#PART 1 - LINEAR REGRESSION MODEL
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Load the weather observations from the CSV file.
data = pd.read_csv('/new_weather_data.csv')

# Selecting features: numeric columns are standardised, categorical columns are one-hot encoded.
numeric_features = ['Temperature..C.', 'Humidity', 'Wind.Speed..km.h.', 'Wind.Bearing..degrees.', 'Visibility..km.', 'Pressure..millibars.']
categorical_features = ['Summary', 'Precip.Type']
features = numeric_features + categorical_features
X = data[features]
y = data['Apparent.Temperature..C.']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Split BEFORE fitting the preprocessor: the scaler statistics and the encoder categories
# must be learned from the training rows only, otherwise the test rows leak into them.
# The same random_state keeps the train/test membership identical to a post-transform split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Evaluation on the held-out 20 %
y_pred_lr = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

print(f'Linear Regression - R2: {lr_r2}, MSE: {lr_mse}, RMSE: {lr_rmse}, MAE: {lr_mae}')

# Get feature importance
# A linear model has no `feature_importances_`; its fitted coefficients are reported instead.
# Importance is the absolute coefficient, so strong negative effects rank as high as positive ones.
coefficients = np.ravel(lr_model.coef_)
feature_importance = np.abs(coefficients)

# Column order of the transformed matrix follows the transformer list: 'num' first, then 'cat'.
encoded_cat_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(input_features=categorical_features)
all_feature_names = numeric_features + list(encoded_cat_columns)

feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Coefficient': coefficients,
    'Importance': feature_importance,
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance_df)

In [None]:
#PART 2 - LINEAR REGRESSION MODEL INTERPRETATION
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.model_selection import learning_curve

# Part 2 b - RMSE expressed relative to the mean of the held-out target values
y_avg = np.mean(y_test)
rmse_percentage = lr_rmse / y_avg * 100
print(f"RMSE as a percentage of the target variable's average: {rmse_percentage}%")

# Part 2 c - actual vs predicted scatter with a first-degree least-squares trend line
actual_grid = np.unique(y_test)
trend_line = np.poly1d(np.polyfit(y_test, y_pred_lr, 1))

ax = plt.gca()
ax.scatter(y_test, y_pred_lr, alpha=0.5)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Values')
ax.plot(actual_grid, trend_line(actual_grid), color='red')  # Linear fit
plt.show()

# Part 2 d - Pearson correlation between actual and predicted values (p-value discarded)
correlation = pearsonr(y_test, y_pred_lr)[0]
print(f"Pearson's correlation: {correlation}")

#part 2 e - Function to plot learning curves
def plot_learning_curves(model, X, y):
    """Plot training and cross-validation MSE against the number of training examples.

    Scores come from `learning_curve` with 3-fold CV on ten training sizes
    (10 % to 100 %). The scorer returns negative MSE, so the means are negated
    before plotting; the shaded bands are one standard deviation across folds.
    """
    sizes, train_neg_mse, val_neg_mse = learning_curve(
        model,
        X,
        y,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=3,
        scoring='neg_mean_squared_error',
    )

    # (mean MSE, std across folds, colour, legend label) for each curve
    curves = [
        (-fold_scores.mean(axis=1), fold_scores.std(axis=1), colour, label)
        for fold_scores, colour, label in (
            (train_neg_mse, "r", "Training score"),
            (val_neg_mse, "g", "Cross-validation score"),
        )
    ]

    # Bands first, then lines, matching the original drawing order.
    for mean_mse, std_mse, colour, _ in curves:
        plt.fill_between(sizes, mean_mse - std_mse, mean_mse + std_mse, color=colour, alpha=0.1)
    for mean_mse, _, colour, label in curves:
        plt.plot(sizes, mean_mse, 'o-', color=colour, label=label)

    plt.title('Learning curves')
    plt.xlabel('Training examples')
    plt.ylabel('MSE')
    plt.legend(loc="best")
    plt.grid()
    plt.show()

# Learning curves for the linear regression, computed on the training split only.
plot_learning_curves(model=lr_model, X=X_train, y=y_train)


In [None]:
#PART 1 - NEURAL NETWORKS MODEL
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Names this cell needs but its import list does not provide.
from sklearn.metrics import r2_score
from tensorflow.keras.optimizers import Adam

# Load the weather observations from the CSV file.
data = pd.read_csv('/new_weather_data.csv')

numeric_features = ['Temperature..C.', 'Humidity', 'Wind.Speed..km.h.', 'Wind.Bearing..degrees.', 'Visibility..km.', 'Pressure..millibars.']
categorical_features = ['Summary', 'Precip.Type']
features = numeric_features + categorical_features
X = data[features]
y = data['Apparent.Temperature..C.']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# sparse_threshold=0 forces a dense NumPy result regardless of how many one-hot columns
# there are, so the Keras model below never receives a SciPy sparse matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    sparse_threshold=0)

# Split BEFORE fitting the preprocessor so scaler statistics and encoder categories are
# learned from the training rows only (no leakage from the test rows).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Hyperparameters
learning_rate = 0.001
n_epochs = 100
batch_size = 16

# Neural Network Model
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(1)
])

# Pass the optimizer explicitly so the `learning_rate` reported below is the one actually used.
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error', metrics=['mae'])
history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

# Evaluation on the held-out 20 %
y_pred = model.predict(X_test)
nn_mse = mean_squared_error(y_test, y_pred)
nn_rmse = np.sqrt(nn_mse)
nn_mae = mean_absolute_error(y_test, y_pred)
nn_r2 = r2_score(y_test, y_pred)
print(f'Neural Network - MSE: {nn_mse}, RMSE: {nn_rmse}, MAE: {nn_mae}, R^2: {nn_r2}, Learning Rate: {learning_rate}, Epochs: {n_epochs}, Batch Size: {batch_size}')

# Get feature importance
# The Keras model has no `feature_importances_`, so permutation importance is used:
# shuffle one input column at a time and record how much the test MSE increases.
X_eval = X_test.toarray() if hasattr(X_test, 'toarray') else np.array(X_test, dtype=float)
rng = np.random.default_rng(42)  # fixed seed so the shuffles are repeatable

baseline_mse = mean_squared_error(y_test, model.predict(X_eval, verbose=0))
feature_importance = np.empty(X_eval.shape[1])
for col in range(X_eval.shape[1]):
    original_column = X_eval[:, col].copy()
    X_eval[:, col] = rng.permutation(original_column)
    permuted_mse = mean_squared_error(y_test, model.predict(X_eval, verbose=0))
    feature_importance[col] = permuted_mse - baseline_mse
    X_eval[:, col] = original_column  # restore before permuting the next column

# Column order of the transformed matrix follows the transformer list: 'num' first, then 'cat'.
encoded_cat_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(input_features=categorical_features)
all_feature_names = numeric_features + list(encoded_cat_columns)

feature_importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance_df)

In [None]:
# PART 2 - NEURAL NETWORK MODEL INTERPRETATION
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Part b - RMSE expressed relative to the mean of the held-out target values
y_avg = np.mean(y_test)
rmse_percentage = nn_rmse / y_avg * 100
print(f"RMSE as a percentage of the target variable's average: {rmse_percentage:.2f}%")

# y_pred is flattened to 1-D once and reused for the plot and the correlation.
nn_predictions = y_pred.flatten()

# Part c - actual vs predicted scatter with a first-degree least-squares trend line
actual_grid = np.unique(y_test)
trend_line = np.poly1d(np.polyfit(y_test, nn_predictions, 1))

ax = plt.gca()
ax.scatter(y_test, nn_predictions, alpha=0.5)
ax.plot(actual_grid, trend_line(actual_grid), color='red')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Values')
plt.show()

# Part d - Pearson correlation between actual and predicted values (p-value discarded)
correlation = pearsonr(y_test, nn_predictions)[0]
print(f"Pearson's correlation: {correlation:.2f}")

# Part e - Plot the learning curves using the history from model training
def plot_learning_curves(history):
    """Plot per-epoch training and validation loss from a Keras fit history.

    `history.history` must contain the keys 'loss' and 'val_loss'.
    """
    plt.figure(figsize=(12, 6))
    for key, label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
        plt.plot(history.history[key], label=label)
    plt.title('Learning Curves')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Loss curves recorded while fitting the neural network.
plot_learning_curves(history=history)

In [None]:
#PART 1 - DECISION TREE REGRESSION MODEL
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Load the data and choose the predictor columns.
data = pd.read_csv('/new_weather_data.csv')

categorical_features = ['Summary', 'Precip.Type']  # Assuming these are the only categorical features
features_to_use = [
    'Temperature..C.',
    'Humidity',
    'Wind.Speed..km.h.',
    'Wind.Bearing..degrees.',
    'Visibility..km.',
    'Pressure..millibars.',
]

y = data['Apparent.Temperature..C.']
X = data[features_to_use + categorical_features]

# One-hot encode the categorical columns; all remaining columns pass through unchanged.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)
X_processed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# Grid of tree hyperparameters, searched with 3-fold cross-validation on the training split.
param_grid = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
dt = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best estimator and the parameters that produced it
best_dt = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluating the model on the held-out split
y_pred_dt = best_dt.predict(X_test)
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_rmse = np.sqrt(dt_mse)
dt_mae = mean_absolute_error(y_test, y_pred_dt)
dt_r2 = r2_score(y_test, y_pred_dt)

# Refit on every row (train + test). The metrics above were computed before this refit,
# so they describe the train-only model, not the one left in `best_dt` afterwards.
best_dt.fit(X_processed, y)

print(f'Decision Tree - R2: {dt_r2}, RMSE: {dt_rmse}, MSE: {dt_mse}, MAE: {dt_mae}')
print(f'Best Hyperparameters: {best_params}')

# Get feature importance
# Use the decision tree fitted in this cell (not the gradient boosting model of a later cell).
feature_importance = best_dt.feature_importances_

# The ColumnTransformer emits the one-hot encoded columns first and appends the
# passthrough (remainder) columns on the right, so the names must follow that order.
encoded_cat_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(input_features=categorical_features)
all_feature_names = list(encoded_cat_columns) + features_to_use

feature_importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance_df)

# Function to recursively traverse the tree and extract rules
def tree_to_rules(tree, feature_names):
    """Print the decision rules of a fitted tree as nested If/Else statements.

    Args:
        tree: object exposing a `tree_` attribute with the parallel arrays
            `feature`, `threshold`, `children_left`, `children_right` and `value`.
        feature_names: sequence mapping a feature index to a printable name;
            must be in the same column order the tree was trained on.

    Each level of the tree is indented by four spaces so the nesting is readable.
    """
    tree_ = tree.tree_

    def recurse(node, depth):
        indent = "    " * depth
        left = tree_.children_left[node]
        right = tree_.children_right[node]
        # A leaf has identical left/right child ids; comparing them avoids relying on
        # the private `_tree.TREE_UNDEFINED` constant, which this notebook never imports.
        if left != right:
            name = feature_names[tree_.feature[node]]
            threshold = tree_.threshold[node]
            print(f"{indent}If {name} <= {threshold}:")
            recurse(left, depth + 1)

            print(f"{indent}Else (If {name} > {threshold}):")
            recurse(right, depth + 1)
        else:
            print(f"{indent}Predict {tree_.value[node]}")

    recurse(0, 0)

# Print the rules of the tuned decision tree using the transformed-column names.
tree_to_rules(tree=best_dt, feature_names=all_feature_names)

In [None]:
#PART 2 - DECISION TREE REGRESSION MODEL INTERPRETATION
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Part b - RMSE expressed relative to the mean of the held-out target values
y_avg_dt = np.mean(y_test)
rmse_percentage_dt = dt_rmse / y_avg_dt * 100
print(f"RMSE as a percentage of the target variable's average: {rmse_percentage_dt:.2f}%")

# Part c - actual vs predicted scatter for the decision tree, with a first-degree trend line
actual_grid = np.unique(y_test)
trend_line = np.poly1d(np.polyfit(y_test, y_pred_dt, 1))

ax = plt.gca()
ax.scatter(y_test, y_pred_dt, alpha=0.5)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Values - Decision Tree')
ax.plot(actual_grid, trend_line(actual_grid), color='red')
plt.show()

# Part d - Pearson correlation for the decision tree (p-value discarded)
correlation_dt = pearsonr(y_test, y_pred_dt)[0]
print(f"Pearson's correlation for Decision Tree: {correlation_dt:.2f}")

# Part e - Instead of a learning curve (error vs. training-set size), this cell plots a
# model-complexity curve (error vs. tree depth). Decision trees do not support partial fitting,
# so a fresh tree is trained from scratch for every depth.
# The 'max_depth' of the tree is varied and the training error and held-out (X_test) error are plotted.

# Function to calculate error metrics for varying model complexities (max_depth)
def model_complexity_curve(X_train, y_train, X_test, y_test, max_depths, random_state=42):
    """Fit one decision tree per depth and collect train/test mean squared errors.

    Args:
        X_train, y_train: data each tree is fitted on.
        X_test, y_test: held-out data each tree is scored on.
        max_depths: iterable of `max_depth` values to try.
        random_state: seed passed to every tree so repeated runs give the same
            curve; defaults to 42, the seed used for the tuned tree in this notebook.

    Returns:
        (train_errors, test_errors): two lists of MSE values, ordered like `max_depths`.
    """
    train_errors = []
    test_errors = []
    for max_depth in max_depths:
        model = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
        model.fit(X_train, y_train)
        train_errors.append(mean_squared_error(y_train, model.predict(X_train)))
        test_errors.append(mean_squared_error(y_test, model.predict(X_test)))

    return train_errors, test_errors

# Depths 1..20 inclusive
max_depths = np.arange(1, 21)
train_errors, test_errors = model_complexity_curve(
    X_train, y_train, X_test, y_test, max_depths
)

# One line per error series, in the same order as the original plot.
ax = plt.gca()
for errors, label in ((train_errors, 'Training Error'), (test_errors, 'Validation Error')):
    ax.plot(max_depths, errors, label=label)
ax.set_xlabel('Max Depth of Decision Tree')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Model Complexity vs Error')
ax.legend()
plt.show()


In [None]:
#PART 1 - GRADIENT BOOSTING REGRESSOR MODEL
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Load the data and choose the predictor columns.
data = pd.read_csv('/new_weather_data.csv')

categorical_features = ['Summary', 'Precip.Type']
features_to_use = [
    'Temperature..C.',
    'Humidity',
    'Wind.Speed..km.h.',
    'Wind.Bearing..degrees.',
    'Visibility..km.',
    'Pressure..millibars.',
]

y = data['Apparent.Temperature..C.']
X = data[features_to_use + categorical_features]

# One-hot encode the categorical columns; all remaining columns pass through unchanged.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)
X_processed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# Candidate hyperparameters, searched with 3-fold cross-validation on the training split.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1],
)
gbr = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best hyperparameters and the estimator refitted with them
best_params = grid_search.best_params_
best_gbr = grid_search.best_estimator_

# Evaluating the model on the held-out split
y_pred = best_gbr.predict(X_test)
gbr_mse = mean_squared_error(y_test, y_pred)
gbr_rmse = np.sqrt(gbr_mse)
gbr_mae = mean_absolute_error(y_test, y_pred)
gbr_r2 = r2_score(y_test, y_pred)

print(f'Best Hyperparameters: {best_params}')
print(f'Gradient Boosted Regressor - R2: {gbr_r2}, RMSE: {gbr_rmse}, MSE: {gbr_mse}, MAE: {gbr_mae}')

# Get feature importance
feature_importance = best_gbr.feature_importances_

# The ColumnTransformer emits the one-hot encoded columns first and appends the
# passthrough (remainder) columns on the right, so the names must follow that order.
encoded_cat_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(input_features=categorical_features)
all_feature_names = list(encoded_cat_columns) + features_to_use

feature_importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance_df)


In [None]:
#PART 2 - Gradient Boosting Regressor Model Interpretation
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.model_selection import learning_curve

# Part c - actual vs predicted scatter for the gradient boosting regressor
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Values - Gradient Boosting Regressor')

# First-degree least-squares fit: m is the slope, b the intercept.
fit_coefficients = np.polyfit(y_test, y_pred, 1)
m, b = fit_coefficients
ax.plot(y_test, m * y_test + b, color='red')
plt.show()

# Part d - Pearson correlation between actual and predicted values (p-value discarded)
correlation_gbr = pearsonr(y_test, y_pred)[0]
print(f"Pearson's correlation for Gradient Boosting Regressor: {correlation_gbr:.2f}")

# Part e - Learning curves
def plot_learning_curves(estimator, X, y):
    """Plot training and cross-validation MSE against the number of training examples.

    Scores come from `learning_curve` with 3-fold CV on ten training sizes
    (10 % to 100 %), run in parallel (`n_jobs=-1`). The scorer returns negative
    MSE, so the means are negated and the plotted values are plain MSE; the
    shaded bands are one standard deviation across folds.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

    # Negate the means to turn neg-MSE back into MSE; the spread is sign-independent.
    train_scores_mean = -np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = -np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.title('Learning Curves')
    plt.xlabel('Training examples')
    # The curves show positive MSE (means were negated above), so label the axis accordingly.
    plt.ylabel('Mean Squared Error')
    plt.legend(loc="best")
    plt.grid()
    plt.show()

# Learning curves for the tuned gradient boosting regressor, computed on the training split only.
plot_learning_curves(estimator=best_gbr, X=X_train, y=y_train)