<a href="https://colab.research.google.com/github/marcusflygar1-hash/AI_Course_Submissions_real/blob/main/Excersise_2_MarcusFlygar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
# Read the bus-arrival CSV directly from the course repository and preview
# the first ten rows.
url = 'https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/master/Exercise_2_regression_model/Exercise2BusData.csv'
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=10)

In [None]:
# Summarize the raw bus data (columns, dtypes, non-null counts) before any columns are dropped.
df.info()

In [None]:
# Arrival_time and the stop/bus/line identifiers are not needed for predicting
# bus delay, so they are removed from the frame.
# Note: this rebinds `df`; re-running the cell without reloading the data
# raises KeyError because the columns are already gone.
unused_columns = ['Arrival_time', 'Stop_id', 'Bus_id', 'Line_id']
df = df.drop(columns=unused_columns)
df.head(n=10)

In [None]:
# (rows, columns) of the frame after the unused columns were dropped.
df.shape


In [None]:
# Re-inspect columns and dtypes now that the identifier columns have been removed.
df.info()

In [None]:
# Pairwise correlation between all remaining columns. Only the column for the
# target (Arrival_delay) is displayed, sorted from highest to lowest value.
corr_matrix = df.corr()
(
    corr_matrix
    .loc[:, 'Arrival_delay']
    .sort_values(ascending=False)
)

As we can see, the upstream stop delay is heavily correlated with the arrival delay; this is not true for the remaining independent variables.

In [None]:
# Target is Arrival_delay; every other remaining column is a feature.
y = df['Arrival_delay']
x = df.drop(columns='Arrival_delay')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

We set `test_size=0.2` so that the data is divided into 80% training data and 20% held-out data on which the model is tested.

In [None]:
# Baseline model: ordinary linear regression fitted on the training split,
# then used to predict arrival delay for the held-out test split.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = lin_reg.predict(X_test)

The model has now been trained on the training set and has produced predictions for the test set. Next we evaluate those predictions to see whether the model performs well.

In [None]:
# Evaluate the linear-regression predictions against the held-out targets.
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

# Metrics are computed in the same order as they are unpacked: MSE, MAE, RMSE, R2.
lin_mse, lin_mae, lin_rmse, lin_r2 = (
    score_fn(y_test, y_pred)
    for score_fn in (mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score)
)

print(
    f"Mean Squared Error: {lin_mse}",
    f"Mean Absolute Error: {lin_mae}",
    f"Root Mean Squared Error: {lin_rmse}",
    f"R2 Score: {lin_r2}",
    sep="\n",
)

In [None]:
# Actual vs. predicted arrival delay for the linear-regression model.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values")

# Dashed diagonal = perfect predictions (predicted == actual).
diag_lo, diag_hi = min(y_test), max(y_test)
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], linestyle='--', color='red', lw=2)

plt.show()

Now that we have fitted a linear regression on the training data and evaluated it on the test data, it is time to fit a regression model with the XGBoost package (gradient-boosted trees rather than a linear model).

# XGBoost BusDelay

In [None]:
# Wrap features and labels in DMatrix objects; the native xgb.train call used
# further down takes DMatrix inputs.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Booster parameters for the native XGBoost API. These are generic starting
# values and have not been tuned.
param_xg = dict(
    max_depth=3,                   # maximum depth of each tree
    eta=0.1,                       # learning rate
    objective='reg:squarederror',  # squared-error regression
    seed=42,                       # fixed seed for reproducibility
    nthread=4,                     # number of threads
    eval_metric='rmse',            # metric reported on the evaluation sets
)


In [None]:
# (DMatrix, name) pairs handed to xgb.train in the training cell; the names
# 'train' and 'eval' label the two datasets.
evallist = [(dtrain, 'train'), (dtest, 'eval')]

In [None]:
# Train the booster and save it to disk so it can be reloaded and evaluated later.
num_round = 100  # number of boosting rounds (raised from 10 to 100 for better results)

# `evals` is passed by keyword: current XGBoost declares it keyword-only and
# warns when it is supplied positionally.
bst = xgb.train(param_xg, dtrain, num_boost_round=num_round, evals=evallist)

# NOTE(review): newer XGBoost releases recommend a '.json' or '.ubj' file
# extension; the name is kept unchanged because the following cell loads this
# exact path.
bst.save_model('xgboost_model.model')


In [None]:
# Create an empty booster (4 threads) and restore the saved model into it.
bst = xgb.Booster(params={'nthread': 4})
bst.load_model(fname='xgboost_model.model')

In [None]:
# Predict arrival delay for the held-out rows with the reloaded booster.
# The test DMatrix is rebuilt from the features only, without labels.
# (A leftover `data = np.random.rand(5, 10)` sample array was removed here;
# it was never used.)
dtest = xgb.DMatrix(X_test)
y_pred_xgb = bst.predict(dtest)

In [None]:
# Evaluation metrics for the XGBoost predictions: MSE, MAE, RMSE and R2.
# Every metric compares the true test targets (y_test) with the XGBoost
# predictions. The MAE previously used y_pred (the linear-regression
# predictions) as the reference, which measured the disagreement between the
# two models instead of the XGBoost error.
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
xgb_rmse = root_mean_squared_error(y_test, y_pred_xgb)
xgb_r2 = r2_score(y_test, y_pred_xgb)
# Printing the results
print(f"Mean Squared Error: {xgb_mse}")
print(f"Mean Absolute Error: {xgb_mae}")
print(f"Root Mean Squared Error: {xgb_rmse}")
print(f"R2 Score: {xgb_r2}")

In [None]:
# Bar chart of per-feature importance for the trained booster, using
# plot_importance's default importance type.
xgb.plot_importance(booster=bst)
plt.show()

Below we plot the figure to see how we did with the XGBoost.

In [None]:
# NOTE: the plotting code is adapted from the course exercises.
# Actual vs. predicted arrival delay for the XGBoost model.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred_xgb, alpha=0.5)

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values")

# Dashed diagonal = perfect predictions (predicted == actual).
diag_lo, diag_hi = min(y_test), max(y_test)
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], linestyle='--', color='red', lw=2)

plt.show()

Let's try to improve on this model by using a grid search.

In [None]:
# Grid search over XGBoost hyper-parameters to try to improve on the booster
# trained above.
from xgboost import XGBRegressor

# Rebuild features/target and repeat the 80/20 split with the same seed.
y = df['Arrival_delay']
X = df.drop(columns='Arrival_delay')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate values: 3 options for each of 5 parameters = 243 combinations,
# each evaluated with 3-fold cross-validation.
param_grid = dict(
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    learning_rate=[0.01, 0.1, 0.3],
)

# Base estimator whose remaining settings stay fixed during the search.
xgb_model = XGBRegressor(n_estimators=200, objective='reg:squarederror', random_state=42)

# Run the search on the training split only; fit() returns the search object.
# No `scoring` is given, so the estimator's default score is used.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Report the best parameter combination and its mean cross-validated score.
print(f"The combination of parameters that provides the highest score: {grid_search.best_params_}")
print(f"Highest score: {grid_search.best_score_}")

After experimenting with the parameters in the grid search, I did not find any improvements.

# Bike-sharing.

In [None]:
# Read the bike-sharing CSV from the course repository and preview the first
# ten rows.
url_bike = 'https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/master/Exercise_2_regression_model/Exercise2BikeSharing.csv'
df_bike = pd.read_csv(filepath_or_buffer=url_bike)
df_bike.head(n=10)

In [None]:
# Summarize the raw bike-sharing data (columns, dtypes, non-null counts).
df_bike.info()

In [None]:
# Columns removed before modelling:
#   - dteday: the date column, dropped as an incompatible datatype
#   - casual / registered: cnt is the total of both, so keeping them would
#     give away the target
#   - instant: presumably a running record index -- TODO confirm
# Note: this rebinds `df_bike`; re-running the cell without reloading the data
# raises KeyError because the columns are already gone.
dropped_bike_columns = ['dteday', 'instant', 'casual', 'registered']
df_bike = df_bike.drop(columns=dropped_bike_columns)

# Correlation of every remaining column with cnt (total rental count,
# casual + registered), sorted from highest to lowest value.
corr_matrix_bike = df_bike.corr()
(
    corr_matrix_bike
    .loc[:, 'cnt']
    .sort_values(ascending=False)
)

In [None]:
# cnt is the target we want to predict, so it is taken out of the feature frame.
y_bike = df_bike['cnt']
x_bike = df_bike.drop(columns='cnt')

In [None]:
# 80/20 train/test split (a common convention) with a fixed seed so the split
# is reproducible.
X_train_bike, X_test_bike, y_train_bike, y_test_bike = train_test_split(
    x_bike, y_bike, test_size=0.2, random_state=42
)

# Linear-regression baseline: fit on the training split, predict the test split.
bike_reg = LinearRegression().fit(X_train_bike, y_train_bike)
y_pred_bike = bike_reg.predict(X_test_bike)

In [None]:
# Error metrics for the bike linear-regression baseline, computed in the
# order MSE, MAE, RMSE, R2.
bike_mse, bike_mae, bike_rmse, bike_r2 = (
    score_fn(y_test_bike, y_pred_bike)
    for score_fn in (mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score)
)

# Report the results.
print(
    f"Mean Squared Error: {bike_mse}",
    f"Mean Absolute Error: {bike_mae}",
    f"Root Mean Squared Error: {bike_rmse}",
    f"R2 Score: {bike_r2}",
    sep="\n",
)


In [None]:
# Actual vs. predicted rental counts for the linear-regression baseline.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test_bike, y_pred_bike, alpha=0.5)

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values")

# Dashed diagonal = perfect predictions (predicted == actual).
diag_lo, diag_hi = min(y_test_bike), max(y_test_bike)
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], linestyle='--', color='red', lw=2)

plt.show()

As we can see, these results are not very good. Therefore we will move on to more advanced models: XGBoost and an SVM.

In [None]:
# Define and train the XGBoost model for the bike data using the
# scikit-learn style estimator with its default hyper-parameters.
# The unused `X, y` aliases and the `dtrain`/`dtest` DMatrix objects were
# removed: nothing read them, and they silently replaced the bus-data objects
# of the same names.
vibe = xgb.XGBRegressor()
vibe.fit(X_train_bike, y_train_bike)

# Predict rental counts for the held-out rows, then persist the fitted model.
y_pred_bike_xgb = vibe.predict(X_test_bike)
vibe.save_model('xgboost_model_bike.model')

In [None]:
# Error metrics for the bike XGBoost model, computed in the order
# MSE, MAE, RMSE, R2.
xgb_bike_mse, xgb_bike_mae, xgb_bike_rmse, xgb_bike_r2 = (
    score_fn(y_test_bike, y_pred_bike_xgb)
    for score_fn in (mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score)
)

# Report the results.
print(
    f"XGBoost Mean Squared Error: {xgb_bike_mse}",
    f"XGBoost Mean Absolute Error: {xgb_bike_mae}",
    f"XGBoost Root Mean Squared Error: {xgb_bike_rmse}",
    f"XGBoost R2 Score: {xgb_bike_r2}",
    sep="\n",
)

In [None]:
# Bar chart of per-feature importance for the fitted bike XGBoost model,
# using plot_importance's default importance type.
xgb.plot_importance(booster=vibe)
plt.show()

In [None]:
# Actual vs. predicted rental counts for the XGBoost model.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test_bike, y_pred_bike_xgb, alpha=0.5)

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values")

# Dashed diagonal = perfect predictions (predicted == actual).
diag_lo, diag_hi = min(y_test_bike), max(y_test_bike)
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], linestyle='--', color='red', lw=2)

plt.show()

# Bike Sharing SVM Model



In [None]:
# Import the support-vector regressor used in the cells below.
# The earlier default SVR fitted here on the unscaled features was removed:
# it was never used for prediction or evaluation, so fitting it was wasted
# work. The evaluated SVM is trained on standardized features further down.
from sklearn.svm import SVR

In [None]:
# SVM models are sensitive to feature scale, so the features are standardized.
# The scaler is fitted on the training split only and then applied to both
# splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train_bike)
X_train_scaled_bike = scaler.transform(X_train_bike)
X_test_scaled_bike = scaler.transform(X_test_bike)

In [None]:
# Fit an RBF-kernel support-vector regressor on the standardized training
# features. C, gamma and epsilon are hand-picked values.
svr_bike = SVR(
    kernel='rbf',
    C=100,
    gamma=0.1,
    epsilon=0.1,
)
svr_bike.fit(X=X_train_scaled_bike, y=y_train_bike)

In [None]:
# Predict rental counts for the standardized test features.
y_pred_bike_svm = svr_bike.predict(X_test_scaled_bike)

# Error metrics for the SVM model, computed in the order MSE, MAE, RMSE, R2.
bike_svm_mse, bike_svm_mae, bike_svm_rmse, bike_svm_r2 = (
    score_fn(y_test_bike, y_pred_bike_svm)
    for score_fn in (mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score)
)

# Report the results.
print(
    f"SVM Mean Squared Error: {bike_svm_mse}",
    f"SVM Mean Absolute Error: {bike_svm_mae}",
    f"SVM Root Mean Squared Error: {bike_svm_rmse}",
    f"SVM R2 Score: {bike_svm_r2}",
    sep="\n",
)

In [None]:
# Actual vs. predicted rental counts for the SVM model.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test_bike, y_pred_bike_svm, alpha=0.7)

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values for SVM")

# Dashed diagonal = perfect predictions (predicted == actual).
diag_lo, diag_hi = min(y_test_bike), max(y_test_bike)
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], linestyle='--', color='orange', lw=2)

plt.show()

A basic SVM model without any hyperparameter tuning performs worse than both a basic XGBoost model and ordinary linear regression.