
# Machine Learning Workshop

In this workshop, we'll explore various machine learning algorithms using a weather dataset.
The algorithms covered include:

1. Linear Regression
2. k-NN (k-Nearest Neighbors)
3. Classification and Regression Trees (CART)
4. Naïve Bayes
5. SVM (Support Vector Machines)
6. Random Forest
7. K-Means
8. PCA (Principal Component Analysis)


### Install Necessary Libraries

In [1]:
!pip install scikit-learn pandas numpy seaborn plotly



### Data Preparation

In [2]:
import pandas as pd

# Read the workshop dataset from the CSV file located next to this notebook.
data = pd.read_csv(filepath_or_buffer='intro_ml_workshop_dataset.csv')

In [3]:
# Show column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47622 entries, 0 to 47621
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ds                   47622 non-null  object 
 1   power                47622 non-null  float64
 2   wetbulb_temperature  43269 non-null  float64
 3   humidity             47608 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1.5+ MB


In [4]:
# Preview the first rows to check the column layout and timestamp format.
data.head()

Unnamed: 0,ds,power,wetbulb_temperature,humidity
0,2023-06-27 17:08:00,2088.999031,88.385,67.695832
1,2023-06-27 17:09:00,2089.85051,88.46,68.916667
2,2023-06-27 17:10:00,2086.172455,88.295,68.816667
3,2023-06-27 17:11:00,2091.607552,88.28,69.025
4,2023-06-27 17:12:00,2085.903702,87.905,68.279167


In [5]:
# Preview the last rows to see where the recording ends.
data.tail()

Unnamed: 0,ds,power,wetbulb_temperature,humidity
47617,2023-08-22 23:54:00,144.897199,79.46,96.674999
47618,2023-08-22 23:55:00,144.886308,79.49,96.704166
47619,2023-08-22 23:56:00,144.906492,79.52,96.787502
47620,2023-08-22 23:57:00,143.665265,79.52,96.775002
47621,2023-08-22 23:58:00,143.717877,79.52,96.708332


In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,power,wetbulb_temperature,humidity
count,47622.0,43269.0,47608.0
mean,884.582245,82.693403,82.35391
std,734.36204,2.858567,7.689505
min,54.802158,75.365,59.291666
25%,146.925452,80.42,76.454167
50%,789.03753,82.22,84.762501
75%,1712.687208,85.1,87.733332
max,2569.346466,91.22,98.550003


In [7]:
# Promote the 'ds' timestamp column to the index. The guard keeps this cell
# re-runnable: once 'ds' is the index it is no longer a column, so an
# unconditional set_index('ds') would raise KeyError on a second run.
if 'ds' in data.columns:
    data = data.set_index('ds')


In [8]:
# Parse the index into datetimes: 'ds' was loaded with object dtype, and the
# time-based resampling that follows needs a datetime-like index.
data.index = pd.to_datetime(data.index)

In [9]:
# Group the readings into 5-minute bins and take the mean of each column per bin.
resampled_5m_data = data.resample('5min').mean()

In [14]:
# Forward-fill gaps so every 5-minute bin carries the last observed value.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling,
# which emits a FutureWarning on recent pandas releases.
resampled_5m_data = resampled_5m_data.ffill()

  resampled_5m_data = resampled_5m_data.fillna(method='ffill')


### Import All Necessary Libraries

In [10]:
import plotly.graph_objects as go


# Linear Regression

In [15]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Predictors are the two weather readings; the quantity to predict is power.
X = resampled_5m_data[['wetbulb_temperature','humidity']]
y = resampled_5m_data['power']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear model on the training rows and predict the held-out rows.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Report the held-out error.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Linear Regression: {mse:.2f}")

# Predictions against true values; the straight line marks where prediction == truth.
fig = go.Figure(data=[
    go.Scatter(x=y_test, y=y_pred, mode='markers', name='Predicted Values'),
    go.Scatter(
        x=[y_test.min(), y_test.max()],
        y=[y_test.min(), y_test.max()],
        mode='lines',
        name='Perfect Fit',
    ),
])
fig.update_layout(
    title='Linear Regression: True vs Predicted',
    xaxis_title='True Values',
    yaxis_title='Predicted Values',
    legend_title='Legend',
)
fig.show()



Mean Squared Error for Linear Regression: 196514.88


# k-Nearest Neighbors

In [16]:

from sklearn.neighbors import KNeighborsRegressor

# k-NN regressor using the 5 nearest training rows, fitted on the existing split.
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and report the error.
y_pred_knn = knn_model.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
print(f"Mean Squared Error for k-NN: {mse_knn:.2f}")

# Predictions against true values; the straight line marks where prediction == truth.
fig = go.Figure(data=[
    go.Scatter(x=y_test, y=y_pred_knn, mode='markers', name='Predicted Values'),
    go.Scatter(
        x=[y_test.min(), y_test.max()],
        y=[y_test.min(), y_test.max()],
        mode='lines',
        name='Perfect Fit',
    ),
])
fig.update_layout(
    title='k-NN: True vs Predicted',
    xaxis_title='True Values',
    yaxis_title='Predicted Values',
    legend_title='Legend',
)
fig.show()


Mean Squared Error for k-NN: 159699.13


# Classification and Regression Trees (CART)

In [17]:

from sklearn.tree import DecisionTreeRegressor

# Initialize CART model. random_state is fixed because the tree permutes the
# candidate features at each split, so an unseeded tree can give a different
# MSE on every run.
cart_model = DecisionTreeRegressor(random_state=42)

# Train the model
cart_model.fit(X_train, y_train)

# Predict on test set
y_pred_cart = cart_model.predict(X_test)

# Calculate the mean squared error
mse_cart = mean_squared_error(y_test, y_pred_cart)
print(f"Mean Squared Error for CART: {mse_cart:.2f}")

# Predictions against true values; the straight line marks where prediction == truth.
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test, y=y_pred_cart, mode='markers', name='Predicted Values'))
fig.add_trace(go.Scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()], mode='lines', name='Perfect Fit'))
fig.update_layout(title='CART: True vs Predicted', xaxis_title='True Values', yaxis_title='Predicted Values', legend_title='Legend')
fig.show()



Mean Squared Error for CART: 248058.98


# Naïve Bayes

In [18]:

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
import seaborn as sns

# Convert target into categories for demonstration purposes:
# 1 when power is above its median, 0 otherwise.
y_class = (y > y.median()).astype(int)

# Split data for classification. Same test_size and seed as the regression
# split, so X_train / X_test are rebound to the same row partition.
X_train, X_test, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.2, random_state=42)

# Initialize Naïve Bayes model
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train_class)

# Predict on test set
y_pred_nb = nb_model.predict(X_test)

# Calculate the accuracy
acc_nb = accuracy_score(y_test_class, y_pred_nb)
print(f"Accuracy for Naïve Bayes: {acc_nb:.2f}")

# Confusion matrix
cm = confusion_matrix(y_test_class, y_pred_nb)
# Keep the axis labels in dedicated names: binding them to `y` would replace
# the target Series with a list and break `y.median()` when this cell is re-run.
x_labels = ['Predicted Negative', 'Predicted Positive']
y_labels = ['True Negative', 'True Positive']
fig = ff.create_annotated_heatmap(cm, x=x_labels, y=y_labels, colorscale='Blues')
fig.update_layout(title='Naïve Bayes: Confusion Matrix')
fig.show()


Accuracy for Naïve Bayes: 0.88


# Support Vector Machines (SVM)

In [19]:

from sklearn.svm import SVR

# Support vector regressor with library defaults, fitted on the training rows.
svm_model = SVR().fit(X_train, y_train)

# Predict the held-out rows and report the error.
y_pred_svm = svm_model.predict(X_test)
mse_svm = mean_squared_error(y_test, y_pred_svm)
print(f"Mean Squared Error for SVM: {mse_svm:.2f}")

# Predictions against true values; the straight line marks where prediction == truth.
fig = go.Figure(data=[
    go.Scatter(x=y_test, y=y_pred_svm, mode='markers', name='Predicted Values'),
    go.Scatter(
        x=[y_test.min(), y_test.max()],
        y=[y_test.min(), y_test.max()],
        mode='lines',
        name='Perfect Fit',
    ),
])
fig.update_layout(
    title='SVM: True vs Predicted',
    xaxis_title='True Values',
    yaxis_title='Predicted Values',
    legend_title='Legend',
)
fig.show()



Mean Squared Error for SVM: 205523.48


# Random Forest

In [20]:

from sklearn.ensemble import RandomForestRegressor

# Initialize Random Forest model. The forest bootstraps rows at random, so
# random_state is fixed to make the reported MSE reproducible between runs.
rf_model = RandomForestRegressor(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test set
y_pred_rf = rf_model.predict(X_test)

# Calculate the mean squared error
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Mean Squared Error for Random Forest: {mse_rf:.2f}")

# Predictions against true values; the straight line marks where prediction == truth.
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test, y=y_pred_rf, mode='markers', name='Predicted Values'))
fig.add_trace(go.Scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()], mode='lines', name='Perfect Fit'))
fig.update_layout(title='Random Forest: True vs Predicted', xaxis_title='True Values', yaxis_title='Predicted Values', legend_title='Legend')
fig.show()


Mean Squared Error for Random Forest: 156458.27


# K-Means

In [24]:

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Partition the rows into 3 clusters on the two weather features. The cluster
# count is chosen by hand here, not estimated from the data.
# random_state pins the centroid initialisation so labels are repeatable, and
# n_init is spelled out because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
resampled_5m_data['clusters'] = kmeans.fit_predict(X)

# Plot the clusters, one trace per label; sorting keeps the legend order stable.
fig = go.Figure()
for cluster in sorted(resampled_5m_data['clusters'].unique()):
    subset = resampled_5m_data[resampled_5m_data['clusters'] == cluster]
    fig.add_trace(go.Scatter(x=subset['wetbulb_temperature'], y=subset['humidity'], mode='markers', name=f'Cluster {cluster}'))
fig.update_layout(title='K-Means Clustering', xaxis_title='Wet Bulb Temperature', yaxis_title='Humidity', legend_title='Clusters')
fig.show()






# Principal Component Analysis (PCA)

In [25]:

from sklearn.decomposition import PCA

# Project the feature matrix onto its principal axes. X holds only two columns,
# so keeping 2 components re-expresses the data rather than shrinking it.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(X)

# Scatter the rows in principal-component coordinates.
fig = go.Figure(data=[
    go.Scatter(x=data_pca[:, 0], y=data_pca[:, 1], mode='markers'),
])
fig.update_layout(
    title='PCA Result',
    xaxis_title='Principal Component 1',
    yaxis_title='Principal Component 2',
)
fig.show()