# Supervised and Unsupervised Learning Project

This notebook contains the tree-based regression and clustering analysis projects.

## Part 1: Supervised Learning using a Tree-Based Model
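Use statistical learning to answer a business question: fit a decision-tree regressor to the insurance claims data to predict `Claim` from `Mileage`, `AgeP`, and the one-hot-encoded `Category`.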

In [None]:

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error


In [None]:

# Load the insurance claims data (comma-separated) and report (rows, columns)
df = pd.read_csv('ClaimsData.csv')
df.shape


In [None]:

# Preview the first five rows of the claims data
df.head(n=5)


In [None]:

# One-hot encode the "Category" variable
ohe = OneHotEncoder(categories='auto')
# The encoder expects a 2-D input, hence the reshape to a single column;
# .toarray() densifies the sparse matrix it returns.
Xd = ohe.fit_transform(df["Category"].to_numpy().reshape(-1, 1)).toarray()
# Reuse df's index: the axis=1 concat below aligns on index labels, so a fresh
# RangeIndex would misalign rows (and introduce NaNs) if df were ever filtered
# or re-indexed before this cell.
df_ohe = pd.DataFrame(Xd, columns=ohe.get_feature_names_out(), index=df.index)

# Combine response, numeric features and the dummy columns into one frame
df_to_use1 = pd.concat([df.loc[:, ["Claim", "Mileage", "AgeP"]], df_ohe], axis=1)
df_to_use1.head()


In [None]:

# Features and response
# Take every column except the response as a feature. A fixed positional slice
# (iloc[:, 1:8]) silently drops dummy columns whenever "Category" has more than
# five levels, because the number of one-hot columns depends on the data.
X = df_to_use1.drop(columns=["Claim"])
# Kept as a one-column DataFrame (not a Series) so downstream cells see the same shape
y = df_to_use1.loc[:, ["Claim"]]

# A bare `X.head()` in the middle of a cell displays nothing; print the feature
# names instead so the selected columns can be checked at a glance.
print("Features:", list(X.columns))
y.head()


In [None]:

# Hold out a test set; the seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Grow an unconstrained regression tree on the training portion
reg = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# Test-set mean squared error of the unpruned tree (baseline for later comparison)
y_pred = reg.predict(X_test)
error = mean_squared_error(y_test, y_pred)
print("MSE:", error)


In [None]:

# Sweep the max_leaf_nodes cap and compare train vs. test fit
nodes = list(range(2, 35))

# One fitted tree per candidate cap
regs = []
for leaf_cap in nodes:
    candidate = DecisionTreeRegressor(random_state=0, max_leaf_nodes=leaf_cap)
    regs.append(candidate.fit(X_train, y_train))

# .score() is evaluated for every fitted tree on both splits
train_scores = [fitted.score(X_train, y_train) for fitted in regs]
test_scores = [fitted.score(X_test, y_test) for fitted in regs]

plt.figure()
for split_name, split_scores in (('train', train_scores), ('test', test_scores)):
    plt.plot(nodes, split_scores, marker='o', label=split_name)
plt.xlabel('Maximum leaf nodes')
plt.ylabel('Score')
plt.legend()
plt.show()


In [None]:

# Choose max_leaf_nodes by 5-fold cross-validation on the training set only
validation_scores = []
for leaf_cap in nodes:
    candidate = DecisionTreeRegressor(random_state=0, max_leaf_nodes=leaf_cap)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5)
    validation_scores.append(fold_scores.mean())

# Position of the best mean CV score; on ties the earliest (smallest cap) wins
best_position = max(range(len(validation_scores)), key=lambda pos: validation_scores[pos])
nodes_optimum = nodes[best_position]
print("Optimal max leaf nodes:", nodes_optimum)

# Refit a tree capped at the selected number of leaves
parsimonious_tree_model = DecisionTreeRegressor(
    random_state=0, max_leaf_nodes=nodes_optimum
).fit(X_train, y_train)

# Compare test-set MSE of the capped tree against the unpruned baseline
y_pred_optimal = parsimonious_tree_model.predict(X_test)
error2 = mean_squared_error(y_test, y_pred_optimal)
print("MSE before pruning =", error)
print("MSE after pruning =", error2)


## Part 2: Unsupervised Learning / Clustering
K-means and hierarchical (agglomerative) clustering on the fitness tracker dataset.

In [None]:

# Import clustering libraries
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage

# Load the space-delimited fitness tracker data and report (rows, columns)
df2 = pd.read_csv("FitnessTrackerData.txt", sep=" ")
df2.shape


In [None]:

# 3D scatter of the three fitness variables
fig = plt.figure(figsize=(9, 9))
ax = fig.add_subplot(projection='3d')
ax.scatter(df2['Distance'], df2['Frequency'], df2['Pace'])
ax.set(xlabel='Distance', ylabel='Frequency', zlabel='Pace')
plt.show()


In [None]:

# Elbow method: within-cluster sum of squares (inertia) for k = 1..15
cluster_counts = range(1, 16)
distortions = []
for n_clusters in cluster_counts:
    km = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0).fit(df2)
    distortions.append(km.inertia_)

plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()


In [None]:

# Silhouette method: score each k from 2 up to kmax
# (the loop starts at 2, so no single-cluster solution is scored)
kmax = 10
candidate_ks = range(2, kmax + 1)

sil = []
for n_clusters in candidate_ks:
    km = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0).fit(df2)
    sil.append(silhouette_score(df2, km.labels_))

plt.plot(candidate_ks, sil, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.show()


In [None]:

# Final K-means clustering
# Fit on the measurement columns only. This cell (and the hierarchical-clustering
# cell) writes a 'CLUSTERS' label column into df2, so fitting on df2 directly
# would treat stale labels as a feature whenever the cell is re-run.
# errors='ignore' keeps the first run (no 'CLUSTERS' column yet) working.
km_features = df2.drop(columns=['CLUSTERS'], errors='ignore')
# NOTE(review): features are clustered unscaled - confirm the columns are on
# comparable scales, otherwise consider standardising before K-means.
km_final = KMeans(n_clusters=2, init='k-means++', random_state=0)
km_final.fit(km_features)
df2['CLUSTERS'] = km_final.labels_

# Plot clusters using seaborn, coloured by the assigned cluster label
sns.pairplot(df2, hue='CLUSTERS', palette='viridis')
plt.show()


In [None]:

# Hierarchical clustering
# Exclude the 'CLUSTERS' column written by the K-means cell: it is a label, not
# a measurement, and leaving it in leaks the K-means assignment into the Ward
# distances and biases the hierarchy towards reproducing that partition.
# errors='ignore' lets the cell also run when no 'CLUSTERS' column exists.
hc_features = df2.drop(columns=['CLUSTERS'], errors='ignore')

Z = linkage(hc_features, method='ward')
plt.figure(figsize=(12, 8))
dendrogram(Z)
plt.show()

# Agglomerative clustering on the same feature columns as the dendrogram
agg_cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
agg_cluster.fit(hc_features)
# Overwrites any K-means labels in df2['CLUSTERS'] with the agglomerative labels
df2['CLUSTERS'] = agg_cluster.labels_
sns.pairplot(df2, hue='CLUSTERS', palette='viridis')
plt.show()
