# Unsupervised Learning

In [None]:
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Read the wine measurements and preview the first rows
df = pd.read_csv('data/wine-clustering.csv')
print(df.head().to_string())

# Summary statistics for every column
print(df.describe(include='all').to_string())

# Standardize the features: learn the scaling parameters first, then apply
# them to the same frame (df_scaled is what the clustering cells consume)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(df)
df_scaled = scaler.transform(df)

# Scatter plot of the raw (unscaled) Alcohol and Flavanoids columns
plt.scatter(x=df['Alcohol'], y=df['Flavanoids'])
plt.title('Flavanoids vs Alcohol')
plt.ylabel('Flavanoids')
plt.xlabel('Alcohol')
plt.show()

## K-Means Clustering

In [ ]:
# K-means clustering
from sklearn.cluster import KMeans

# Fit a 3-cluster KMeans model on the standardized features
# (random_state is fixed so repeated runs give the same clusters)
kmeans = KMeans(n_clusters=3, random_state=0).fit(df_scaled)

# The k-means assignments serve as the reference ("ground truth") labels
# that the other clustering cells are compared against
labels = kmeans.predict(df_scaled)

# Copy of the original dataframe with the cluster assignment appended
df_kmeans = df.assign(cluster=kmeans.labels_)

# Adjusted Mutual Information against the reference labels
# (here the reference labels come from this same model)
mi = adjusted_mutual_info_score(labels_true=df_kmeans['cluster'], labels_pred=labels)
print('K-means Clustering: Mutual Information:', mi)
# Silhouette Score computed on the standardized features
silhouette = silhouette_score(X=df_scaled, labels=kmeans.labels_)
print('K-means Clustering: Silhouette Score:', silhouette)

# Scatter plot of Alcohol vs Flavanoids, colored by cluster
plt.scatter(
    x=df_kmeans['Alcohol'],
    y=df_kmeans['Flavanoids'],
    c=df_kmeans['cluster'],
    cmap='viridis',
)
plt.title('K-means Clustering with 3 clusters')
plt.ylabel('Flavanoids')
plt.xlabel('Alcohol')
plt.show()

## Hierarchical Clustering  

In [ ]:
# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering

# Fit a 3-cluster agglomerative model on the standardized features
agg = AgglomerativeClustering(n_clusters=3).fit(df_scaled)

# Copy of the original dataframe with the cluster assignment appended
df_hiera = df.assign(cluster=agg.labels_)

# Adjusted Mutual Information against `labels`, the k-means assignments
# produced in the K-means cell
mi = adjusted_mutual_info_score(labels_true=df_hiera['cluster'], labels_pred=labels)
print('Hierarchical Clustering compared to k-means labels: Mutual Information:', mi)
# Silhouette Score computed on the standardized features
silhouette = silhouette_score(X=df_scaled, labels=agg.labels_)
print('Hierarchical Clustering: Silhouette Score:', silhouette)

# Scatter plot of Alcohol vs Flavanoids, colored by cluster
plt.scatter(
    x=df_hiera['Alcohol'],
    y=df_hiera['Flavanoids'],
    c=df_hiera['cluster'],
    cmap='viridis',
)
plt.title('Hierarchical Clustering with 3 clusters')
plt.ylabel('Flavanoids')
plt.xlabel('Alcohol')
plt.show()

## DBSCAN Clustering

In [ ]:
# DBSCAN clustering
from sklearn.cluster import DBSCAN

# Create a DBSCAN model and fit it on the standardized features
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(df_scaled)

# Assign the cluster labels to a copy of the original dataframe
# (scikit-learn's DBSCAN labels noise samples as -1)
df_dbscan = df.copy()
df_dbscan['cluster'] = dbscan.labels_

# Evaluate against `labels`, the k-means assignments from the K-means cell,
# using Adjusted Mutual Information
mi = adjusted_mutual_info_score(df_dbscan['cluster'], labels)
print('DBSCAN Clustering compared to k-means labels: Mutual Information:', mi)

# Evaluate the model using Silhouette Score.
# silhouette_score raises ValueError unless the number of distinct labels is
# between 2 and n_samples - 1. With these parameters DBSCAN can return a
# single label, so guard the call instead of letting the cell abort before
# the plot below is drawn.
n_labels = len(set(dbscan.labels_))
if 2 <= n_labels <= len(dbscan.labels_) - 1:
    silhouette = silhouette_score(df_scaled, dbscan.labels_)
    print('DBSCAN Clustering: Silhouette Score:', silhouette)
else:
    silhouette = None
    print(
        'DBSCAN Clustering: Silhouette Score: undefined '
        f'({n_labels} distinct label(s) found; consider tuning eps/min_samples)'
    )

# Visualize the clustering result
plt.scatter(df_dbscan['Alcohol'], df_dbscan['Flavanoids'], c=df_dbscan['cluster'], cmap='viridis')
plt.xlabel('Alcohol')
plt.ylabel('Flavanoids')
plt.title('DBSCAN Clustering')
plt.show()

# Apriori Algorithm for Association Problem

In [ ]:
# # Apriori Algorithm
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load the ratings and the movie metadata
df_rating = pd.read_csv('data/ratings_small.csv')
df_movies = pd.read_csv('data/movies_metadata.csv')

print(df_rating.head().to_string())
print(df_movies.head().to_string())

# Check data description
print(df_rating.describe(include='all').to_string())
print(df_movies.describe(include='all').to_string())

# Clean and merge the data:
# - drop movies without a title
# - cast the movie id to int64 so it can be joined against ratings' movieId
# - keep one row per (user, title) so the pivot below has unique index/column pairs
title_mask = df_movies['title'].isna()
movies_df = df_movies.loc[~title_mask]
movies_df = movies_df.astype({'id': 'int64'})
df = pd.merge(df_rating, movies_df[['id', 'title']], left_on='movieId', right_on='id')
df = df.drop(columns=['timestamp', 'id'])
df = df.drop_duplicates(['userId', 'title'])

print(df.head().to_string())

# Transform data into the one-hot format Apriori requires: one row per user,
# one column per title. Unrated movies become 0 after fillna.
df_pivot = df.pivot(index='userId', columns='title', values='rating').fillna(0)
# A movie counts as "liked" (recommendable) when the rating is >= 3.
# The vectorized comparison replaces the per-element applymap (deprecated in
# recent pandas) and yields a boolean frame, the input type mlxtend expects.
df_pivot = df_pivot >= 3
print(df_pivot.head().to_string())

# Apriori Algorithm: itemsets present for at least 10% of users, then rules
# with confidence of at least 0.5
frequent_itemsets = apriori(df_pivot, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)

# order frequent itemsets by support
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
print(frequent_itemsets.head().to_string())

# order rules by support
rules = rules.sort_values(by='support', ascending=False)
print(rules.head().to_string())