# Training Unsupervised Models
## 04_unsupervised_model_learning

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 28/09/2025   | Adrienne | Update | Creating models |

In [None]:
import pandas as pd

# train test split
from sklearn.model_selection import train_test_split

# unsupervised learning methods
# FeatureAgglomeration uses agglomerative (hierarchical) clustering to group similar features, so it doubles as a dimensionality reduction technique
from sklearn.cluster import KMeans, AgglomerativeClustering, FeatureAgglomeration, DBSCAN
from sklearn.mixture import GaussianMixture

# dimensionality reduction methods
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE

# model selection
from sklearn.model_selection import GridSearchCV

# model scores
from sklearn.metrics import silhouette_score, calinski_harabasz_score

Train/Test Split

In [None]:
# X holds the features data, y holds the target variable data.
# Hold out 20% of the rows for testing; `stratify` is another optional argument.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Dimensionality Reduction

In [None]:
# Pretend X_train is your training set.
# The same construct-then-fit_transform pattern works for the other
# dimensionality reduction methods imported above (TruncatedSVD, TSNE).

# random_state pins the result when PCA picks a randomized SVD solver,
# matching the seeded estimators used elsewhere in this notebook.
pca = PCA(n_components=2, random_state=42)
# Note: despite the name, this holds the 2-component projection of X_train;
# the fitted estimator itself is `pca`.
fitted_pca = pca.fit_transform(X_train)

Unsupervised Learning Methods code

In [None]:
# KMeans: fit on the training set, then read the cluster assigned to each training row.
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
kmeans.fit(X_train)  # pretend X_train is your training set
kmeans_labels = kmeans.labels_

# AgglomerativeClustering: fit_predict fits the model and returns the labels in one call.
# DBSCAN supports the same fit_predict call but is configured with eps/min_samples
# instead of n_clusters. FeatureAgglomeration groups features rather than samples,
# so use fit_transform with it instead.
# (The estimators are already imported in the import cell at the top.)
agg_clustering = AgglomerativeClustering(n_clusters=3)
agg_labels = agg_clustering.fit_predict(X_train)

# Gaussian mixture: fit on the same training set we predict on, so the held-out
# rows never influence the model.
gmm = GaussianMixture(n_components=3, random_state=0)
gmm.fit(X_train)
gmm_labels = gmm.predict(X_train)

# `labels` is what the evaluation cell scores. Each model keeps its own result above;
# point `labels` at whichever one you want to evaluate.
labels = gmm_labels

Model Evaluation

In [None]:
# The other sklearn clustering metrics imported above are called the same way:
# pass the data and the predicted cluster labels.
score = silhouette_score(X=X_train, labels=labels)

Model Selection

In [None]:
# example grid search


def silhouette_scorer(estimator, X, y=None):
    """Score a fitted clusterer by the silhouette of its predictions on ``X``.

    GridSearchCV invokes a scoring callable as ``scorer(estimator, X, y)``;
    ``y`` is accepted for that signature but unused, since clustering has no targets.

    Returns the silhouette score (higher is better), or -1.0 when fewer than two
    distinct clusters are predicted, because the silhouette is undefined in that case.
    """
    predicted = estimator.predict(X)
    if len(set(predicted)) < 2:
        return -1.0
    return silhouette_score(X, predicted)


# Keys must be strings naming real estimator parameters; KMeans uses `n_clusters`.
param_grid = {"n_clusters": [2, 5, 8, 10]}  # easily add another parameter to this structure

grid_search = GridSearchCV(
    estimator=KMeans(random_state=42, n_init='auto'),
    param_grid=param_grid,
    scoring=silhouette_scorer,
    n_jobs=-1
)

# Search on the training split only so the held-out test rows stay unseen.
grid_search.fit(X_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")