In [None]:
# Cluster X_pca into 9 groups with k-means (k-means++ seeding, 5 restarts, fixed seed)
kmeans = KMeans(
    n_clusters=9,
    init='k-means++',
    n_init=5,
    random_state=42,
)
km = kmeans.fit(X_pca)
labels = km.predict(X_pca)  # one cluster index per row of X_pca

# Attach each row's cluster assignment to the dataset as a new column
df['cluster'] = labels

In [None]:
# Share of rows (in percent) that landed in each cluster, largest cluster first
vals = df['cluster'].value_counts(normalize=True).mul(100)
vals.to_frame(name='cluster').head(12)

In [None]:
# LSA/Truncated SVD Dimension Reduction
# Request one component fewer than the smaller dimension of df.
maxDimensions = min(df.shape) - 1

# NOTE(review): if the k-means cell above has been run, df now also contains the
# 'cluster' label column, so it is decomposed here as if it were a feature — confirm
# that this is intended.

# The "arpack" algorithm is typically more efficient for large sparse matrices compared to the default "randomized" algorithm
# ARPACK starts from a random vector, so the solver is seeded (same seed as the
# k-means cell) to make the decomposition reproducible across kernel restarts.
lsa = TruncatedSVD(n_components=maxDimensions, algorithm="arpack", random_state=42)
X_lsa = lsa.fit_transform(df)

# Variance of the projected data along each retained component
explained_variance_ = lsa.explained_variance_

In [None]:
X = df
k = 10  # number of singular vectors to keep

# Thin SVD: only the first k singular triplets are used below, so skip building the
# full (n_rows x n_rows) U matrix that full_matrices=True would allocate.
# np.asarray hands NumPy a plain array so U, S and Vt are ordinary ndarrays.
U, S, Vt = np.linalg.svd(np.asarray(X), full_matrices=False)

# Rank-k reconstruction of X; scaling U's columns by S is the same as U @ diag(S)
X_approx = (U[:, :k] * S[:k]) @ Vt[:k, :]

In [None]:
# Relative error of the rank-k reconstruction: ||X - X_approx|| / ||X||
residual_norm = np.linalg.norm(X - X_approx)
reference_norm = np.linalg.norm(X)
approx_error = residual_norm / reference_norm
print(f'Relative approximation error: {approx_error:.2f}')

In [None]:
# Take the column labels from X (the DataFrame that was decomposed); X_approx is
# built from NumPy matrix products and has no `.columns` attribute.
feature_names = X.columns
Vk = Vt[:k, :]

# Score every feature by the sum of its absolute loadings across the k retained
# right singular vectors, then rank features from highest to lowest score.
feature_importance = np.abs(Vk).sum(axis=0)
sorted_idx = np.argsort(feature_importance)[::-1]

# Save the names of the (up to) 10 highest-scoring features in a list
top_features = [feature_names[i] for i in sorted_idx[:10]]

In [None]:
# Size the chart from the list actually produced above: with fewer than 10 features
# a hard-coded range(10) no longer matches the number of bars/labels and raises.
n_top = len(top_features)

# Horizontal bars: one per top feature, bar length = its importance score
plt.barh(range(n_top), feature_importance[sorted_idx[:n_top]])
plt.yticks(range(n_top), top_features)
plt.xlabel('Importance Score')
plt.title('Most Frequent Features in Truncated SVD')
plt.show()


In [None]:
# example grid search
#
# DBSCAN is unsupervised: it has no predict() and there is no y to compare against,
# so the regression scorer 'explained_variance' can never produce a valid score.
# Candidates are ranked by silhouette coefficient instead.
from sklearn.metrics import silhouette_score  # only needed by this cell

param_grid = {'eps': [0.5, 0.75, 1, 1.25], 'min_samples': [5, 10, 15]} # easily add another parameter to this structure


def dbscan_silhouette(estimator, X, y=None):
    """Score a DBSCAN configuration by the silhouette coefficient of its clusters.

    Parameters
    ----------
    estimator : clusterer exposing ``fit_predict`` (here: DBSCAN)
    X : data to cluster and score
    y : ignored; present only to satisfy the scorer call signature

    Returns
    -------
    float
        Silhouette coefficient over the non-noise points, or -1.0 (the worst
        possible value) when fewer than two clusters are found or every
        clustered point is its own cluster.
    """
    # Cluster exactly the rows being scored so labels always line up with X.
    labels = estimator.fit_predict(X)

    # DBSCAN marks noise with -1; leave those points out of the score.
    # Caveat: this does not penalise settings that discard many points as noise.
    clustered = labels != -1
    n_clustered = int(clustered.sum())
    n_clusters = len(set(labels[clustered]))

    # silhouette_score needs 2 <= n_clusters <= n_samples - 1
    if n_clusters < 2 or n_clusters >= n_clustered:
        return -1.0
    return silhouette_score(X[clustered], labels[clustered])


# DBSCAN cannot assign held-out rows to clusters, so use one "fold" in which the
# full dataset is both the fitting set and the scoring set.
all_rows = np.arange(X_pca.shape[0])

grid_search = GridSearchCV(
    estimator=DBSCAN(metric='euclidean'),
    param_grid=param_grid,
    scoring=dbscan_silhouette,
    cv=[(all_rows, all_rows)],
    n_jobs=-1
)

grid_search.fit(X_pca)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")