In [1]:
import os

from ufcdata.query import DatabaseQuery
from ufcdata.tools import query_to_df

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sb
import pickle as pkl

from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from umap import UMAP

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ROOT_DIR = os.path.dirname(os.path.abspath("top_level_file.txt"))
DATA_DIR = os.path.join(ROOT_DIR, 'data')
DATA_PATH = os.path.join(DATA_DIR, 'data_1.pkl')
DATA_2_PATH = os.path.join(DATA_DIR, 'data_2.pkl')

In [3]:
with open(DATA_2_PATH, 'rb') as f:
    df = pkl.load(f)

df = df.dropna()

In [4]:
preprocessor = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('umap', UMAP())
    ]
)

clusterer = Pipeline(
    [
        ('kmeans', KMeans(n_clusters=3))
    ]
)

pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('clusterer', clusterer)
    ]
)

parameters = {'preprocessor__scaler': [StandardScaler(), MinMaxScaler(), Normalizer()],
              'preprocessor__umap__n_neighbors': [5, 10, 15, 20, 25],
              'preprocessor__umap__min_dist': [0.5, 1],
              'clusterer__kmeans__n_clusters': [1, 2, 3, 4, 5]
             }

pipe.get_params()

{'memory': None,
 'steps': [('preprocessor',
   Pipeline(steps=[('scaler', StandardScaler()), ('umap', UMAP())])),
  ('clusterer', Pipeline(steps=[('kmeans', KMeans(n_clusters=3))]))],
 'verbose': False,
 'preprocessor': Pipeline(steps=[('scaler', StandardScaler()), ('umap', UMAP())]),
 'clusterer': Pipeline(steps=[('kmeans', KMeans(n_clusters=3))]),
 'preprocessor__memory': None,
 'preprocessor__steps': [('scaler', StandardScaler()), ('umap', UMAP())],
 'preprocessor__verbose': False,
 'preprocessor__scaler': StandardScaler(),
 'preprocessor__umap': UMAP(),
 'preprocessor__scaler__copy': True,
 'preprocessor__scaler__with_mean': True,
 'preprocessor__scaler__with_std': True,
 'preprocessor__umap__a': None,
 'preprocessor__umap__angular_rp_forest': False,
 'preprocessor__umap__b': None,
 'preprocessor__umap__dens_frac': 0.3,
 'preprocessor__umap__dens_lambda': 2.0,
 'preprocessor__umap__dens_var_shift': 0.1,
 'preprocessor__umap__densmap': False,
 'preprocessor__umap__disconnection_dis

In [5]:
pipe.fit(df)

preprocessed_data = pipe['preprocessor'].transform(df.to_numpy())
labels = pipe['clusterer']['kmeans'].labels_

print(silhouette_score(preprocessed_data, labels))

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


0.47947308


In [6]:
grid = RandomizedSearchCV(pipe, parameters, cv=2, scoring='adjusted_rand_score').fit(df)

In [7]:
df_rand = pd.DataFrame.from_dict(grid.cv_results_, orient='columns')
df_rand

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessor__umap__n_neighbors,param_preprocessor__umap__min_dist,param_preprocessor__scaler,param_clusterer__kmeans__n_clusters,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,2.422366,0.074597,0.000711,0.000633,15,1.0,StandardScaler(),5,"{'preprocessor__umap__n_neighbors': 15, 'prepr...",,,,,1
1,2.3827,0.076745,0.000119,4.6e-05,15,0.5,StandardScaler(),2,"{'preprocessor__umap__n_neighbors': 15, 'prepr...",,,,,2
2,1.916462,0.028009,8.6e-05,7e-06,5,0.5,MinMaxScaler(),5,"{'preprocessor__umap__n_neighbors': 5, 'prepro...",,,,,3
3,1.837282,4.4e-05,7.6e-05,1e-06,5,1.0,Normalizer(),3,"{'preprocessor__umap__n_neighbors': 5, 'prepro...",,,,,4
4,2.822933,0.071979,0.000164,0.0,25,0.5,StandardScaler(),4,"{'preprocessor__umap__n_neighbors': 25, 'prepr...",,,,,5
5,1.938451,0.036575,0.000164,1e-06,5,1.0,StandardScaler(),3,"{'preprocessor__umap__n_neighbors': 5, 'prepro...",,,,,6
6,2.799399,0.026945,0.000163,4e-06,25,0.5,StandardScaler(),2,"{'preprocessor__umap__n_neighbors': 25, 'prepr...",,,,,7
7,2.517333,0.017591,7.5e-05,2e-06,20,0.5,StandardScaler(),1,"{'preprocessor__umap__n_neighbors': 20, 'prepr...",,,,,8
8,2.283338,0.011844,0.000128,5e-05,10,0.5,Normalizer(),2,"{'preprocessor__umap__n_neighbors': 10, 'prepr...",,,,,9
9,2.199815,0.075378,0.00012,4.7e-05,10,0.5,Normalizer(),1,"{'preprocessor__umap__n_neighbors': 10, 'prepr...",,,,,10


In [8]:
best = grid.best_params_
best

{'preprocessor__umap__n_neighbors': 15,
 'preprocessor__umap__min_dist': 1,
 'preprocessor__scaler': StandardScaler(),
 'clusterer__kmeans__n_clusters': 5}