# Laboratorium 4 - Algorytmy grupowania danych

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ucimlrepo import fetch_ucirepo
from ydata_profiling import ProfileReport
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.pipeline import Pipeline

## 1. Wczytanie zbioru Abalone, AutoEDA

In [4]:
abalone = fetch_ucirepo(id=1) 
df = abalone.data.original
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
print(abalone.metadata['additional_info']['summary']) 

Predicting the age of abalone from physical measurements.  The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- a boring and time-consuming task.  Other measurements, which are easier to obtain, are used to predict the age.  Further information, such as weather patterns and location (hence food availability) may be required to solve the problem.

From the original data examples with missing values were removed (the majority having the predicted value missing), and the ranges of the continuous values have been scaled for use with an ANN (by dividing by 200).


In [6]:
print(abalone.variables) 

             name     role         type demographic  \
0             Sex  Feature  Categorical        None   
1          Length  Feature   Continuous        None   
2        Diameter  Feature   Continuous        None   
3          Height  Feature   Continuous        None   
4    Whole_weight  Feature   Continuous        None   
5  Shucked_weight  Feature   Continuous        None   
6  Viscera_weight  Feature   Continuous        None   
7    Shell_weight  Feature   Continuous        None   
8           Rings   Target      Integer        None   

                   description  units missing_values  
0         M, F, and I (infant)   None             no  
1    Longest shell measurement     mm             no  
2      perpendicular to length     mm             no  
3           with meat in shell     mm             no  
4                whole abalone  grams             no  
5               weight of meat  grams             no  
6  gut weight (after bleeding)  grams             no  
7        

In [7]:
profile = ProfileReport(df, title = "Abalone AutoEDA")
profile.to_file("abalone_autoeda.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Wnioski:

* Zbiór danych dotyczy uchowców (rodzaj skorupiaków morskich)
* Są zawarte takie cechy jak płeć, długość, szerokość, waga (i inne wymiary fizyczne), a także liczba pierścieni która jest wyznacznikiem wieku uchowca
* W zbiorze danych nie ma żadnych brakujących wartości

## 2. Trenowanie i testowanie (k-means, DBSCAN)

In [8]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, make_scorer



def model_kmeans(dataset, dim_reduction = None):
    
    def preprocessor(dataset):
        cat = dataset.select_dtypes(include = ['object']).columns
        num = dataset.select_dtypes(include = ['int', 'float']).columns
        scaler = MinMaxScaler()
        ohe = OneHotEncoder(drop = 'first')
        preprocessor = make_column_transformer((scaler, num), (ohe, cat), remainder = 'passthrough')
        return preprocessor
        
    kmeans = KMeans()
    param_grid = {'n_clusters' : [(i + 2) for i in range(39)]}
    scorers = {'silhouette_score' : make_scorer(silhouette_score),
               'calinski_harabash_score' : make_scorer(calinski_harabasz_score),
               'davies_boulding_score' : make_scorer(davies_bouldin_score)}
    
    preprocessor = preprocessor(dataset)
    model = GridSearchCV(estimator = kmeans, param_grid = param_grid, scoring = scorers, refit = False, cv = 10)
    if dim_reduction is not None:
        pipe = Pipeline([('preprocessor', preprocessor), ('dim_reduction', dim_reduction), ('model', model)])
    else:
        pipe = Pipeline([('preprocessor', preprocessor), ('model', model)])
    
    pipe.fit(dataset)
    
    return pipe

In [9]:
model = model_kmeans(df)

for scorer_name, scorer_result in model.named_steps['model'].cv_results_.items():
    print(scorer_name, scorer_result)

mean_fit_time [0.0127322  0.0023046  0.00326221 0.00370693 0.00405869 0.00430789
 0.00495789 0.00541635 0.00566108 0.0059149  0.00641031 0.00716736
 0.00691714 0.00861831 0.00815425 0.00882339 0.00907576 0.01012242
 0.01001804 0.01007421 0.01018043 0.00926757 0.01032052 0.01107569
 0.01122253 0.01022029 0.01188846 0.01177807 0.01152589 0.01177583
 0.01222854 0.0114763  0.01235569 0.01387532 0.01318212 0.01377554
 0.01308217 0.01397519 0.01408477]
std_fit_time [0.03185152 0.00040147 0.00064859 0.00033066 0.00056952 0.00102813
 0.00047493 0.00091855 0.00090003 0.00079939 0.00066278 0.00149231
 0.00109741 0.00167967 0.00225286 0.00127433 0.00110444 0.00212367
 0.00145573 0.00154157 0.00155829 0.00075155 0.00131129 0.00154664
 0.00147335 0.00103299 0.0021494  0.00110626 0.00153176 0.00158961
 0.00133126 0.00117213 0.00158087 0.00141554 0.00168916 0.00110249
 0.00077348 0.0016551  0.00196061]
mean_score_time [3.50737572e-04 2.00176239e-04 1.50346756e-04 4.50539589e-04
 2.00128555e-04 5.0072