**EDA**

Евгений Борисов esborisov@sevsu.ru

* проверка на пропуски
* проверка на дубликаты
* проверка на противоречивую разметку
* статистические характеристики: корреляции, распределения, перцентили
* поиск аномалий
* визуализация, признаки попарно, PCA, статистики

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Render floats with three decimals in both pandas and NumPy output.
pd.set_option('display.float_format', '{:,.3f}'.format)
np.set_printoptions(precision=3)

#  загружаем данные

In [3]:
# pip install ucimlrepo

In [4]:
%%time 

from ucimlrepo import fetch_ucirepo 
# Fetch dataset id 186 through the ucimlrepo client.
# NOTE(review): the columns used further down suggest this is the UCI
# Wine Quality dataset — confirm against the repository.
data = fetch_ucirepo(id=186) 

CPU times: user 49.3 ms, sys: 9.75 ms, total: 59.1 ms
Wall time: 12.5 s


# изучаем структуру

In [5]:
# Top-level keys of the fetched dataset object.
display( list(data) )

['data', 'metadata', 'variables']

In [6]:
# Names of the metadata fields that come with the dataset.
display( list(data['metadata']) )

['uci_id',
 'name',
 'repository_url',
 'data_url',
 'abstract',
 'area',
 'tasks',
 'characteristics',
 'num_instances',
 'num_features',
 'feature_types',
 'demographics',
 'target_col',
 'index_col',
 'has_missing_values',
 'missing_values_symbol',
 'year_of_dataset_creation',
 'last_updated',
 'dataset_doi',
 'creators',
 'intro_paper',
 'additional_info']

In [None]:
# print(data['metadata']['abstract'])
# print(data['metadata']['tasks'])

In [None]:
# !mkdir data

In [None]:
# with open('data/metadata-abstract.txt','wt') as f: f.write(data['metadata']['abstract'])

In [None]:
# display( data['variables'] )

In [None]:
# data['variables'].to_csv('data/variables.csv',sep='\t',index=False)

In [None]:
# list(data['data'])

In [None]:
# df = data['data']['original']
# display(len(df))
# display(df.sample(3))

In [None]:
# df.to_csv('data/data.csv',sep='\t',index=False)

In [None]:
# Load the locally cached tab-separated copy of the data, then show the
# number of rows and three randomly chosen rows.
df = pd.read_csv('data/data.csv', delimiter='\t')
display(df.shape[0])
display(df.sample(n=3))

In [None]:
# Summary of the columns: dtypes and non-null counts (a quick look at missing values).
df.info()

In [None]:
# df['id'] = range(len(df))

# изучение и визуализация данных

In [None]:
# Distinct values found in the 'color' column.
set(df['color'])

In [None]:
# Replace the colour name with an integer category code.
df['color_'] = df['color'].astype('category').cat.codes

In [None]:
# Distinct values found in the 'quality' column.
set(df['quality'])

In [None]:
# Renumber the quality grades as consecutive class ids starting from zero.
quality_levels = pd.Categorical(df['quality'])
df['target'] = quality_levels.codes

In [None]:
# Four random rows, to eyeball the newly added 'color_' and 'target' columns.
display(df.sample(4))

In [None]:
# Feature columns used throughout the analysis.
# Left out on purpose: 'id', the raw string column 'color' (its integer
# encoding 'color_' is used instead) and the label columns 'quality'/'target'.
cols_features = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid',
    'residual_sugar', 'chlorides',
    'free_sulfur_dioxide', 'total_sulfur_dioxide',
    'density', 'pH', 'sulphates', 'alcohol',
    'color_',
]

## проверка на дубликаты

In [None]:
# Duplicate check: total row count, then the number of rows that are unique
# by features only and by features plus label.
# Author's finding: the table DOES contain duplicate records.
display(df.shape[0])
for key_cols in (cols_features, cols_features + ['target']):
    display(len(df.drop_duplicates(subset=key_cols)))

In [None]:
# Remove rows that repeat an earlier row in every feature and in the label;
# the row count is shown before and after, followed by three random rows.
display(df.shape[0])
df = df.drop_duplicates(
    subset=cols_features + ['target'],
    keep='first',
    ignore_index=True,
)
display(df.shape[0])
display(df.sample(n=3))

## проверка на противоречивую разметку

In [None]:
# Label-consistency check: rows with identical feature values must not carry
# different labels, i.e. every feature combination maps to exactly one target.
labels_per_point = df.groupby(cols_features)['target'].nunique()
assert labels_per_point.eq(1).all()

## сбалансированность датасета

In [None]:
# Add a running row id and use it to count the rows of every class.
# Author's finding: the dataset is not balanced.
df['id'] = range(df.shape[0])
df.groupby('target').agg({'id': 'count'}).T

## распределения признаков

In [None]:
from matplotlib import pyplot as plt

In [None]:
# import seaborn as sns
# sns.pairplot(df[cols_features+['target']], hue='target',palette='viridis')
# plt.show()

In [None]:
from pandas.plotting import scatter_matrix
# from matplotlib import colors as mcolors
# colors = list(mcolors.CSS4_COLORS.keys()) 
# colors = np.random.permutation(colors)
# colors = ['blue','green','red','cyan','magenta','yellow','black',]
# colors = { n:c for n,c in enumerate(colors) }

# Pairwise scatter plots of all features with histograms on the diagonal;
# every point is coloured by its class id.
scatter_opts = dict(
    figsize=(26, 26),
    diagonal='hist',
    marker='o',
    s=5,
    alpha=.7,
    c=df['target'],
    cmap='rainbow',
)
scatter_matrix(df[cols_features], **scatter_opts)
plt.show()

In [None]:
# pd.DataFrame.plot.kde?

In [None]:
# Per-class density estimates (KDE) of every feature, one subplot per feature.
n_cols_plot = 3
n_features = len(cols_features)
n_rows_plot = int(np.ceil(n_features / n_cols_plot))

# Class 6 is left out of the curves (the original note calls it a tiny class).
# The filter does not depend on the feature, so it is applied once, up front.
df_kde = df[df['target'] != 6]

fig = plt.figure(figsize=(4 * n_cols_plot, 3 * n_rows_plot))
for n, f in enumerate(cols_features):
    ax = plt.subplot(n_rows_plot, n_cols_plot, n + 1)
    df_kde.groupby('target')[f].plot.kde(ax=ax)
    ax.set_title(f)
    ax.grid()

# One shared legend taken from the last subplot. Its entries are the classes,
# so the column count follows the number of entries rather than the number of
# features; max(1, ...) keeps the call valid when there is nothing to show.
handles, labels = plt.gca().get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncol=max(1, len(labels)))
plt.tight_layout()
plt.show()

In [None]:
# # распределения признаков
# n_cols_plot = 4
# n_features = len(cols_features)
# n_rows_plot = int(np.ceil(len(cols_features)/n_cols_plot))

# df[cols_features].plot.box(subplots=True, layout=(n_rows_plot,n_cols_plot), figsize=(4*n_cols_plot,4*n_rows_plot), sharex=False,grid=True)
# plt.show()

## корреляции признаков и таргета

In [None]:
# from lib.stat import CorrelationAnalyzer

In [None]:
# ca = CorrelationAnalyzer(df[cols_features+['target']])
# ca.plot(abs_value=True,figsize=(9,7))

In [None]:
# ca.hight(corr_bound=.5)

In [None]:
# ca.table[['target']].query('abs(target)>.2').sort_values('target')

In [None]:
# import seaborn as sns

# df_corr = df[cols_features[:5]].corr()
# #display( df_corr.style.background_gradient(axis=None, vmin=-1, vmax=1, cmap='rainbow') )

# mask = np.triu(np.ones_like(df_corr, dtype=bool))
# plt.figure(figsize=(5,3))
# sns.heatmap(
#     df_corr, 
#     mask=mask, 
#     #center=0, 
#     annot=True, 
#     fmt='.2f', 
#     square=True, 
#     cmap='rainbow'
# )
# plt.show()

In [None]:
import seaborn as sns

# Correlation matrix of the features and the target, drawn as a heatmap of
# absolute values; the upper triangle including the diagonal is masked out.
df_corr = df[cols_features + ['target']].corr()
mask = np.triu(np.ones(df_corr.shape, dtype=bool))

plt.figure(figsize=(12, 9))
heatmap_opts = dict(mask=mask, annot=True, fmt='.2f', square=True, cmap='rainbow')
sns.heatmap(df_corr.abs(), **heatmap_opts)
plt.show()

In [None]:
# corr_lim = 0.4

# fet_other = sorted(set(cols_features))
# hi_corr = []
# while len(fet_other) > 1:
#     fet_name = fet_other[0]
#     fet_other = fet_other[1:]
#     df_corr_fet = df_corr.loc[fet_name,fet_other]
#     hi_corr_ = [ [ fet_name, f, df_corr.loc[fet_name, f] ] for f in df_corr_fet[ df_corr_fet.abs()> corr_lim ].index ]
#     if len(hi_corr_)>0: hi_corr.extend(hi_corr_)

# # hi_corr    
# pd.DataFrame(hi_corr ,columns=['fet0','fet1','corr'])

In [None]:
# pip install seaborn

In [None]:
# df[cols_features+['target']].corr().abs()

In [None]:
# # корреляции признаков и таргета
# import matplotlib.pyplot as plt
# import seaborn as sb
# fig, ax = plt.subplots(figsize=(10,10))
# sb.heatmap( df[cols_features+['target']].corr(numeric_only=True), annot=True,ax=ax,cmap="rainbow")
# plt.show()

In [None]:
# # список признаков с высокой корреляцией с target
# cols_features_ =[
# #    'id',
# #    'fixed_acidity',
#     'volatile_acidity',
# #    'citric_acid',
# #    'residual_sugar',
#     'chlorides',
# #    'free_sulfur_dioxide',
# #    'total_sulfur_dioxide',
#     'density',
# #    'pH',
# #    'sulphates',
#     'alcohol',
# #    'color',
#     'color_',
# #    'target'
# #    'quality',    
# ]

# # корреляции признаков и таргета
# import matplotlib.pyplot as plt
# import seaborn as sb
# fig, ax = plt.subplots(figsize=(5,5))
# sb.heatmap( df[cols_features_+['target']].corr(numeric_only=True), annot=True,ax=ax,cmap="rainbow")

## поиск выбросов

Выявляем аномалии с помощью Isolation Forest   
https://habr.com/ru/companies/otus/articles/881086/

* Строим дерево, где каждый узел случайно выбирает один признак и случайное значение разбиения.
* Рекурсивно делим данные, пока каждая точка не окажется в своём отдельном листе.
* Считаем аномальность точки по тому, насколько быстро она была изолирована (чем короче путь, тем аномальнее).

In [None]:
from sklearn.ensemble import IsolationForest

# Flag anomalies with an Isolation Forest. contamination=0.1 marks about 10%
# of the rows as anomalous (the earlier comment said 5%, which did not match
# the value actually passed). The seed is fixed so that the flagged rows, and
# every later step that depends on them, are the same on each run.
df['is_anomaly'] = IsolationForest(
    contamination=0.1,
    random_state=42,
).fit_predict(df[cols_features]) == -1

In [None]:
# Row counts per (class, anomaly flag) pair.
df.groupby(['target','is_anomaly'])[['id']].count()

In [None]:
from sklearn.decomposition import PCA
# Project the feature matrix onto its first two principal components.
# Note: the features are passed to PCA as they are, without rescaling.
pca_2d = PCA(n_components=2)
X2 = pca_2d.fit_transform(df[cols_features].to_numpy())
display(X2.shape)

In [None]:
# 2-D PCA projection with anomalous and regular rows drawn as separate series.
fig, ax = plt.subplots(figsize=(7, 7))
for flag in (True, False):
    picked = (df['is_anomaly'] == flag).to_numpy()
    ax.scatter(X2[picked, 0], X2[picked, 1], s=1, label=str(flag))
ax.grid()
ax.legend()
plt.show()

In [None]:
# Move the flagged rows to df_anom and keep only the regular rows in df,
# then show the size of both parts.
flagged = df['is_anomaly']
df_anom = df[flagged].reset_index(drop=True)
df = df[~flagged].reset_index(drop=True)
display(len(df), len(df_anom))

## слияние классов

In [None]:
# считаем центры классов как медианы
# оцениваем близость медиан

In [None]:
# Class centres: the per-class median of every feature.
df_median = df.groupby('target')[cols_features].median()

In [None]:
from sklearn.metrics import pairwise_distances
# Euclidean distances between the class medians, drawn as a heatmap with the
# upper triangle (diagonal included) masked out.
d = pairwise_distances(df_median, metric='euclidean')
mask = np.triu(np.ones(d.shape, dtype=bool))

plt.figure(figsize=(7, 6))
heatmap_opts = dict(mask=mask, annot=True, fmt='.2f', square=True, cmap='rainbow')
sns.heatmap(d, **heatmap_opts)
plt.show()

In [None]:
# For every class find the closest other class by distance between medians.
# The diagonal (distance of a class to itself) is set to +inf in a copy of
# the matrix so that it can never win the minimum.
d_offdiag = d.astype(float)
np.fill_diagonal(d_offdiag, np.inf)
neighbor_idx = d_offdiag.argmin(axis=0)

# Distance from each class to the neighbour picked above.
d_ = d[np.arange(len(neighbor_idx)), neighbor_idx]
pd.DataFrame({
    'class0': range(len(neighbor_idx)),
    'class1': neighbor_idx,
    'dist': d_,
})


In [None]:
from sklearn.cluster import MeanShift
# Cluster the classes with MeanShift and show the cluster label of each class.
# NOTE(review): the model is fitted on the rows of the distance matrix d, not
# on the class medians themselves — confirm this is intended.
MeanShift(bandwidth=7.).fit(d).labels_

In [None]:
# удалим классы 3,6

In [None]:
# Move the rows of classes 3 and 6 to df_3_6 and keep the rest in df,
# then show the size of both parts.
dropped = df['target'].isin([3, 6])
df_3_6 = df[dropped].reset_index(drop=True)
df = df[~dropped].reset_index(drop=True)
display(len(df), len(df_3_6))

In [None]:
# Merge classes pairwise: class 5 is folded into 4 and class 0 into 1;
# every other class id is kept as it is.
merged_into = {5: 4, 0: 1}
df['target_'] = df['target'].map(lambda t: merged_into.get(t, t))

In [None]:
# Row counts per merged class (could also be drawn with .plot.barh()).
df.groupby('target_')[['id']].count().T

## отбор признаков



Построение и отбор признаков. Часть 2: feature selection   
https://proglib.io/p/postroenie-i-otbor-priznakov-chast-2-feature-selection-2021-09-25

Методы отбора фич   
https://habr.com/ru/articles/264915/

Отбор признаков в задачах машинного обучения. Часть 1   
https://habr.com/ru/articles/550978/

In [None]:
# X = df[cols_features_].values
# X = df[cols_features].values

In [None]:
# import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif

# Rank the features by their estimated mutual information with the merged
# label and draw the ranking as a horizontal bar chart.
# The estimator is randomised, so the seed is fixed: the ranking, and the
# top-5 selection built from it, then stay the same between runs.
importances = mutual_info_classif(
    df[cols_features].values,
    df['target_'].values,
    random_state=42,
)
importances = pd.Series(importances, cols_features).sort_values()
importances.plot(kind='barh', color='teal', grid=True)
plt.show()

In [None]:
# Names of the five features with the highest importance, best first.
ranked = importances.sort_values(ascending=False)
cols_features_ = ranked.index[:5].tolist()
cols_features_

In [None]:
# from pandas.plotting import scatter_matrix
# # from matplotlib import colors as mcolors
# # colors = list(mcolors.CSS4_COLORS.keys()) 
# # colors = np.random.permutation(colors)
# colors = ['blue','green','red','cyan','magenta','yellow','black',]
# colors = { n:c for n,c in enumerate(colors) }

# scatter_matrix(
#         df[cols_features_], 
#         figsize=(7,7), 
#         diagonal='kde', 
#         alpha=.5, 
#         s=4, 
#         color=df['target'].map(colors) 
#     )
# plt.show()

In [None]:
import seaborn as sns
# sns.pairplot(df[cols_features_+['target']], hue='target',palette='rainbow')
# Pairwise plots of the selected features, coloured by the merged class.
pair_cols = cols_features_ + ['target_']
sns.pairplot(data=df[pair_cols], hue='target_', palette='viridis')
plt.show()

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# fsel = SelectKBest(chi2, k=4).fit( df[cols_features], df['target'])
# fsel.pvalues_
# # X = SelectKBest(chi2, k=4).fit_transform( df[cols_features], df['target'])

In [None]:
# from sklearn.feature_selection import RFE
# from sklearn.svm import SVR
# support = RFE(SVR(kernel="linear"), n_features_to_select=5, step=1).fit(df[cols_features].values, df['target'].values ).support_

In [None]:
# Model inputs: the full feature set (not the reduced cols_features_ subset)
# and the merged class label.
X = df[cols_features].to_numpy()
y = df['target_'].to_numpy()
display(X.shape, y.shape)

In [None]:
from sklearn.decomposition import PCA
# Two-component PCA projection of the model inputs, used for plotting.
pca_2d = PCA(n_components=2)
X2 = pca_2d.fit_transform(X)
display(X2.shape)

In [None]:
# from sklearn.manifold import TSNE
# X2 = TSNE(n_components=2).fit_transform(X)
# display( X2.shape )

In [None]:
# 2-D PCA projection with one scatter series per merged class.
fig, ax = plt.subplots(figsize=(7, 7))
for cls in np.unique(y):
    members = (y == cls)
    ax.scatter(X2[members, 0], X2[members, 1], s=1, label=str(cls))
ax.grid()
ax.legend()
plt.show()

# выделить тестовые данные

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for testing. The classes are unbalanced, so the
# split is stratified by label: train and test keep the same class
# proportions and no class can go missing from the small test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    random_state=42,
    stratify=y,
)
display(X_train.shape, X_test.shape)

In [None]:
# set(y_train) , set(y_test)

# загружаем и обучаем модель классификатора

Scikit-Learn : Nearest Neighbors.    
https://scikit-learn.org/stable/modules/neighbors.html     
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

In [None]:
# model = KNeighborsClassifier().fit(X_train,y_train)

Scikit-Learn : Model selection: choosing estimators and their parameters.    
https://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html

In [None]:
# metrics = [
#     'braycurtis',
#     'canberra',
#     'chebyshev',
#     'cityblock',
#     'correlation',
#     'cosine',
#     'dice',
#     'euclidean',
#     'hamming',
#     'jaccard',
#     #'jensenshannon',
#     #'kulczynski1',
#     # 'mahalanobis',
#     'minkowski',
#     'rogerstanimoto',
#     'russellrao',
#     #'seuclidean',
#     'sokalmichener',
#     'sokalsneath',
#     'sqeuclidean',
#     #'yule',
# ]

# param_grid= {
#     'n_neighbors': range(1,10),
#     'metric': metrics,
# }

In [None]:
# %%time

# from sklearn.model_selection import GridSearchCV
# # применяем методы поиска оптимальных гиперпараметров модели
# grid = GridSearchCV(
#         estimator=KNeighborsClassifier(),
#         param_grid=param_grid,
#     ).fit(X_train,y_train)

In [None]:
# display( grid.best_score_ )
# display( grid.best_params_ )

In [None]:
# model_ = grid.best_estimator_

---

In [None]:
# RandomForestClassifier?

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest with default hyper-parameters. The seed is fixed,
# matching the seeded train/test split, so the reported metrics are
# reproducible between runs.
# A smaller variant tried earlier: n_estimators=128, max_depth=8.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [None]:
# from sklearn.naive_bayes import GaussianNB
# model = GaussianNB().fit(X_train,y_train)

In [None]:
# import ipytest

In [None]:
# %%ipytest

# def test_no_duplicates():
#     assert df.duplicated().sum() == 0

# оценка результатов классификации

Scikit-Learn : Metrics and scoring: quantifying the quality of predictions   
https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
from sklearn.metrics import classification_report # метрики качества

In [None]:
# Per-class precision/recall/F1 on the training part.
y_pred = model.predict(X_train)
print(classification_report(y_train, y_pred))

In [None]:
# y_pred = model_.predict(X_test)
y_pred = model.predict(X_test)

print(classification_report(
    y_true=y_test,
    y_pred=y_pred,
))

In [None]:
from sklearn.metrics import confusion_matrix # количество ошибок
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay