In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectFromModel, SelectKBest, VarianceThreshold, f_classif
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics

In [2]:
data = load_iris()
x = data.data
y = data.target

In [3]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state = 42)

In [4]:
tree = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)

In [5]:
tree.fit(x_train, y_train)

In [6]:
y_pred = tree.predict(x_test)

In [7]:
def result_metrics(test, pred):
    result = pd.DataFrame({
            'accurancy' : [metrics.accuracy_score(test, pred)],
            'recall' : [metrics.recall_score(test, pred, average='micro')], 
            'f1' : [metrics.f1_score(test, pred, average='micro')]})
    return result

In [8]:
result_metrics(y_test, y_pred)

Unnamed: 0,accurancy,recall,f1
0,1.0,1.0,1.0


In [9]:
def roc_auc(test_x, test_y):
    y_probability = tree.predict_proba(test_x)
    fpr, tpr, thresholds = metrics.roc_curve(test_y, y_probability[:, 1], pos_label=1)
    roc_auc_tree = metrics.roc_auc_score(test_y, y_probability, multi_class='ovr')
    return roc_auc_tree

In [11]:
roc_auc(x_test, y_test)

1.0

In [10]:
y_probability = tree.predict_proba(x_test)

In [30]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_probability[:, 1], pos_label=1)

In [33]:
roc_auc_tree = metrics.roc_auc_score(y_test, y_probability, multi_class='ovr')
roc_auc_tree

1.0

________

In [12]:
featuers = pd.DataFrame(x, columns = data.feature_names)

In [13]:
var_threshold = VarianceThreshold(threshold = 0.2)
var_threshold.fit_transform(featuers)
var_threshold.get_support()

array([ True, False,  True,  True])

In [14]:
x_var = featuers.loc[:, featuers.columns != 'sepal width (cm)']
x_var_train, x_var_test, y_train, y_test = train_test_split(x_var, y, test_size=0.25, random_state = 42)

In [15]:
tree.fit(x_var_train, y_train)

In [16]:
var_predict = tree.predict(x_var_test)

In [18]:
result_metrics(y_test, var_predict)

Unnamed: 0,accurancy,recall,f1
0,1.0,1.0,1.0


In [17]:
roc_auc(x_var_test, y_test)

1.0

__________

In [19]:
k_best = SelectKBest(f_classif, k = 3)
k_best.fit_transform(x, y)
k_best.get_support()

array([ True, False,  True,  True])

####Same result like a past

_____

In [20]:
selector_tree = SelectFromModel(estimator=DecisionTreeClassifier(max_depth= 3, min_samples_leaf= 5), threshold = 0.2)
selector_tree.fit(x, y)
selector_tree.get_support()

array([False, False,  True,  True])

In [22]:
x_selector = x_var.loc[:, x_var.columns != 'sepal length (cm)']

In [23]:
x_selector_train, x_selector_test, y_train, y_test = train_test_split(x_selector, y, test_size=0.25, random_state = 42)

In [24]:
tree.fit(x_selector_train, y_train)

In [25]:
selector_predict = tree.predict(x_selector_test)

In [26]:
result_metrics(y_test, selector_predict)

Unnamed: 0,accurancy,recall,f1
0,1.0,1.0,1.0


In [28]:
roc_auc(x_selector_test, y_test)

1.0

_____

In [32]:
pca = PCA(n_components= 2)
x_train_pca = pca.fit_transform(x_train)

In [33]:
tree.fit(x_train_pca, y_train)

In [34]:
x_test_pca = pca.fit_transform(x_test)
pca_predict = tree.predict(x_test_pca)

In [35]:
result_metrics(y_test, pca_predict)

Unnamed: 0,accurancy,recall,f1
0,0.947368,0.947368,0.947368


In [36]:
roc_auc(x_test_pca, y_test)

0.9978092894759562

________

In [40]:
t_svd = TruncatedSVD(n_components=2)
x_svd_train = t_svd.fit_transform(x_train)
x_svd_test = t_svd.fit_transform(x_test)

In [41]:
tree.fit(x_svd_train, y_train)

In [42]:
svd_predict = tree.predict(x_svd_test)

In [43]:
result_metrics(y_test, svd_predict)

Unnamed: 0,accurancy,recall,f1
0,0.921053,0.921053,0.921053


In [44]:
roc_auc(x_svd_test, y_test)

1.0

In [47]:
lda = LinearDiscriminantAnalysis(n_components=2)
x_train_lda = lda.fit_transform(x_train, y_train)
x_test_lda = lda.fit_transform(x_test, y_test)

In [48]:
tree.fit(x_train_lda, y_train)

In [49]:
lda_predict = tree.predict(x_test_lda)

In [50]:
result_metrics(y_test, lda_predict)

Unnamed: 0,accurancy,recall,f1
0,0.973684,0.973684,0.973684


In [51]:
roc_auc(x_test_lda, y_test)

0.9762475179141846

###За отриманими даними, можна сказати, що вибір та перетворення характеристик не дав осаточного результату над даним датасетом. Всі отримані значення метрик дають надійний результат побудованих моделей. 