In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import pandas as pd

from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report

In [3]:
csv = pd.read_csv("../Data/Biral_Products_csv.csv", delimiter=",", header=0)
labels = csv.Gruppenbezeichnung
csv.drop(columns=["Bezeichnung", "Gruppenbezeichnung", "Beschreibung", "Anwendungsgebiet"], inplace=True)

data = np.array(csv)
print(data.shape)

(222, 15)


In [4]:
imputer_scaler = Imputer(missing_values="NaN", strategy="most_frequent", verbose=1)
data = imputer_scaler.fit_transform(data)
print(data.shape)

(222, 15)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33)

#### Classifier Vergleich mit allen Features

In [24]:
names = ["Naive Bayes",
         "KNN",
         "Random Forest",
         "Decision Tree",
         "Linear SVM",
         "RBF SVM"]




for name, clf in zip(names, classifiers):
    clf_model = clf.fit(X_train, y_train)
    clf_pred = clf.predict(X_test)
    print(f"{name}:")
    print("Test Score: {:.3%}\n".format(clf_model.score(X_test, y_test)))
    print(classification_report(y_test, clf_pred))

Naive Bayes:
Test Score: 79.730%

             precision    recall  f1-score   support

       BLUE       0.59      1.00      0.74        20
      GREEN       1.00      0.90      0.95        10
        RED       0.97      0.68      0.80        44

avg / total       0.87      0.80      0.80        74

KNN:
Test Score: 79.730%

             precision    recall  f1-score   support

       BLUE       0.94      0.80      0.86        20
      GREEN       0.50      0.50      0.50        10
        RED       0.81      0.86      0.84        44

avg / total       0.80      0.80      0.80        74

Random Forest:
Test Score: 97.297%

             precision    recall  f1-score   support

       BLUE       1.00      1.00      1.00        20
      GREEN       1.00      0.80      0.89        10
        RED       0.96      1.00      0.98        44

avg / total       0.97      0.97      0.97        74

Decision Tree:
Test Score: 97.297%

             precision    recall  f1-score   support

       BLU

  'precision', 'predicted', average, warn_for)


#### Versuch mit Recursive Feature Elimination

In [24]:
names = ["Random Forest",
         "Decision Tree",
         "Linear SVM"]

classifiers = [RandomForestClassifier(random_state=0, max_depth=3),
               DecisionTreeClassifier(max_depth=3),
               SVC(kernel="linear", C=0.025)]

for name, clf in zip(names, classifiers):
    selector = RFE(clf, n_features_to_select=3, step=1)
    selector.fit(X_train, y_train)
    clf_pred = selector.predict(X_test)
    print(f"{name}:")
    print("Test Score: {:.3%}\n".format(selector.score(X_test, y_test)))
    print(classification_report(y_test, clf_pred))
    print("Features sorted by their rank:")
    print(sorted(zip(map(lambda x: round(x, 4), selector.ranking_), csv.columns)))
    print()

Random Forest:
Test Score: 93.243%

             precision    recall  f1-score   support

       BLUE       1.00      0.71      0.83        17
      GREEN       1.00      1.00      1.00        15
        RED       0.89      1.00      0.94        42

avg / total       0.94      0.93      0.93        74

Features sorted by their rank:
[(1, 'Mediumtemperatur max.'), (1, 'Mediumtemperatur min.'), (1, 'pro 100 m Höhe'), (2, 'EEI-Wert'), (3, 'Nennstrom min.'), (4, 'Baulänge'), (5, 'Nettogewicht'), (6, 'Leistung P min.'), (7, 'Nennstrom max.'), (8, 'Nennweite'), (9, 'Förderhöhe H max.'), (10, 'Betriebsdruck max.'), (11, 'Leistung P max.'), (12, 'bei   75°C Wassertemperatur'), (13, 'Nennweite G/DN')]

Decision Tree:
Test Score: 95.946%

             precision    recall  f1-score   support

       BLUE       1.00      0.88      0.94        17
      GREEN       1.00      0.93      0.97        15
        RED       0.93      1.00      0.97        42

avg / total       0.96      0.96      0.96     