# Klasyfikacja jakości czerwonego wina <a class="tocSkip">
'''Hubert Łapsa
Mateusz Wójcik'''

In [None]:
# Analiza danych <a class="analiza">

## Usuwanie części danych (problem brakujących danych)

In [None]:
import pandas as pd

# Read "winequality-red.csv" (path relative to the working directory) into a DataFrame.
df = pd.read_csv("winequality-red.csv")
# Helper returning `part` percent of `whole`.
def percentage(part, whole):
    """Return `part` percent of `whole` as a float.

    Both arguments are converted with ``float()`` first, so anything that
    ``float()`` accepts (ints, floats, numeric strings) is allowed.
    Example: ``percentage(15, 200)`` gives ``30.0``.
    """
    product = float(part) * float(whole)
    return product / 100

# Number of 'citric acid' entries to remove: 15 percent of the column length.
rnd = int(percentage(15, df['citric acid'].size))

# Randomly draw the entries that will be removed. `sample` already draws from
# the whole column, so no extra head()/slicing is needed.
data = df['citric acid'].sample(rnd)

# Mark the drawn entries as missing. NaN (instead of an empty string) keeps the
# column numeric, so isnull() and imputers recognise the cells as missing data.
df.loc[data.index, 'citric acid'] = float('nan')

# Save the data to a CSV file
#df.to_csv('data_after_delete_some_records.csv')

## Problem danych niezrównoważonych

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the dataset from 'winequality-red.csv'.
red_wines = pd.read_csv('winequality-red.csv')
# Not the last expression of the cell, so this preview is not rendered.
red_wines.head()

# Y is the last column (kept as a one-column DataFrame); X is everything else.
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Data balance: number of rows per 'quality' value.
sns.countplot(data=red_wines, x='quality')
plt.show()

# Outlier analysis: one box plot per X column, grouped by 'quality'.
for feature_name in X.columns:
    sns.boxplot(data=red_wines, x='quality', y=feature_name)
    plt.show()

In [None]:
# Histogram of the (imbalanced) values in Y: six bins, no KDE curve.
sns.set(color_codes=True)
sns.distplot(Y, bins=6, kde=False)

In [None]:
# Merge classes: quality 8 is folded into 7 and quality 3 into 4.
# The replacement is restricted to the 'quality' column. A frame-wide
# replace(8, 7) / replace(3, 4) would also overwrite every feature value that
# happens to equal 8 or 3.
red_wines = red_wines.assign(quality=red_wines['quality'].replace({8: 7, 3: 4}))

# Y is the last column (one-column DataFrame); X is everything else.
X = red_wines.iloc[:, :-1]
Y = red_wines.iloc[:, -1:]

# Histogram of Y after merging: four bins, no KDE curve.
sns.set(color_codes=True)
sns.distplot(Y, kde=False, bins=4)

In [None]:
# Remove surplus data: drop a random half of the quality-5 rows, then a
# random half of the quality-6 rows.
surplus_q5 = red_wines.query('quality == 5').sample(frac=.5).index
red_wines = red_wines.drop(surplus_q5)
surplus_q6 = red_wines.query('quality == 6').sample(frac=.5).index
red_wines = red_wines.drop(surplus_q6)

# Re-derive X (all but the last column) and Y (last column) from the reduced frame.
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Histogram of Y after the removal: four bins, no KDE curve.
sns.set(color_codes=True)
sns.distplot(Y, bins=4, kde=False)

# Oversampling with SMOTE.
from imblearn.over_sampling import SMOTE
# The strategy is passed by keyword and the data is resampled with
# fit_resample(); fit_sample() was only a deprecated alias and has been removed
# from current imbalanced-learn releases.
smote = SMOTE(sampling_strategy="minority")
X, Y = smote.fit_resample(X, Y)
sns.distplot(Y, kde=False, bins=6)

# Join X and Y side by side. The default axis=0 would stack the Y rows
# underneath the X rows and fill the non-matching columns with NaN.
frames = [X, Y]
red_wines = pd.concat(frames, axis=1)
# red_wines.to_csv("red_wines_after_over_sampling.csv")

# Replace the in-memory frame with the contents of "data.csv".
red_wines = pd.read_csv("data.csv")
# Y is the last column (one-column DataFrame); X is everything else.
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Boolean frame marking missing cells (isna is the alias of isnull).
red_wines.isna()

In [None]:
# Outlier analysis: count, per column, the values lying more than 1.5 * IQR
# below the first quartile or above the third quartile of X.
Q1, Q3 = X.quantile(0.25), X.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
below_fence = red_wines < lower_fence
above_fence = red_wines > upper_fence
(below_fence | above_fence).sum()

In [None]:
# Read the dataset from 'winequality-red.csv'.
red_wines = pd.read_csv('winequality-red.csv')
# Not the last expression of the cell, so this preview is not rendered.
red_wines.head()

# Y is the last column (kept as a one-column DataFrame); X is everything else.
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Data balance: number of rows per 'quality' value.
sns.countplot(data=red_wines, x='quality')
plt.show()

# Outlier analysis: one box plot per X column, grouped by 'quality'.
for feature_name in X.columns:
    sns.boxplot(data=red_wines, x='quality', y=feature_name)
    plt.show()

In [None]:
# Histogram of the (imbalanced) values in Y: six bins, no KDE curve.
sns.set(color_codes=True)
sns.distplot(Y, bins=6, kde=False)

In [None]:
# Merge classes: quality 8 is folded into 7 and quality 3 into 4.
# The replacement is restricted to the 'quality' column. A frame-wide
# replace(8, 7) / replace(3, 4) would also overwrite every feature value that
# happens to equal 8 or 3.
red_wines = red_wines.assign(quality=red_wines['quality'].replace({8: 7, 3: 4}))

# Y is the last column (one-column DataFrame); X is everything else.
X = red_wines.iloc[:, :-1]
Y = red_wines.iloc[:, -1:]

# Histogram of Y after merging: four bins, no KDE curve.
sns.set(color_codes=True)
sns.distplot(Y, kde=False, bins=4)

In [None]:
# Remove surplus data: drop a random half of the quality-5 rows, then a
# random half of the quality-6 rows.
surplus_q5 = red_wines.query('quality == 5').sample(frac=.5).index
red_wines = red_wines.drop(surplus_q5)
surplus_q6 = red_wines.query('quality == 6').sample(frac=.5).index
red_wines = red_wines.drop(surplus_q6)

# Re-derive X (all but the last column) and Y (last column) from the reduced frame.
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Histogram of Y after the removal: four bins, no KDE curve.
sns.set(color_codes=True)
sns.distplot(Y, bins=4, kde=False)

# Oversampling with SMOTE.
from imblearn.over_sampling import SMOTE
# The strategy is passed by keyword and the data is resampled with
# fit_resample(); fit_sample() was only a deprecated alias and has been removed
# from current imbalanced-learn releases.
smote = SMOTE(sampling_strategy="minority")
X, Y = smote.fit_resample(X, Y)
sns.distplot(Y, kde=False, bins=6)

# Join X and Y side by side. The default axis=0 would stack the Y rows
# underneath the X rows and fill the non-matching columns with NaN.
frames = [X, Y]
red_wines = pd.concat(frames, axis=1)
# red_wines.to_csv("red_wines_after_over_sampling.csv")

# Replace the in-memory frame with the contents of "data.csv".
red_wines = pd.read_csv("data.csv")
# Y is the last column (one-column DataFrame); X is everything else.
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Boolean frame marking missing cells (isna is the alias of isnull).
red_wines.isna()

## Problem danych brakujących

In [None]:
from sklearn.impute import KNNImputer
import pandas as pd

# Load the data to be completed from 'data.csv'.
data = pd.read_csv('data.csv')

# Imputer declaration; n_neighbors is the number of neighbours taken into account.
imputer = KNNImputer(n_neighbors=5)

# Fill in the missing values (fitting and transforming on the same data).
df_filled = imputer.fit_transform(data)

## Problem danych odstających

In [None]:
# Outlier analysis: count, per column, the values lying more than 1.5 * IQR
# below the first quartile or above the third quartile of X.
Q1, Q3 = X.quantile(0.25), X.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
below_fence = red_wines < lower_fence
above_fence = red_wines > upper_fence
(below_fence | above_fence).sum()

# Preprocessing danych <a class="preprocessing">


## Normalizacja

In [None]:
from sklearn.preprocessing import MinMaxScaler

import pandas as pd

df = pd.read_csv('data.csv')

# A single scaler instance is enough; `mms` stays available as an alias
# because the original cell bound that name as well.
min_max_scaler = MinMaxScaler()
mms = min_max_scaler

# Min-max normalisation of every column of the frame.
# NOTE(review): this includes the last column, which other cells use as the
# class label - confirm that scaling it is intended.
x = df.values
x_scaled = min_max_scaler.fit_transform(x)

# Rebuild the frame with the original column names and index; a bare
# pd.DataFrame(x_scaled) would replace the column names with 0..n-1.
df = pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

## Standaryzacja

In [None]:
from sklearn.preprocessing import StandardScaler

import pandas as pd

df = pd.read_csv('data.csv')

# Standardise every column of the frame.
# NOTE(review): this includes the last column, which other cells use as the
# class label - confirm that scaling it is intended.
x = df.values

stdsc = StandardScaler()
x_scaled = stdsc.fit_transform(x)

# Rebuild the frame with the original column names and index; a bare
# pd.DataFrame(x_scaled) would replace the column names with 0..n-1.
df = pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

# Redukcja wymiarowości <a class="reduction">

## Selekcja cech (Sequential Backward Selector)

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SBS
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Hold out 25 % of the rows as the test subset.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.25)

# Estimator whose cross-validated accuracy drives the selection.
knn = KNeighborsClassifier(n_neighbors=4)

# forward=False: backward selection down to 3 features, 4-fold CV,
# n_jobs=-1 as in the other selection cell.
selector_options = dict(
    k_features=3,
    forward=False,
    floating=False,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)
sbs = SBS(knn, **selector_options)

# Labels are flattened to 1-D before fitting.
train_labels = Y_TRAIN.values.ravel()
sbs = sbs.fit(X_TRAIN, train_labels)

## Selekcja cech (Sequential Feature Selector)

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
X = red_wines.iloc[:, :-1]
Y = red_wines.iloc[:, -1:]

# Hold out 25 % of the rows as the test subset.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.25)

knn = KNeighborsClassifier(n_neighbors=4)

# This cell used to be a byte-for-byte copy of the backward-selection cell
# (forward=False). forward=True makes it the forward counterpart announced by
# the section heading.
# NOTE(review): confirm that forward selection is what was intended here.
sfs = SFS(knn,
          k_features=3,
          forward=True,
          floating=False,
          scoring='accuracy',
          cv=4,
          n_jobs=-1)

# Labels are flattened to 1-D before fitting.
sfs = sfs.fit(X_TRAIN, Y_TRAIN.values.ravel())

## Analiza głównych składowych

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# X_train_std / X_test_std were used here without ever being defined.
# They are now derived from the X_TRAIN / X_TEST split of the preceding cells:
# the scaler is fitted on the training subset only and then applied to both.
pca_scaler = StandardScaler()
X_train_std = pca_scaler.fit_transform(X_TRAIN)
X_test_std = pca_scaler.transform(X_TEST)

# Project the standardised features onto the first two principal components;
# PCA is fitted on the training subset and reused for the test subset.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
# Example follow-up: algorithm.fit(X_train_pca, Y_TRAIN.values.ravel())

### Wybrane algorytmy uczenia maszynowego

## Algorytm k-najbliższych sąsiadów (KNN)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# k-nearest-neighbours classifier: 5 neighbours, Minkowski metric with p=2.
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
X = red_wines.iloc[:, :-1]
Y = red_wines.iloc[:, -1:]

# Hold out 25 % of the rows; random_state=0 makes the split repeatable.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.25, random_state=0)

# Y_TRAIN is a one-column DataFrame; flatten it to 1-D as the other classifier
# cells do, so the estimator receives labels in the expected shape.
knn.fit(X_TRAIN, Y_TRAIN.values.ravel())

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# k-nearest-neighbours classifier: 5 neighbours, Minkowski metric with p=2.
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
X = red_wines.iloc[:, :-1]
Y = red_wines.iloc[:, -1:]

# Hold out 25 % of the rows; random_state=0 makes the split repeatable.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.25, random_state=0)

# Y_TRAIN is a one-column DataFrame; flatten it to 1-D as the other classifier
# cells do, so the estimator receives labels in the expected shape.
knn.fit(X_TRAIN, Y_TRAIN.values.ravel())

## Klasyfikator drzewa decyzyjnego
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Hold out 25 % of the rows as the test subset.
split_parts = train_test_split(X, Y, test_size=0.25)
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = split_parts

# Decision tree with default settings, trained on the flattened labels.
dt = DecisionTreeClassifier()
train_labels = Y_TRAIN.values.ravel()
dt.fit(X_TRAIN, train_labels)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Hold out 25 % of the rows as the test subset.
split_parts = train_test_split(X, Y, test_size=0.25)
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = split_parts

# Decision tree with default settings, trained on the flattened labels.
dt = DecisionTreeClassifier()
train_labels = Y_TRAIN.values.ravel()
dt.fit(X_TRAIN, train_labels)

## Regresja logistyczna

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# Hold out 20 % of the rows as the test subset.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.20)

# Logistic regression in one-vs-rest mode with a raised iteration limit.
model = LogisticRegression(max_iter=10000, multi_class='ovr')

# Labels are flattened to 1-D before fitting.
train_labels = Y_TRAIN.values.ravel()
model.fit(X_TRAIN, train_labels)

## Maszyna wektorów nośnych


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")

X = red_wines.iloc[:, :-1]
Y = red_wines.iloc[:, -1:]

# Hold out 20 % of the rows. The feature subsets use the same upper-case names
# as every other cell; previously this cell bound X_train / X_test only, which
# left a stale X_TEST from an earlier split next to the fresh Y_TEST.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.20)
# Lower-case aliases kept for backward compatibility.
X_train, X_test = X_TRAIN, X_TEST

# Support vector classifier with a linear kernel, trained on flattened labels.
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_TRAIN, Y_TRAIN.values.ravel())

## Las losowy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Random forest of 1000 trees with a fixed seed. A classifier (not a regressor)
# is used because the quality label is treated as a class everywhere else and
# the evaluation code computes a confusion matrix and accuracy, which need
# discrete predictions.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)

# Load the data; Y is the last column, X is everything else.
# (The stray indentation that made this cell a syntax error has been removed.)
red_wines = pd.read_csv("data.csv")
X = red_wines.iloc[:, :-1]
Y = red_wines.iloc[:, -1:]

# Hold out 25 % of the rows as the test subset.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.25)

# Labels are flattened to 1-D, as in the other classifier cells.
rf.fit(X_TRAIN, Y_TRAIN.values.ravel())

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Random forest of 1000 trees with a fixed seed. A classifier (not a regressor)
# is used because the quality label is treated as a class everywhere else and
# the evaluation code computes a confusion matrix and accuracy, which need
# discrete predictions.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)

# Load the data; Y is the last column, X is everything else.
# (The stray indentation that made this cell a syntax error has been removed.)
red_wines = pd.read_csv("data.csv")
X = red_wines.iloc[:, :-1]
Y = red_wines.iloc[:, -1:]

# Hold out 25 % of the rows as the test subset.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.25)

# Labels are flattened to 1-D, as in the other classifier cells.
rf.fit(X_TRAIN, Y_TRAIN.values.ravel())

### Metryka
from sklearn.model_selection import train_test_split
import pandas as pd

# Splitting into subsets

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# 75 % training / 25 % test, repeatable thanks to the fixed random_state.
split_parts = train_test_split(X, Y, random_state=0, test_size=0.25)
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = split_parts

# Nauka

# TODO: train the chosen algorithm here and compute y_pred (e.g. y_pred = model.predict(X_TEST))

# Effectiveness test: compare the predictions y_pred with the test labels Y_TEST.
# NOTE(review): y_pred is not assigned anywhere in the visible notebook; it
# presumably has to come from a fitted model's predict(X_TEST) - confirm.
from sklearn import metrics

# Confusion matrix of the test labels versus the predictions.
cm = metrics.confusion_matrix(Y_TEST, y_pred)
print(cm)
# Accuracy of the predictions.
accuracy = metrics.accuracy_score(Y_TEST, y_pred)
print("Accuracy score:", accuracy)
# NOTE(review): with average='micro' on single-label multiclass data, precision
# and recall are expected to equal the accuracy - verify whether 'macro' or
# 'weighted' averaging was intended.
precision = metrics.precision_score(Y_TEST, y_pred, average='micro')
print("Precision score:", precision)
recall = metrics.recall_score(Y_TEST, y_pred, average='micro')
print("Recall score:", recall)

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Splitting into subsets

# Load the data; Y is the last column, X is everything else.
red_wines = pd.read_csv("data.csv")
Y = red_wines.iloc[:, -1:]
X = red_wines.iloc[:, :-1]

# 75 % training / 25 % test, repeatable thanks to the fixed random_state.
split_parts = train_test_split(X, Y, random_state=0, test_size=0.25)
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = split_parts

In [1]:
# Effectiveness test: compare the predictions y_pred with the test labels Y_TEST.
# NOTE(review): this cell needs Y_TEST (from the split cell) and y_pred; y_pred
# is not assigned anywhere in the visible notebook and presumably has to come
# from a fitted model's predict(X_TEST) - confirm.

from sklearn import metrics

# Confusion matrix of the test labels versus the predictions.
cm = metrics.confusion_matrix(Y_TEST, y_pred)
print(cm)
# Accuracy of the predictions.
accuracy = metrics.accuracy_score(Y_TEST, y_pred)
print("Accuracy score:", accuracy)
# NOTE(review): with average='micro' on single-label multiclass data, precision
# and recall are expected to equal the accuracy - verify whether 'macro' or
# 'weighted' averaging was intended.
precision = metrics.precision_score(Y_TEST, y_pred, average='micro')
print("Precision score:", precision)
recall = metrics.recall_score(Y_TEST, y_pred, average='micro')
print("Recall score:", recall)

NameError: name 'Y_TEST' is not defined