In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv


In [2]:
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression
from sklearn.base import TransformerMixin, BaseEstimator 
from sklearn.decomposition import PCA 
from plotly import express as px 
from sklearn import set_config 
# Ask scikit-learn to use its 'diagram' display mode for estimators shown as cell output.
set_config(display="diagram")

In [3]:
class FitModelBeforePredictingException(Exception):
    """Error type for requesting predictions from a model that has not been fitted.

    Parameters
    ----------
    message : str, optional
        Human-readable explanation. It is stored on ``self.message`` and also
        passed to ``Exception`` so that ``str(exc)`` returns it.
    """

    def __init__(self, message = 'You need to fit the model before predicting the values.'):
        # Initialise the base exception first, then expose the text as an attribute.
        super().__init__(message)
        self.message = message

class Perceptron(TransformerMixin, BaseEstimator): 
    """Binary perceptron trained with single-sample stochastic updates.

    Parameters
    ----------
    epoch : int, default=1000
        Number of weight updates performed by ``fit``. Each update uses ONE
        randomly drawn training row, so this is a count of single-sample steps
        rather than of full passes over the data.
    learning_rate : float, default=0.001
        Multiplier applied to every weight correction.
    random_state : int or None, default=None
        Seed for the row sampling in ``fit``. ``None`` keeps the historical
        behaviour of drawing from NumPy's global ``np.random`` state.

    Attributes
    ----------
    w : np.ndarray of shape (n_features + 1, 1)
        Learned weights, created by ``fit``. ``w[0]`` is the bias term because a
        constant column of ones is prepended to the features.
    """
    
    def __init__(self, epoch = 1000, learning_rate = 0.001, random_state = None):
        self.epoch = epoch 
        self.learning_rate = learning_rate 
        self.random_state = random_state
        
    def step(self, prediction):
        """Heaviside step activation for a scalar: 1 if ``prediction > 0`` else 0."""
        if prediction > 0:
            return 1 
        else: return 0
        
    def fit(self, X: np.ndarray, y: np.ndarray):
        """Learn the weights from ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        ``X`` and ``y`` may be NumPy arrays or pandas objects; both are converted
        with ``np.asarray`` first, so a ``Series`` no longer has to be unwrapped
        with ``.values`` by the caller.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, has no rows, or ``y`` does not
            contain exactly one label per row.

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype = float)
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array of shape (n_samples, n_features).')
        rows, cols = X.shape 
        if rows == 0:
            raise ValueError('Cannot fit a Perceptron on an empty dataset.')
        y = np.asarray(y).reshape((rows, 1))
        self.w = np.ones((cols+1, 1))
        
        # Prepend a column of ones so that w[0] plays the role of the bias.
        X_ = np.insert(X, 0, 1, axis = 1)
        
        # With no seed, fall back to the global NumPy generator (original behaviour);
        # otherwise use a private generator so results are reproducible.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        
        for _ in range(self.epoch):
            # accessing a random row
            idx = rng.randint(rows)
            sample = X_[idx, :].reshape((cols+1, 1))
        
            y_prime = self.step(sample.T.dot(self.w)[0][0]) 
            # Perceptron rule: the weights only move when the prediction is wrong.
            self.w = self.w + self.learning_rate*(y[idx, 0] - y_prime)*sample
            
        return self
    
    def predict(self, X: np.ndarray):
        """Return 0/1 predictions for ``X`` as an integer array of shape (n_samples, 1).

        Raises
        ------
        FitModelBeforePredictingException
            If ``fit`` has not been called yet (no weights are available).
        """
        if not hasattr(self, 'w'):
            raise FitModelBeforePredictingException()
        X_ = np.insert(np.asarray(X, dtype = float), 0, 1, axis = 1)
        # Vectorised form of applying ``step`` to every row's weighted sum.
        return (X_.dot(self.w) > 0).astype(int).reshape(-1, 1)
            
    def transform(self, X, y = None):
        """Placeholder that returns the estimator itself, unchanged.

        ``y`` is optional so that ``fit_transform`` (inherited from
        ``TransformerMixin``), which calls ``transform(X)`` without labels, does
        not fail with a missing-argument error.
        NOTE(review): the return value is kept as ``self`` for backward
        compatibility; it is not a transformed copy of ``X``.
        """
        return self 
    
    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        # Flatten both sides so accuracy_score compares two 1-D label vectors.
        y_pred = self.predict(X).ravel()
        return accuracy_score(np.asarray(y).ravel(), y_pred)
    
    


In [4]:
# Read the red-wine quality dataset from the Kaggle input directory and display it.
wine = pd.read_csv(
    '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [5]:
# Collapse the quality score into a binary target: 3-5 -> 0, 6-8 -> 1.
quality_to_label = {3:0, 4:0, 5:0, 6:1, 7:1, 8:1}

# The guard makes this cell safe to re-run: mapping an already binarised column
# through the dict would find no key for 0 or 1 and turn every label into NaN.
if not wine['quality'].isin([0, 1]).all():
    wine['quality'] = wine['quality'].map(quality_to_label)

In [6]:
# Separate the features from the binary target and hold out 20% of the rows for testing.
X, y = wine.drop(columns = ['quality']), wine.quality
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, train_size = 0.8)

# Standardise every feature column. The slice width comes from the data instead of
# a hard-coded 11, so it stays correct if the set of feature columns changes.
ct = ColumnTransformer([
    ('preprocess', StandardScaler(), slice(0, X_train.shape[1]))
])

#Creating the pipeline 
pipe = Pipeline([
    ('preprocess', ct), 
    ('perceptron', Perceptron(epoch=1000, learning_rate = 0.01))
])

# Perceptron.fit picks training rows using NumPy's global random state; seeding it
# here makes the fitted weights (and the score below) reproducible across runs.
np.random.seed(0)
pipe.fit(X_train, y_train.values)

In [7]:
# Score the fitted pipeline on the held-out test split.
pipe.score(X = X_test, y = y_test)

0.7125

In [8]:
# Print the column names, non-null counts and dtypes of the test features.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320 entries, 1109 to 1023
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         320 non-null    float64
 1   volatile acidity      320 non-null    float64
 2   citric acid           320 non-null    float64
 3   residual sugar        320 non-null    float64
 4   chlorides             320 non-null    float64
 5   free sulfur dioxide   320 non-null    float64
 6   total sulfur dioxide  320 non-null    float64
 7   density               320 non-null    float64
 8   pH                    320 non-null    float64
 9   sulphates             320 non-null    float64
 10  alcohol               320 non-null    float64
dtypes: float64(11)
memory usage: 30.0 KB


In [9]:
# Count how many rows fall into each value of the quality column.
wine['quality'].value_counts()

1    855
0    744
Name: quality, dtype: int64

In [10]:
# Same split as for the perceptron (random_state = 0) so both models are fitted
# and scored on identical train/test rows.
X, y = wine.drop(columns = ['quality']), wine.quality
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, train_size = 0.8)

# Standardise every feature column. The slice width comes from the data instead of
# a hard-coded 11, so it stays correct if the set of feature columns changes.
ct = ColumnTransformer([
    ('preprocess', StandardScaler(), slice(0, X_train.shape[1]))
])

#Creating the pipeline 
pipe = Pipeline([
    ('preprocess', ct), 
    ('logistic', LogisticRegression())
])

pipe.fit(X_train, y_train)

In [11]:
# Score the fitted pipeline on the held-out test split.
pipe.score(X = X_test, y = y_test)

0.75

In [12]:
# Project the standardised training features onto two principal components for plotting.
pca = PCA(n_components = 2)

# Carry over X_train's index: train_test_split shuffles rows, so y_train keeps the
# original (non-contiguous) row labels. Giving `vis` the same index means any
# index-based alignment between `vis` and `y_train` pairs each point with its own label.
vis = pd.DataFrame(
    pca.fit_transform(ct.fit_transform(X_train)),
    columns = ['f1', 'f2'],
    index = X_train.index,
)

In [13]:
# Scatter the two projected components, coloured by the training labels.
px.scatter(
    x = vis['f1'],
    y = vis['f2'],
    color = y_train,
)