In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
# Load the wine data set (path is relative to the working directory) and
# preview its first rows.
DATA_FILE = 'Wine.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [3]:
# Print column names, non-null counts and dtypes to check the schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Alcohol               178 non-null    float64
 1   Malic_Acid            178 non-null    float64
 2   Ash                   178 non-null    float64
 3   Ash_Alcanity          178 non-null    float64
 4   Magnesium             178 non-null    int64  
 5   Total_Phenols         178 non-null    float64
 6   Flavanoids            178 non-null    float64
 7   Nonflavanoid_Phenols  178 non-null    float64
 8   Proanthocyanins       178 non-null    float64
 9   Color_Intensity       178 non-null    float64
 10  Hue                   178 non-null    float64
 11  OD280                 178 non-null    float64
 12  Proline               178 non-null    int64  
 13  Customer_Segment      178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


In [4]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

Alcohol                 0
Malic_Acid              0
Ash                     0
Ash_Alcanity            0
Magnesium               0
Total_Phenols           0
Flavanoids              0
Nonflavanoid_Phenols    0
Proanthocyanins         0
Color_Intensity         0
Hue                     0
OD280                   0
Proline                 0
Customer_Segment        0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
df.describe()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,1.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,1.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,1.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,2.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,3.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,3.0


In [6]:
# Separate the class label from the measurements:
# `y` is the Customer_Segment column, `x` is every other column.
y = df['Customer_Segment']
x = df.drop(columns='Customer_Segment')

In [7]:
# Standardise every feature column of `x` and wrap the result in a DataFrame.
# The frame is built without explicit column labels, so the features are
# addressed by integer position (0..12) from here on.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so held-out rows contribute to the scaling
# statistics.
scaler = StandardScaler()
x_norm = pd.DataFrame(scaler.fit_transform(x))
x_norm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.518613,-0.56225,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.84792,1.013009
1,0.24629,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.26902,0.318304,0.788587,1.395148
3,1.69155,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.2957,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874


In [8]:
# Regress the segment label on all standardised features (plus an intercept)
# and print the fit statistics.
# NOTE(review): Customer_Segment takes the values 1-3 as class labels; OLS
# treats them as a continuous response, so this looks like a screening step
# rather than a model in its own right - confirm intent.
design_all = sm.add_constant(x_norm)
summary = sm.OLS(y, design_all).fit()
print(summary.summary())

                            OLS Regression Results                            
Dep. Variable:       Customer_Segment   R-squared:                       0.900
Model:                            OLS   Adj. R-squared:                  0.892
Method:                 Least Squares   F-statistic:                     113.7
Date:                Fri, 16 Jul 2021   Prob (F-statistic):           7.57e-75
Time:                        00:14:37   Log-Likelihood:                -1.6977
No. Observations:                 178   AIC:                             31.40
Df Residuals:                     164   BIC:                             75.94
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.9382      0.019    101.605      0.0

In [9]:
# Reduced feature set: remove six standardised features by position.
# With `x` in its original column order the positions correspond to
#   1 Malic_Acid, 2 Ash, 4 Magnesium, 7 Nonflavanoid_Phenols,
#   8 Proanthocyanins, 10 Hue
# leaving Alcohol, Ash_Alcanity, Total_Phenols, Flavanoids, Color_Intensity,
# OD280 and Proline.
# NOTE(review): presumably chosen from the preceding OLS summary, whose
# coefficient table is truncated in the saved output - confirm.
dropped_positions = [1, 2, 4, 7, 8, 10]
ds = x_norm.drop(columns=dropped_positions)
ds

Unnamed: 0,0,3,5,6,9,11,12
0,1.518613,-1.169593,0.808997,1.034819,0.251717,1.847920,1.013009
1,0.246290,-2.490847,0.568648,0.733629,-0.293321,1.113449,0.965242
2,0.196879,-0.268738,0.808997,1.215533,0.269020,0.788587,1.395148
3,1.691550,-0.809251,2.491446,1.466525,1.186068,1.184071,2.334574
4,0.295700,0.451946,0.808997,0.663351,-0.319276,0.449601,-0.037874
...,...,...,...,...,...,...,...
173,0.876275,0.301803,-0.985614,-1.424900,1.142811,-1.231206,-0.021952
174,0.493343,1.052516,-0.793334,-1.284344,0.969783,-1.485445,0.009893
175,0.332758,0.151661,-1.129824,-1.344582,2.224236,-1.485445,0.280575
176,0.209232,0.151661,-1.033684,-1.354622,1.834923,-1.400699,0.296498


In [10]:
# Refit the OLS regression of the segment label on the reduced feature set
# `ds` (intercept added) and print the fit statistics. This rebinds `summary`.
design_reduced = sm.add_constant(ds)
summary = sm.OLS(y, design_reduced).fit()
print(summary.summary())

                            OLS Regression Results                            
Dep. Variable:       Customer_Segment   R-squared:                       0.892
Model:                            OLS   Adj. R-squared:                  0.888
Method:                 Least Squares   F-statistic:                     201.1
Date:                Fri, 16 Jul 2021   Prob (F-statistic):           8.92e-79
Time:                        00:14:38   Log-Likelihood:                -8.4032
No. Observations:                 178   AIC:                             32.81
Df Residuals:                     170   BIC:                             58.26
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.9382      0.019     99.623      0.0

In [11]:
# Hold out half of the rows for evaluation; the fixed seed makes the split
# repeatable.
TEST_FRACTION = 0.5
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = train_test_split(
    ds, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Fit a linear discriminant classifier with default settings on the training half.
lda = LinearDiscriminantAnalysis()
lda_ = lda.fit(x_train, y_train)  # keeps the value returned by fit() under a second name

In [12]:
# Evaluate the fitted model's decision function on the training rows; the
# saved output shows one score column per class (three columns).
lda.decision_function(x_train)

array([[-1.15771163e+01,  3.62741147e-01, -1.83487935e+01],
       [ 1.06969218e+01, -9.58439010e+00, -3.70553029e+01],
       [ 3.49482576e+00, -8.34931482e+00, -2.84476391e+01],
       [-1.11876906e+01,  1.20228482e+00, -1.99848905e+01],
       [ 1.33692542e+01, -9.20028605e+00, -4.13372416e+01],
       [-2.93891617e+01, -1.73759857e+00,  9.59787624e+00],
       [ 4.66402461e+00, -3.38770952e+00, -3.65127443e+01],
       [-2.98159927e+01, -5.58353478e+00,  1.51702212e+01],
       [ 5.72907906e+00, -5.30075323e+00, -3.55505567e+01],
       [ 6.45411459e+00, -8.99472480e+00, -3.18063104e+01],
       [-1.15608810e+01, -1.69867547e-02, -1.78813117e+01],
       [ 9.81722139e+00, -8.05548408e+00, -3.77838976e+01],
       [-7.69738130e+00,  9.72512691e-01, -2.46327063e+01],
       [-1.64034307e+01,  4.69532298e-02, -1.11036221e+01],
       [ 1.53331563e+01, -1.24148877e+01, -3.99672424e+01],
       [-2.86781627e+01, -7.05321201e+00,  1.54566285e+01],
       [-1.41265964e+01,  7.87459036e-02

In [13]:
# Predict a segment label for every held-out row and print the label array.
pred = lda.predict(x_test)

print(pred)

[1 3 2 1 2 1 1 3 2 2 3 3 1 2 3 2 1 1 3 1 2 1 2 2 3 2 2 2 2 3 1 1 2 1 1 1 3
 2 2 3 1 1 2 2 2 1 3 2 3 1 3 3 1 3 1 2 3 3 2 3 3 1 2 3 2 2 3 2 1 3 2 2 2 1
 2 1 3 3 2 2 2 3 3 1 3 2 2 2 2]


In [14]:
# Per-class precision / recall / F1 on the held-out rows, then overall accuracy.
report = classification_report(y_test, pred)
print(report)
accuracy = metrics.accuracy_score(y_test, pred)
print('Accuracy :', accuracy)

              precision    recall  f1-score   support

           1       0.92      0.92      0.92        25
           2       0.95      0.88      0.91        40
           3       0.89      1.00      0.94        24

    accuracy                           0.92        89
   macro avg       0.92      0.93      0.92        89
weighted avg       0.92      0.92      0.92        89

Accuracy : 0.9213483146067416
