In [34]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# IMPORT

In [38]:
# Read the semicolon-delimited wine-quality table from the working directory.
dt = pd.read_csv('winequality.csv', sep=";")
# One summary row per column, covering numeric and non-numeric columns alike.
dt.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
type,6497,2.0,White,4898.0,,,,,,,
fixed acidity,6497,,,,7.21531,1.29643,3.8,6.4,7.0,7.7,15.9
volatile acidity,6497,,,,0.339666,0.164636,0.08,0.23,0.29,0.4,1.58
citric acid,6497,,,,0.318633,0.145318,0.0,0.25,0.31,0.39,1.66
residual sugar,6497,,,,5.44324,4.7578,0.6,1.8,3.0,8.1,65.8
chlorides,6497,,,,0.0560339,0.0350336,0.009,0.038,0.047,0.065,0.611
free sulfur dioxide,6497,,,,30.5253,17.7494,1.0,17.0,29.0,41.0,289.0
total sulfur dioxide,6497,,,,115.745,56.5219,6.0,77.0,118.0,156.0,440.0
density,6497,,,,1.71088,7.63609,0.98711,0.99234,0.99489,0.99699,103.898
pH,6497,,,,3.2185,0.160787,2.72,3.11,3.21,3.32,4.01


# CLEANING

* train/test split: regular train_test_split
* NAs: nothing to do (use an imputer if needed)
* alcohol: some values are malformed; repair them with a regexp
* density > 1: drop these rows
* Outliers: sulphates, total sulfur dioxide, free sulfur dioxide, fixed acidity, volatile acidity, citric acid,
 residual sugar, chlorides, alcohol
* scale continuous: scaler
* bin discrete values

### Alcohol and density

In [39]:
# Some 'alcohol' entries contain more than one dot group (digits, dot, digits,
# dot, ...) and cannot be parsed by float(). Build the mask once, using a raw
# string so that `\.` reaches the regex engine instead of being an invalid
# Python string escape.
broken_alcohol = dt['alcohol'].str.match(r'[0-9]+\.+[0-9]+\.+')

# Repair heuristic (unchanged): keep the digits before the first dot and
# divide them by 10. A single .loc on both sides avoids chained indexing.
# NOTE(review): the describe() output below still reports max = 97.3, so this
# heuristic presumably mis-scales some entries -- confirm against the raw file.
dt.loc[broken_alcohol, 'alcohol'] = (
    dt.loc[broken_alcohol, 'alcohol'].map(lambda raw: int(raw.split('.')[0]) / 10)
)

# The remaining entries are plain numeric strings; convert the whole column.
dt['alcohol'] = dt['alcohol'].map(float)
dt['alcohol'].describe()

count    6497.000000
mean       10.597477
std         3.214091
min         8.000000
25%         9.500000
50%        10.300000
75%        11.300000
max        97.300000
Name: alcohol, dtype: float64

In [40]:
# Keep only the rows whose density does not exceed 1.
density_ok = dt['density'] <= 1
dt = dt.loc[density_ok, :]

### OUTLIERS

In [77]:
def inliner(row, lower=0.01, upper=0.99):
    """Flag the values lying strictly between two quantiles of a Series.

    Parameters
    ----------
    row : pandas.Series
        Values to screen (the notebook passes whole columns via
        ``DataFrame.apply(..., axis=0)``).
    lower : float, default 0.01
        Quantile level of the lower cut-off; values must be strictly greater
        than this quantile to be kept.
    upper : float, default 0.99
        Quantile level of the upper cut-off; values must be strictly smaller
        than this quantile to be kept.

    Returns
    -------
    numpy.ndarray
        Boolean array, one entry per element of ``row``; True marks a value
        inside the open interval. NaN compares False and is therefore flagged
        as outside.

    Raises
    ------
    ValueError
        If ``lower`` is greater than ``upper``.
    """
    if lower > upper:
        raise ValueError("lower quantile must not exceed upper quantile")
    low_cut = row.quantile(lower)
    high_cut = row.quantile(upper)
    # Strict comparisons on purpose: values equal to a cut-off are excluded,
    # exactly as with the previously hard-coded 1% / 99% limits.
    return ((row < high_cut) & (row > low_cut)).values
# Columns screened for extreme values.
outl_col = [
    "sulphates",
    "total sulfur dioxide",
    "free sulfur dioxide",
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "alcohol",
]
# One boolean column per screened feature; a row is kept only when every
# screened feature passes inliner().
# NOTE(review): this reassigns dt, so re-running the cell trims the data again.
inlier_flags = dt[outl_col].apply(inliner, axis=0)
dt = dt.loc[inlier_flags.all(axis=1), :]

In [78]:
# (rows, columns) of dt after the density and outlier filters above.
dt.shape

(5179, 13)

### SCALER

In [80]:
from sklearn.preprocessing import StandardScaler

In [92]:
# Preview the float-typed columns with the 'quality' column left out.
# (`columns=` already names the axis, so no separate `axis` argument is needed.)
dt.drop(columns='quality').select_dtypes(include='float')

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
5,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
6,6.2,0.320,0.16,7.00,0.045,30.0,136.0,0.99490,3.18,0.47,9.6
8,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
9,8.1,0.220,0.43,1.50,0.044,28.0,129.0,0.99380,3.22,0.45,11.0
10,8.1,0.270,0.41,1.45,0.033,11.0,63.0,0.99080,2.99,0.56,12.0
11,8.6,0.230,0.40,4.20,0.035,17.0,109.0,0.99470,3.14,0.53,9.7


In [97]:
# Standardise the float-typed columns; 'quality' is left out because it is
# what the model below is fitted against.
# NOTE(review): the scaler is fitted on all rows -- revisit once the
# train/test split listed in the cleaning notes is in place.
scaler = StandardScaler()
float_features = dt.drop(columns='quality').select_dtypes(include='float')
x = scaler.fit_transform(float_features)

# TEST MODEL

In [99]:
from sklearn.linear_model import LinearRegression

In [106]:
# Linear regression with default settings, used as a first baseline model.
lm = LinearRegression()

In [101]:
# Fit the regression on the scaled features against the 'quality' column.
# NOTE(review): the execution counts show this cell (In [101]) ran before the
# cell that re-creates `lm` (In [106]); re-run top to bottom to confirm.
lm.fit(x,dt.quality)

  linalg.lstsq(X, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [102]:
# Predict on the same scaled matrix the model was fitted on.
y_pred = lm.predict(x)

In [103]:
from sklearn.metrics import r2_score

In [107]:
# R^2 of the predictions against the observed quality. This is an in-sample
# score: y_pred was computed from the same x used for fitting.
r2_score(dt.quality, y_pred)

0.2817426432678767

In [108]:
# Side-by-side view of the observed quality and the model's prediction.
# Building the frame from a dict keeps dt's index and attaches y_pred to it
# positionally. The former list-of-rows form gave the bare array a fresh
# 0..n-1 index and aligned it by label against dt's filtered (gappy) index,
# which paired each prediction with the wrong row.
pd.DataFrame({'quality': dt.quality, 'predicted': y_pred})

Unnamed: 0,quality,Unnamed 0
1,6.0,5.913143
2,6.0,5.725487
3,6.0,5.725487
4,6.0,5.913143
5,6.0,5.489325
6,6.0,5.225161
8,6.0,6.306244
9,6.0,5.651627
10,5.0,6.164294
11,5.0,6.777469
