In [None]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno

In [None]:
%matplotlib inline  
# Select the matplotlib stylesheet first, then apply seaborn's theme settings.
plt.style.use('fivethirtyeight')
sns.set(color_codes=True, style='whitegrid')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder

from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification

In [None]:
# Load the wine-quality table; the path is relative to the notebook's working directory.
df = pd.read_csv('Data/winequality.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first 20 rows.
df.head(20)

In [None]:
# List the column names.
df.columns 

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Visual check of missing values using missingno's matrix plot.
msno.matrix(df)

In [None]:
# Summary statistics for every column, whatever its dtype.
df.describe(include='all')

In [None]:
# Box plot of every column side by side to compare ranges and spot outliers.
# seaborn 0.9 renamed `factorplot` to `catplot` and its `size` argument to `height`.
sns.catplot(data=df,kind='box',height=10,aspect=2.5)

In [None]:
# One histogram per column.
# The previous 5x5 grid indexed `columns[i+j]`, which drew the first nine columns
# repeatedly and never reached 'sulphates', 'alcohol' or 'quality'. A 3x4 grid
# walked in order shows each of the 12 columns exactly once.
columns=['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality']
fig,axes=plt.subplots(3,4,figsize=(18,12))
for ax,column in zip(axes.flat,columns):
    # `data=df` lets matplotlib resolve the string passed as `x` to a column of df.
    ax.hist(x=column,data=df,edgecolor='#000000',linewidth=2,color='#ff4125')
    ax.set_title('Variation of '+column)
fig.tight_layout()

In [None]:
# Pairwise correlations between columns, drawn as an annotated heatmap.
cor_mat=df.corr()
# Explicit boolean mask: True hides a cell. Hide the strict upper triangle so each
# pair is shown once (lower triangle and diagonal stay visible).
# The previous mask reused the correlation values themselves as truthy flags, so
# an upper-triangle correlation of exactly 0 would have been left unmasked.
mask=np.triu(np.ones_like(cor_mat,dtype=bool),k=1)
fig,ax=plt.subplots(figsize=(30,12))
sns.heatmap(data=cor_mat,mask=mask,square=True,annot=True,cbar=True,ax=ax)

In [None]:
def plot(feature_x,target='quality'):
    """Draw bar, violin and swarm plots of one feature grouped by the target.

    Parameters
    ----------
    feature_x : str
        Column of the module-level ``df`` plotted on the y-axis.
    target : str, default 'quality'
        Column of ``df`` whose values form the x-axis categories.

    Each plot kind is drawn on its own seaborn figure; nothing is returned.
    """
    # seaborn 0.9 renamed `factorplot` to `catplot` and `size` to `height`.
    for kind in ('bar','violin','swarm'):
        sns.catplot(x=target,y=feature_x,data=df,kind=kind,height=5,aspect=1)

In [None]:
# How fixed acidity varies across quality scores (bar, violin and swarm views).
plot('fixed acidity','quality')

In [None]:
# How alcohol content varies across quality scores (bar, violin and swarm views).
plot('alcohol','quality')

In [None]:
# Collapse the numeric quality score into two classes. pd.cut uses right-closed
# intervals by default, so scores in (2, 6.5] become 'bad' and (6.5, 8] become 'good'.
# NOTE(review): scores outside (2, 8] would become NaN — confirm the data has none.
# NOTE(review): this overwrites df['quality'] in place, so re-running the cell
# without reloading df will not reproduce the same result.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
df['quality'] = pd.cut(df['quality'], bins = bins, labels = group_names)

In [None]:
# Encoder that maps the quality labels to integer codes.
label_quality = LabelEncoder()

In [None]:
# Replace the quality labels with integer codes. LabelEncoder numbers classes in
# sorted order, so with only 'bad'/'good' present: 'bad' -> 0, 'good' -> 1.
df['quality'] = label_quality.fit_transform(df['quality'])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='quality'),
    df['quality'],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Baseline classifiers, all with default hyper-parameters (except the explicit rbf kernel).
models = [
    LogisticRegression(),
    LinearSVC(),
    SVC(kernel='rbf'),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
]

# Display names, in the same order as `models`.
model_names = [
    'LogisticRegression',
    'LinearSVM',
    'rbfSVM',
    'KNearestNeighbors',
    'RandomForestClassifier',
    'DecisionTree',
    'GradientBoostingClassifier',
    'GaussianNB',
]

In [None]:
# Fit every baseline classifier on the unscaled features and record its test accuracy.
acc=[]

for clf in models:
    clf.fit(x_train,y_train)
    pred=clf.predict(x_test)
    # accuracy_score's documented argument order is (y_true, y_pred).
    acc.append(accuracy_score(y_test,pred))

# Column-oriented results, ready to be turned into a DataFrame.
d={'Modeling Algo':model_names,'Accuracy':acc}
d

In [None]:
# Tabulate the algorithm names against their accuracies.
acc_frame=pd.DataFrame(d)
acc_frame

In [None]:
# Horizontal bars: one per algorithm, length = test accuracy on the unscaled features.
sns.barplot(y='Modeling Algo',x='Accuracy',data=acc_frame)

In [None]:
# Same accuracies as a point plot, one point per algorithm.
# seaborn 0.9 renamed `factorplot` to `catplot` and its `size` argument to `height`.
sns.catplot(x='Modeling Algo',y='Accuracy',data=acc_frame,kind='point',height=4,aspect=3.5)

In [None]:
# Show the accuracy table again for reference.
acc_frame

In [None]:
def scale_func(x_train,x_test,y_train,y_test,name_scaler,column_name="Standard Scaler"):
    """Refit the baseline classifiers on scaled features and store their accuracies.

    Parameters
    ----------
    x_train, x_test :
        Training and test feature sets.
    y_train, y_test :
        Training and test targets.
    name_scaler :
        Scaler instance exposing ``fit_transform`` / ``transform``
        (for example ``StandardScaler()``).
    column_name : str, default "Standard Scaler"
        Name of the column written to the module-level ``acc_frame``. Pass a
        different name when using another scaler so the results are not
        mislabelled or overwritten.

    Side effects
    ------------
    Adds (or overwrites) ``acc_frame[column_name]`` with one accuracy per model,
    in the order of the local ``models`` list. Returns nothing.
    """
    # Fresh estimators, so nothing is shared with models fitted elsewhere.
    models=[LogisticRegression(),LinearSVC(),SVC(kernel='rbf'),KNeighborsClassifier(),RandomForestClassifier(),DecisionTreeClassifier(),GradientBoostingClassifier(),GaussianNB()]

    # Fit the scaler on the training data only, then reuse it for the test data.
    x_train_scale = name_scaler.fit_transform(x_train)
    x_test_scale = name_scaler.transform(x_test)

    acc_sc=[]
    for clf in models:
        clf.fit(x_train_scale,y_train)
        pred=clf.predict(x_test_scale)
        # accuracy_score's documented argument order is (y_true, y_pred).
        acc_sc.append(accuracy_score(y_test,pred))

    acc_frame[column_name]=np.array(acc_sc)


In [None]:
# Re-run the baseline models on standardized features; the accuracies are written to acc_frame.
scale_func(x_train,x_test,y_train,y_test,StandardScaler())

In [None]:
# Accuracy table, now including the scaled-feature results.
acc_frame

In [None]:
# Horizontal bars of the accuracies stored in the 'Standard Scaler' column.
sns.barplot(y='Modeling Algo',x='Standard Scaler',data=acc_frame)

# Parameter Tuning
## Logistic Regression

In [None]:
# Grid over regularisation strength and penalty type, scored by 10-fold CV accuracy.
params_dict={'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000],'penalty':['l1','l2']}
# The grid includes 'l1', which the default 'lbfgs' solver does not support;
# 'liblinear' handles both 'l1' and 'l2', so every candidate can actually be fitted.
clf_lr=GridSearchCV(estimator=LogisticRegression(solver='liblinear'),param_grid=params_dict,scoring='accuracy',cv=10)
clf_lr.fit(x_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
clf_lr.best_params_

In [None]:
# Mean cross-validated accuracy of the best combination.
clf_lr.best_score_

In [None]:
# Accuracy of the tuned logistic regression on the held-out test set.
pred = clf_lr.predict(x_test)
accuracy_score(y_test, pred)

## k-Nearest Neighbor (KNN)

In [None]:
# Try every neighbourhood size k = 1..50, scored by 10-fold CV accuracy.
l = list(range(1, 51))
params_dict = {'n_neighbors': l, 'n_jobs': [-1]}
clf_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params_dict,
    scoring='accuracy',
    cv=10,
)
clf_knn.fit(x_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
clf_knn.best_params_

In [None]:
# Mean cross-validated accuracy of the best combination.
clf_knn.best_score_

In [None]:
# Accuracy of the tuned k-NN model on the held-out test set.
pred = clf_knn.predict(x_test)
accuracy_score(y_test, pred)

## Support Vector Machine - SVM

In [None]:
# Coarse search over C, gamma and kernel, scored by 10-fold CV accuracy.
# The linear kernel ignores gamma, so crossing it with every gamma value only refits
# identical linear models. A list of grids searches each kernel over just the
# parameters it uses.
c_values=[0.001,0.01,0.1,1,10,100]
params_dict=[
    {'kernel':['linear'],'C':c_values},
    {'kernel':['rbf'],'C':c_values,'gamma':[0.001,0.01,0.1,1,10,100]},
]
clf=GridSearchCV(estimator=SVC(),param_grid=params_dict,scoring='accuracy',cv=10)
clf.fit(x_train,y_train)

In [None]:
# Mean cross-validated accuracy of the best combination in the coarse search.
clf.best_score_

In [None]:
# Parameter combination with the best mean cross-validated accuracy in the coarse search.
clf.best_params_

In [None]:
# Second pass: a finer grid of C and gamma values between 0.90 and 1.5, for both kernels.
# The linear kernel ignores gamma, so it gets its own grid without it; this avoids
# refitting identical linear models once per gamma value.
fine_values=[0.90,0.92,0.96,0.98,1.0,1.2,1.5]
params_dict=[
    {'kernel':['linear'],'C':fine_values},
    {'kernel':['rbf'],'C':fine_values,'gamma':fine_values},
]
clf_svm=GridSearchCV(estimator=SVC(),param_grid=params_dict,scoring='accuracy',cv=10)
clf_svm.fit(x_train,y_train)

In [None]:
# Mean cross-validated accuracy of the best combination in the fine search.
clf_svm.best_score_

In [None]:
# Parameter combination with the best mean cross-validated accuracy in the fine search.
clf_svm.best_params_

In [None]:
# Per-candidate cross-validation results.
# `grid_scores_` was removed from GridSearchCV in scikit-learn 0.20; the same
# information is in `cv_results_`, shown here as a table.
pd.DataFrame(clf_svm.cv_results_)[['params','mean_test_score','std_test_score','rank_test_score']]

In [None]:
# Accuracy of the tuned SVM on the held-out test set.
pred = clf_svm.predict(x_test)
accuracy_score(y_test, pred)

## Random Forest

In [None]:
# 500-tree forests, comparing the two feature-subsampling rules by 10-fold CV accuracy.
# 'auto' was dropped from the grid: for classifiers it was only an alias of 'sqrt'
# (a duplicate candidate), and current scikit-learn no longer accepts it.
params_dict={'n_estimators':[500],'max_features':['sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1),param_grid=params_dict,scoring='accuracy',cv=10)
clf_rf.fit(x_train,y_train)

In [None]:
# Mean cross-validated accuracy of the best combination.
clf_rf.best_score_

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
clf_rf.best_params_

In [None]:
# Per-candidate cross-validation results.
# `grid_scores_` was removed from GridSearchCV in scikit-learn 0.20; the same
# information is in `cv_results_`, shown here as a table.
pd.DataFrame(clf_rf.cv_results_)[['params','mean_test_score','std_test_score','rank_test_score']]

In [None]:
# Accuracy of the tuned random forest on the held-out test set.
pred = clf_rf.predict(x_test)
accuracy_score(y_test, pred)

## Gradient Boosting

In [None]:
# 10-fold cross-validation of a 500-stage gradient-boosting model (single-candidate grid).
clf_gb = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid={'n_estimators': [500]},
    cv=10,
)
clf_gb.fit(x_train, y_train)

In [None]:
# Mean cross-validated score of the (only) candidate.
clf_gb.best_score_

In [None]:
# Parameters of the selected candidate.
clf_gb.best_params_

In [None]:
# Accuracy of the gradient-boosting model on the held-out test set.
pred = clf_gb.predict(x_test)
accuracy_score(y_test, pred)