In [None]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')



In [None]:


# Load the Wine Quality CSV from the Kaggle input directory and preview its first rows.
data = pd.read_csv(
    "/kaggle/input/wine-quality-dataset/WineQT.csv"
)
data.head(n=5)



In [None]:


# List the column labels of the loaded frame.
data.columns



In [None]:


# Summary statistics, transposed so that each column of `data` becomes one row.
data.describe().T



In [None]:
# Number of rows that are exact repeats of an earlier row.
data.duplicated(keep='first').sum()

In [None]:


# Print the frame's summary (dtypes, non-null counts, memory usage).
data.info()



In [None]:


# Missing-value count for every column.
data.isna().sum()



In [None]:
# The five most frequent quality scores and how often each occurs.
data['quality'].value_counts().head(5)

In [None]:


# Annotated heatmap of the pairwise correlations between all columns,
# with the colour scale centred on zero.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data.corr(),
    annot=True,
    lw=0.5,
    center=0,
    cmap="YlGnBu",
)



In [None]:


# Area chart of the number of wines per quality score, ordered by score.
quality_counts = data['quality'].value_counts().sort_index()
quality_counts.plot.area(figsize=(16, 8), title='Quality of Wine', color='orange')



In [None]:


# Line plot of quality against residual sugar.
plt.figure(figsize=(16, 8))
sns.lineplot(data=data, x='residual sugar', y='quality', color='cyan')



In [None]:


# Draw a histogram of every column of the frame in one large figure, then render it.
data.hist(figsize=(20,15))
plt.show()



In [None]:


# Per-feature KDE curves, one panel per feature and one curve per quality score.
cols_to_draw = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
                'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
                'pH', 'sulphates', 'alcohol']
cols = 3
rows = int(np.ceil(len(cols_to_draw) / cols))
fig, ax = plt.subplots(rows, cols, figsize=(16, rows * 5))
ax = ax.flatten()

quality_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink']

# Iterate over the quality scores that actually occur in the data. The previous
# hard-coded range(1, 8) never drew score 8 (the binarisation step later in this
# notebook maps scores 3-8) and attempted to draw curves for scores with no rows.
quality_levels = sorted(data['quality'].unique())

for x, col in enumerate(cols_to_draw):
    for color_idx, quality in enumerate(quality_levels):
        # Cycle through the palette so any number of quality levels gets a colour.
        curve_color = quality_colors[color_idx % len(quality_colors)]
        sns.kdeplot(data=data.loc[data['quality'] == quality, col], ax=ax[x],
                    color=curve_color, common_norm=False, label=f'Quality {quality}')
    ax[x].set_title(f'{col} distribution')
    ax[x].set_xlabel(None)
    ax[x].legend()

# The grid has rows * cols panels; hide the ones that have no feature to show.
for unused_axis in ax[len(cols_to_draw):]:
    unused_axis.set_visible(False)

plt.tight_layout()
plt.show()



In [None]:
# Distribution of volatile acidity: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; histplot with kde=True and stat='density' (KDE cut=3)
# is the replacement recommended by seaborn for the same kind of figure.
plt.figure(figsize=(16, 7))
sns.histplot(x=data['volatile acidity'], kde=True, stat='density',
             kde_kws={'cut': 3}, color='red')

In [None]:
# Correlation of every feature with `quality`, drawn as a sorted horizontal bar chart.
# Rows are removed by label: the earlier positional `[1:]` slice discarded whichever
# column came first (presumably 'fixed acidity' -- verify against the CSV) while
# keeping the `Id` row identifier, which is not a feature.
corr_with_quality = data.corr()['quality']
d_corr = corr_with_quality.drop(['quality', 'Id'], errors='ignore')

# Sort once and reuse the result for both axes of the bar chart.
d_corr_sorted = d_corr.sort_values()

plt.figure(figsize=(16,7))
plt.title('Correlation b/w quality and different variables')
sns.barplot(y=d_corr_sorted.index, x=d_corr_sorted.values)
plt.xlabel('Value')

In [None]:


# One horizontal box plot per feature in `cols_to_draw`, laid out on a 3-column grid.
n_cols = 3
n_rows = int(np.ceil(len(cols_to_draw) / n_cols))

plt.figure(figsize=(18,9))
for o_count, i in enumerate(cols_to_draw, start=1):
    plt.subplot(n_rows, n_cols, o_count)
    # Pass the values as `x=` explicitly: in recent seaborn releases the first
    # positional parameter is `data`, which would change how the Series is drawn.
    sns.boxplot(x=data[i])
plt.tight_layout()



In [None]:
# Work on an independent copy so `data` keeps the original quality scores.
df = data.copy(deep=True)
df['quality'].value_counts()

In [None]:


# Collapse the numeric quality score into two classes: 3-5 -> 'Bad', 6-8 -> 'Good'.
# The labels are derived from the untouched scores in `data` (df is a copy of data,
# so the indexes line up). Mapping df['quality'] onto itself made the cell
# non-idempotent: a second run would turn every already-mapped label into NaN.
quality_labels = {3:'Bad',4:'Bad',5:'Bad',6:'Good',7:'Good',8:'Good'}
df['quality'] = data['quality'].map(quality_labels)
df['quality'].value_counts()



In [None]:


from sklearn.preprocessing import LabelEncoder

# Replace the string class labels in df['quality'] with integer codes.
# NOTE(review): LabelEncoder orders classes by sorted value, so presumably
# 'Bad' -> 0 and 'Good' -> 1; confirm via le.classes_.
le = LabelEncoder()
le.fit(df['quality'])
df['quality'] = le.transform(df['quality'])
df['quality'].value_counts()



In [None]:


# Bar chart of how many wines fall into each encoded quality class.
# The column is passed by keyword: in recent seaborn releases the first positional
# parameter is `data`, so a positional Series is no longer interpreted as `x`.
sns.countplot(x='quality', data=df)



In [None]:


# Target: the encoded quality class. Features: every column except the target
# and the `Id` column.
y = df['quality']
x = df.drop(columns=['quality', 'Id'])



In [None]:


# Display the feature frame for a visual check.
x



In [None]:
# Display the target series for a visual check.
y

In [None]:


from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=30
)

# Report the shape of each partition in the order: train X, train y, test X, test y.
for partition in (x_train, y_train, x_test, y_test):
    print(partition.shape)



In [None]:


from sklearn.preprocessing import StandardScaler

# Standardise the features using statistics learned from the training split only.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
# Apply the training mean/scale to the test split. Calling fit_transform here
# would refit the scaler on the test data, so train and test would be scaled
# differently and test-set statistics would leak into the evaluation.
x_test = sc.transform(x_test)



In [None]:


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score

# Baseline: logistic regression with default settings, fitted on the scaled training split.
model = LogisticRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Accuracy on both splits, followed by the per-class report and the confusion matrix
# for the test predictions.
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
print('Training Accuracy:', train_accuracy)
print('Testing Accuracy:', test_accuracy)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))



In [None]:


from sklearn.svm import SVC

# Support-vector classifier with default hyper-parameters.
model = SVC().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Accuracy on the training and the held-out split.
for split_name, features, labels in (('Training', x_train, y_train),
                                     ('Testing', x_test, y_test)):
    print(f'{split_name} Accuracy:', model.score(features, labels))



In [None]:


# Hyper-parameter search space for the SVC, scored by accuracy with 8-fold CV.
# gamma is only searched together with the rbf kernel: the linear kernel does not
# use gamma, so pairing every gamma value with it merely repeated identical fits.
c_values = [0.8,0.9,1,1.1,1.2,1.3,1.5]
param = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': [0.1,0.5,0.8,0.9,1.1,1.2,1.4,1.5]},
]
grid_svc = GridSearchCV(model, param_grid=param, scoring='accuracy', cv=8)



In [None]:


# Run the cross-validated grid search on the training split.
grid_svc.fit(x_train,y_train)



In [None]:
# Show the parameter combination that the grid search ranked best.
grid_svc.best_params_

In [None]:


# Refit an SVC with the hyper-parameters selected by the grid search and evaluate it.
# The values are read from grid_svc.best_params_ instead of being typed in by hand,
# so this cell cannot drift out of sync with the search result when the data,
# the grid or the library version changes.
model2 = SVC(**grid_svc.best_params_)
model2.fit(x_train, y_train)
print('Training Accuracy:', model2.score(x_train, y_train))
print('Testing Accuracy:', model2.score(x_test, y_test))
y_pred = model2.predict(x_test)

print(classification_report(y_test, y_pred))



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees. random_state is fixed (same seed as the train/test
# split) so the reported accuracies are reproducible from run to run; without it
# every execution grows a different forest.
model = RandomForestClassifier(n_estimators=200, random_state=30)
model.fit(x_train, y_train)
print('Training Accuracy:', model.score(x_train, y_train))
print('Testing Accuracy:', model.score(x_test, y_test))


In [None]:


# Per-class precision/recall/F1 of the current `model` on the held-out split.
y_pred = model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))



In [None]:
# Mean accuracy of the current `model` over 10-fold cross-validation on the training split.
model_eval = cross_val_score(model, x_train, y_train, cv=10)
model_eval.mean()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default settings. random_state is fixed (same seed as
# the train/test split) so the fitted tree and its scores are reproducible.
model = DecisionTreeClassifier(random_state=30)

model.fit(x_train, y_train)

print('Training Accuracy:', model.score(x_train, y_train))
print('Testing Accuracy:', model.score(x_test, y_test))

In [None]:


# Per-class precision/recall/F1 of the current `model` on the held-out split.
y_pred = model.predict(x_test)
# classification_report expects (y_true, y_pred); with the arguments reversed the
# precision and recall columns are swapped and support counts the predictions.
print(classification_report(y_test, y_pred))



In [None]:
# 10-fold cross-validated accuracy of the current `model` on the training split, averaged.
model_eval = cross_val_score(
    model,
    x_train,
    y_train,
    cv=10,
)
model_eval.mean()

In [None]:
import pickle

# Serialise the most recently trained `model` to model.pkl in the working directory.
# The context manager guarantees the file is flushed and closed even if pickling
# fails; the bare open(...) previously left the handle for the garbage collector.
# NOTE(review): the StandardScaler `sc` applied to the training features is not
# saved here, so code that loads model.pkl must scale its inputs the same way.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)