In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("apple_quality.csv")

In [3]:
df

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809367,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723217,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good
...,...,...,...,...,...,...,...,...,...
3996,3996.0,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235285,good
3997,3997.0,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611391,bad
3998,3998.0,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229719806,good
3999,3999.0,0.278540,-1.715505,0.121217,-1.154075,1.266677,-0.776571,1.599796456,good


## Exploratory Data Analysis

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4001 entries, 0 to 4000
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4001 non-null   object 
 8   Quality      4000 non-null   object 
dtypes: float64(7), object(2)
memory usage: 281.4+ KB


In [5]:
df.dtypes

A_id           float64
Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity         object
Quality         object
dtype: object

In [6]:
df.isnull().sum()

A_id           1
Size           1
Weight         1
Sweetness      1
Crunchiness    1
Juiciness      1
Ripeness       1
Acidity        0
Quality        1
dtype: int64

In [7]:
df  = df.dropna()

In [8]:
## There are no null values
df.isnull().sum()

A_id           0
Size           0
Weight         0
Sweetness      0
Crunchiness    0
Juiciness      0
Ripeness       0
Acidity        0
Quality        0
dtype: int64

In [9]:
# There are no duplicated values
df.duplicated().sum()

0

In [10]:
df["Quality"].value_counts()

good    2004
bad     1996
Name: Quality, dtype: int64

In [11]:
## Coverting Object type to float type
df[["Acidity"]] = df[["Acidity"]].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["Acidity"]] = df[["Acidity"]].astype(float)


In [12]:
## Now Coverting Quality column into 0s and 1s
df["Quality"] = np.where(df["Quality"].str.contains("good"),0,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Quality"] = np.where(df["Quality"].str.contains("good"),0,1)


In [13]:
## Good values are my 0s and Bad Values are my 1s
df["Quality"].value_counts()

0    2004
1    1996
Name: Quality, dtype: int64

In [14]:
df[["Quality"]] = df[["Quality"]].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["Quality"]] = df[["Quality"]].astype(int)


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   float64
 8   Quality      4000 non-null   int32  
dtypes: float64(8), int32(1)
memory usage: 296.9 KB


In [16]:
df.to_csv('apples_cleaned.csv', index=False)

## Splitting Training and Testing Data

In [17]:
x = df.drop(["A_id","Quality"],axis=1)
y = df.iloc[:,-1]

In [18]:
x

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity
0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636
3,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984
...,...,...,...,...,...,...,...
3995,0.059386,-1.067408,-3.714549,0.473052,1.697986,2.244055,0.137784
3996,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235
3997,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611
3998,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229720


In [19]:
y

0       0
1       0
2       1
3       0
4       0
       ..
3995    1
3996    0
3997    1
3998    0
3999    0
Name: Quality, Length: 4000, dtype: int32

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [22]:
x_train.shape, x_test.shape , y_train.shape , y_test.shape

((2680, 7), (1320, 7), (2680,), (1320,))

## Standardization

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
scaler = StandardScaler()

In [25]:
x_train_scaled = scaler.fit_transform(x_train)

In [26]:
x_test_scaled = scaler.transform(x_test)

## Model Training

In [27]:
from sklearn.svm import SVC

In [28]:
model = SVC(kernel="linear")

In [29]:
model.fit(x_train,y_train)

In [32]:
y_pred = model.predict(x_test)

## Classification report , Accurracy Score , Confusion Matrix

In [33]:
from sklearn.metrics import classification_report , accuracy_score , confusion_matrix

In [34]:
print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))
print(accuracy_score(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       689
           1       0.74      0.76      0.75       631

    accuracy                           0.75      1320
   macro avg       0.75      0.75      0.75      1320
weighted avg       0.75      0.75      0.75      1320

[[517 172]
 [153 478]]
0.7537878787878788


## HyperParameter Tunning

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
param_grid =  {
    "C":[1,0.1,10,100],
    "kernel" : ["linear", "rbf"],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001]
}

In [37]:
model_tuning = GridSearchCV(SVC(),param_grid=param_grid,refit=True,cv=5,verbose=3)

In [38]:
model_tuning.fit(x_train,y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END .......C=1, gamma=1, kernel=linear;, score=0.735 total time=   0.3s
[CV 2/5] END .......C=1, gamma=1, kernel=linear;, score=0.772 total time=   0.3s
[CV 3/5] END .......C=1, gamma=1, kernel=linear;, score=0.765 total time=   0.3s
[CV 4/5] END .......C=1, gamma=1, kernel=linear;, score=0.705 total time=   0.3s
[CV 5/5] END .......C=1, gamma=1, kernel=linear;, score=0.744 total time=   0.3s
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.849 total time=   0.5s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.851 total time=   0.5s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.899 total time=   0.5s
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.882 total time=   0.5s
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.862 total time=   0.5s
[CV 1/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.735 total time=   0.3s
[CV 2/5] END .....C=1, gamma=0.1, kernel=linear

In [39]:
y_preds = model_tuning.predict(x_test)

In [40]:
print(classification_report(y_preds,y_test))
print(confusion_matrix(y_preds,y_test))
print(accuracy_score(y_preds,y_test))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       656
           1       0.92      0.90      0.91       664

    accuracy                           0.91      1320
   macro avg       0.91      0.91      0.91      1320
weighted avg       0.91      0.91      0.91      1320

[[604  52]
 [ 66 598]]
0.9106060606060606


In [41]:
model_tuning.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

## Now we will train our model with best parameteers

In [42]:
model_tunned = SVC(kernel="rbf",C=10,gamma=0.1)

In [43]:
model_tunned.fit(x_train,y_train)

In [44]:
y_pred_tunned = model_tunned.predict(x_test)

In [45]:
print(classification_report(y_pred_tunned,y_test))
print(confusion_matrix(y_pred_tunned,y_test))
print(accuracy_score(y_pred_tunned,y_test))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       656
           1       0.92      0.90      0.91       664

    accuracy                           0.91      1320
   macro avg       0.91      0.91      0.91      1320
weighted avg       0.91      0.91      0.91      1320

[[604  52]
 [ 66 598]]
0.9106060606060606


In [46]:
import pickle
pickle.dump(scaler,open('scaler_apple.pkl','wb'))
pickle.dump(model_tunned,open('model_tunned.pkl','wb'))