In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

# METHODS
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# METRICS
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

In [4]:
data = pd.read_csv('final.csv')
data.shape

(6113, 9)

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,0,9.3,0,0.016047,249.8092,1.0,0,0,3735.138
1,1,5.92,1,0.019278,48.2692,1.0,2,3,443.4228
2,2,17.5,0,0.01676,141.618,1.0,0,0,2097.27
3,3,8.93,0,0.0,53.8614,2.0,2,0,994.7052
4,4,10.395,1,0.0,51.4008,1.0,2,3,556.6088


In [6]:
x = data.drop(['Outlet_Size','Unnamed: 0'], axis=1)
y = data['Outlet_Size']

In [7]:
y.head()

0    1.0
1    1.0
2    1.0
3    2.0
4    1.0
Name: Outlet_Size, dtype: float64

In [8]:
x.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,0,0.016047,249.8092,0,0,3735.138
1,5.92,1,0.019278,48.2692,2,3,443.4228
2,17.5,0,0.01676,141.618,0,0,2097.27
3,8.93,0,0.0,53.8614,2,0,994.7052
4,10.395,1,0.0,51.4008,2,3,556.6088


In [9]:
# SCALING DATA
# scaler = MinMaxScaler()
# x_scaled = scaler.fit_transform(x)

In [10]:
# x = pd.DataFrame(x_scaled, columns = x.columns)

In [11]:
# SPLIT DATA
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y, random_state = 56, stratify=y)

# KNN -Method

In [12]:
# Creating an instance of KNN
clf = KNN(n_neighbors=20)

# Fitting the model
clf.fit(train_x, train_y)

# Predicting over the Train Set and calculating F1
train_predict = clf.predict(train_x)

# predicted y 
test_predict = clf.predict(test_x)

# F1 score
f1_score_train = f1_score(train_y, train_predict, average='macro')
print('Training F1 Score:', f1_score_train)

f1_score_test = f1_score(test_y, test_predict, average='macro')
print('Testing F1 Score:', f1_score_test)

Training F1 Score: 0.41663168301214465
Testing F1 Score: 0.37097450227034584


In [13]:
# Accuracy score
acc_train = accuracy_score(train_y, train_predict)
print('Training Accuracy Score:', acc_train)

acc_test = accuracy_score(test_y, test_predict)
print('Testing Accuracy Score:', acc_test)

Training Accuracy Score: 0.5617364746945899
Testing Accuracy Score: 0.5029431000654022


In [14]:
# Precision score
pre_train = precision_score(train_y, train_predict, average='macro')
print('Training Precision Score:', pre_train)

pre_test = precision_score(test_y, test_predict, average='macro')
print('Testing Precision Score:', pre_test)

Training Precision Score: 0.5063965325938208
Testing Precision Score: 0.4241413450164961


In [15]:
# ROC AUC score for multiclass classification (One-vs-Rest)
roc_auc_train = roc_auc_score(train_y, clf.predict_proba(train_x), multi_class='ovr')
print('Training ROC AUC Score:', roc_auc_train)

roc_auc_test = roc_auc_score(test_y, clf.predict_proba(test_x), multi_class='ovr')
print('Testing ROC AUC Score:', roc_auc_test)

Training ROC AUC Score: 0.7017587222113972
Testing ROC AUC Score: 0.6079045523710994


In [16]:
# recall_score
recall_score_train = recall_score(train_y, train_predict,average='macro')
print('Training Sensitivity Score:', recall_score_train)

recall_score_test = recall_score(test_y, test_predict,average='macro')
print('Testing Sensitivity Score:', recall_score_test)

Training Sensitivity Score: 0.4423888731112653
Testing Sensitivity Score: 0.39478508421938013


# SVM -Method


In [17]:
# Creating an instance of SVM
svm_model = svm.SVC(probability=True)

# Fitting the model
svm_model.fit(train_x, train_y)

# Predicting over the Train Set 
train_predict_svm = svm_model.predict(train_x)

# predicted over the Test Set 
test_predict_svm = svm_model.predict(test_x)

# F1 score
f1_score_svm_train = f1_score(train_y, train_predict_svm, average='macro')
print('Training F1 Score:', f1_score_svm_train)

f1_score_svm_test = f1_score(test_y, test_predict_svm, average='macro')
print('Testing F1 Score:', f1_score_svm_train)

Training F1 Score: 0.35004724773088847
Testing F1 Score: 0.35004724773088847


In [18]:
# Accuracy score
acc_svm_train = accuracy_score(train_y, train_predict_svm )
print('Training Accuracy Score:', acc_svm_train)

acc_svm_test = accuracy_score(test_y, test_predict_svm)
print('Testing Accuracy Score:',acc_svm_test)

Training Accuracy Score: 0.5259598603839442
Testing Accuracy Score: 0.5304120340091563


In [19]:
# Precision score
pre_svm_train = precision_score(train_y, train_predict_svm , average='macro')
print('Training Precision Score:', pre_svm_train)

pre_svm_test = precision_score(test_y, test_predict_svm, average='macro')
print('Testing Precision Score:', pre_svm_test)

Training Precision Score: 0.37321077950127624
Testing Precision Score: 0.38803032985189123


In [20]:
# ROC AUC score for multiclass classification (One-vs-Rest)
roc_auc_svm_train = roc_auc_score(train_y, svm_model.predict_proba(train_x), multi_class='ovr')
print('Training ROC AUC Score:', roc_auc_svm_train)

roc_auc_svm_test = roc_auc_score(test_y, svm_model.predict_proba(test_x), multi_class='ovr')
print('Testing ROC AUC Score:',roc_auc_svm_test)

Training ROC AUC Score: 0.7495258304916378
Testing ROC AUC Score: 0.7494775879615339


In [21]:
# recall_score
recall_score_svm_train = recall_score(train_y, train_predict_svm,average='macro')
print('Training Sensitivity Score:', recall_score_svm_train)

recall_score_svm_test = recall_score(test_y, test_predict_svm,average='macro')
print('Testing Sensitivity Score:', recall_score_svm_test)

Training Sensitivity Score: 0.39836452772191633
Testing Sensitivity Score: 0.40124561769265976


# Logistic Regression -Method


In [22]:
# Creating an instance of SVM
logreg_model = LogisticRegression()

# Fitting the model
logreg_model.fit(train_x, train_y)

# Predicting over the Train Set 
train_predict_logreg = logreg_model.predict(train_x)

# predicted over the Test Set 
test_predict_logreg = logreg_model.predict(test_x)

# F1 score
f1_score_logreg_train = f1_score(train_y, train_predict_logreg, average='macro')
print('Training F1 Score:', f1_score_logreg_train)

f1_score_logreg_test = f1_score(test_y, test_predict_logreg, average='macro')
print('Testing F1 Score:', f1_score_logreg_test)

Training F1 Score: 0.5184197516638638
Testing F1 Score: 0.5318462284436017


In [23]:
# Accuracy score
acc_logreg_train = accuracy_score(train_y, train_predict_logreg )
print('Training Accuracy Score:',acc_logreg_train)

acc_logreg_test = accuracy_score(test_y, test_predict_logreg)
print('Testing Accuracy Score:',acc_logreg_test)

Training Accuracy Score: 0.6544502617801047
Testing Accuracy Score: 0.6651406147809026


In [24]:
# Precision score
pre_logreg_train = precision_score(train_y, train_predict_logreg , average='macro')
print('Training Precision Score:', pre_logreg_train)

pre_logreg_test = precision_score(test_y, test_predict_logreg, average='macro')
print('Testing Precision Score:', pre_logreg_train)

Training Precision Score: 0.616955381920555
Testing Precision Score: 0.616955381920555


In [25]:
# ROC AUC score for multiclass classification (One-vs-Rest)
roc_auc_logreg_train = roc_auc_score(train_y, logreg_model.predict_proba(train_x), multi_class='ovr')
print('Training ROC AUC Score:', roc_auc_logreg_train)

roc_auc_logreg_test = roc_auc_score(test_y, logreg_model.predict_proba(test_x), multi_class='ovr')
print('Testing ROC AUC Score:',roc_auc_logreg_test)

Training ROC AUC Score: 0.7612927125318789
Testing ROC AUC Score: 0.7908587984884639


In [26]:
# recall_score
recall_score_logreg_train = recall_score(train_y, train_predict_logreg,average='macro')
print('Training Sensitivity Score:', recall_score_logreg_train)

recall_score_logreg_test = recall_score(test_y, test_predict_logreg,average='macro')
print('Testing Sensitivity Score:', recall_score_logreg_test)

Training Sensitivity Score: 0.5359350786870841
Testing Sensitivity Score: 0.5465932427995964


# Decision Tree Classifier

In [27]:
# Creating an instance of SVM
dt_model = DecisionTreeClassifier()

# Fitting the model
dt_model.fit(train_x, train_y)

# Predicting over the Train Set 
train_predict_dt = dt_model.predict(train_x)

# predicted over the Test Set 
test_predict_dt = dt_model.predict(test_x)

# F1 score
f1_score_dt_train = f1_score(train_y, train_predict_dt, average='macro')
print('Training F1 Score:', f1_score_dt_train)

f1_score_dt_test = f1_score(test_y, test_predict_dt, average='macro')
print('Testing F1 Score:', f1_score_dt_test)

Training F1 Score: 1.0
Testing F1 Score: 0.8712873904027796


In [28]:
# Accuracy score
acc_dt_train = accuracy_score(train_y, train_predict_dt )
print('Training Accuracy Score:',acc_dt_train)

acc_dt_test = accuracy_score(test_y, test_predict_dt)
print('Testing Accuracy Score:',acc_dt_test)

Training Accuracy Score: 1.0
Testing Accuracy Score: 0.8378024852844996


In [29]:
# Precision score
pre_dt_train = precision_score(train_y, train_predict_dt , average='macro')
print('Training Precision Score:', pre_dt_train)

pre_dt_test = precision_score(test_y, test_predict_dt, average='macro')
print('Testing Precision Score:', pre_dt_train)

Training Precision Score: 1.0
Testing Precision Score: 1.0


In [30]:
# ROC AUC score for multiclass classification (One-vs-Rest)
roc_auc_dt_train = roc_auc_score(train_y, dt_model.predict_proba(train_x), multi_class='ovr')
print('Training ROC AUC Score:', roc_auc_logreg_train)

roc_auc_dt_test = roc_auc_score(test_y, dt_model.predict_proba(test_x), multi_class='ovr')
print('Testing ROC AUC Score:',roc_auc_dt_test)

Training ROC AUC Score: 0.7612927125318789
Testing ROC AUC Score: 0.8881151466925493


In [31]:
# recall_score
recall_score_dt_train = recall_score(train_y, train_predict_dt,average='macro')
print('Training Sensitivity Score:', recall_score_dt_train)

recall_score_dt_test = recall_score(test_y, test_predict_dt,average='macro')
print('Testing Sensitivity Score:', recall_score_dt_test)

Training Sensitivity Score: 1.0
Testing Sensitivity Score: 0.8708180866181169


# Naive_Bayes -Method


In [32]:
# Creating an instance of SVM
nb_model = GaussianNB()

# Fitting the model
nb_model.fit(train_x, train_y)

# Predicting over the Train Set 
train_predict_nb = nb_model.predict(train_x)

# predicted over the Test Set 
test_predict_nb = nb_model.predict(test_x)

# F1 score
f1_score_nb_train = f1_score(train_y, train_predict_nb, average='macro')
print('Training F1 Score:', f1_score_nb_train)

f1_score_nb_test = f1_score(test_y, test_predict_nb, average='macro')
print('Testing F1 Score:', f1_score_nb_test)

Training F1 Score: 0.8734763899328241
Testing F1 Score: 0.8793521000652648


In [33]:
# Accuracy score
acc_nb_train = accuracy_score(train_y, train_predict_nb )
print('Training Accuracy Score:',acc_nb_train)

acc_nb_test = accuracy_score(test_y, test_predict_nb)
print('Testing Accuracy Score:',acc_nb_test)

Training Accuracy Score: 0.8407504363001745
Testing Accuracy Score: 0.8476128188358404


In [34]:
# Precision score
pre_nb_train = precision_score(train_y, train_predict_nb , average='macro')
print('Training Precision Score:', pre_nb_train)

pre_nb_test = precision_score(test_y, test_predict_nb, average='macro')
print('Testing Precision Score:', pre_nb_train)

Training Precision Score: 0.8998600187636866
Testing Precision Score: 0.8998600187636866


In [35]:
# ROC AUC score for multiclass classification (One-vs-Rest)
roc_auc_nb_train = roc_auc_score(train_y, nb_model.predict_proba(train_x), multi_class='ovr')
print('Training ROC AUC Score:', roc_auc_nb_train)

roc_auc_nb_test = roc_auc_score(test_y, nb_model.predict_proba(test_x), multi_class='ovr')
print('Testing ROC AUC Score:',roc_auc_nb_test)

Training ROC AUC Score: 0.8703483234371543
Testing ROC AUC Score: 0.8822133878105213


In [36]:
# recall_score
recall_score_nb_train = recall_score(train_y, train_predict_nb,average='macro')
print('Training Sensitivity Score:', recall_score_nb_train)

recall_score_nb_test = recall_score(test_y, test_predict_nb,average='macro')
print('Testing Sensitivity Score:', recall_score_nb_test)

Training Sensitivity Score: 0.883256353933522
Testing Sensitivity Score: 0.8878297064722757


In [37]:
df = {
    "Method":['KNN','SVM','Logistic Regression','Decision Tree Classifier','Naive_Bayes'],
    "F1 Score":[],
    "Accuracy":[],
    "Precision":[],
    "ROC AUC":[],
    "Sensitivity":[]
}
df["F1 Score"]=[f1_score_test,f1_score_svm_test,f1_score_logreg_test,f1_score_dt_test,f1_score_nb_test]

df["Accuracy"]=[acc_test,acc_svm_test,acc_logreg_test,acc_dt_test,acc_nb_test]

df["Precision"]=[pre_svm_test,pre_svm_test,pre_logreg_test,pre_dt_test,pre_nb_test]

df["ROC AUC"]=[roc_auc_test,roc_auc_svm_test,roc_auc_logreg_test,roc_auc_dt_test,roc_auc_nb_test]

df["Sensitivity"]=[recall_score_test,recall_score_svm_test,recall_score_logreg_test,recall_score_dt_test,
                   recall_score_nb_test]

dataframe = pd.DataFrame(df)

In [38]:
X = np.array([x.iloc[3]])

In [39]:
# import pickle
import joblib

d1 = {"model":nb_model,
      "Item_Weight":x["Item_Weight"],
      "Item_Fat_Content":x["Item_Fat_Content"],
      "Item_Visibility":x["Item_Visibility"],
      "Item_MRP":x["Item_MRP"],
      "Outlet_Location_Type":x["Outlet_Location_Type"],
      "Outlet_Type":x["Outlet_Type"],
      "Item_Outlet_Sales":x["Item_Outlet_Sales"]
     }


In [40]:
# with open('saved_model.pkl', 'wb') as file:
#     pickle.dump(d1, file)
joblib.dump(d1, 'saved_model.pkl')

['saved_model.pkl']

In [41]:
# with open('saved_model.pkl', 'rb') as file:
#     model = pickle.load(file)
model = joblib.load('saved_model.pkl')

In [42]:
reg_load = model['model']

d3 = {
        'Item_Weight':8.930,
        'Item_Fat_Content': 0,
        'Item_Visibility': 0.000000,
        'Item_MRP': 53.8614,
        'Outlet_Location_Type': 2,
        'Outlet_Type': 0,
        'Item_Outlet_Sales': 994.7052,
}
df = pd.DataFrame([d3])

In [43]:
prediction = reg_load.predict(df)
prediction

array([2.])

In [44]:
import sys
print(sys.version)


3.11.4 | packaged by conda-forge | (main, Jun 10 2023, 18:08:41) [Clang 15.0.7 ]
