In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data=pd.read_csv("../data/processed/data_milk_c.csv",index_col=0)
X=data.drop(columns=['Grade','Grade_sparse']).values
y=data['Grade_sparse'].values

In [3]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=25)

print(f"Train shape: {X_train.shape}")
print(f"Test shape {X_test.shape}")

Train shape: (58, 7)
Test shape (25, 7)


In [4]:
from sklearn.svm import SVC

estimator_svc=SVC(kernel='linear',C=2)

estimator_svc.fit(X_train,y_train)
accuracy = estimator_svc.score(X_test, y_test)
print(f"Accuracy {accuracy}")

Accuracy 0.84


In [5]:
from sklearn.ensemble import RandomForestClassifier
random_forest=RandomForestClassifier(max_depth=10)
random_forest.fit(X_train,y_train)
accuracy = random_forest.score(X_test, y_test)
print(f"Accuracy {accuracy}")

Accuracy 0.76


In [6]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, y_train)
accuracy = knn.score(X_test, y_test)
print(f"Accuracy {accuracy}")

Accuracy 0.68


In [7]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
accuracy = gnb.score(X_test, y_test)
print(f"Accuracy {accuracy}")

Accuracy 0.92


In [8]:
from sklearn.model_selection import cross_validate

results = cross_validate(GaussianNB() ,X,y,return_train_score=True,cv=10)
results

{'fit_time': array([0.00386786, 0.00833535, 0.        , 0.00251436, 0.00100875,
        0.        , 0.00800037, 0.        , 0.01000643, 0.        ]),
 'score_time': array([0.00254798, 0.        , 0.00551343, 0.        , 0.        ,
        0.        , 0.        , 0.00600529, 0.        , 0.        ]),
 'test_score': array([0.77777778, 0.88888889, 1.        , 1.        , 1.        ,
        0.625     , 1.        , 0.625     , 0.875     , 0.75      ]),
 'train_score': array([0.75675676, 0.87837838, 0.86486486, 0.89333333, 0.86666667,
        0.90666667, 0.88      , 0.90666667, 0.88      , 0.90666667])}

In [10]:
import numpy as np
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.874
Test Score: 0.8541666666666666


In [11]:
from joblib import dump

dump(gnb,'../models/gnbayes.joblib')

['../models/gnbayes.joblib']

In [14]:
data.drop(columns=['Grade','Grade_sparse']).columns

Index(['pH', 'Temp', 'Taste', 'Odor', 'Fat ', 'Turbidity', 'Colour'], dtype='object')

In [7]:
import joblib
from pydantic import BaseModel

class milk_features(BaseModel):
    pH:float
    Temp:float
    Taste:float
    Odor:float
    Fat:float
    Turbidity:float
    Colour:float


class MilkModel():
    def __init__(self):
        self.model_name='../models/gnbayes.joblib'
        self.model=joblib.load(self.model_name)

    def predict_quality(self,pH,Temp,Taste,Odor,Fat,Turbidity,Colour):
        data_input=[[pH,Temp,Taste,Odor,Fat,Turbidity,Colour]]
        prediction=self.model.predict(data_input)
        probability=self.model.predict_proba(data_input).max()
        return prediction[0],probability

In [12]:
model=MilkModel()
def predict_quality(milk: milk_features):
    data = milk.dict()
    prediction, probability = model.predict_quality(data["pH"], 
                                                    data["Temp"],
                                                    data['Taste'],
                                                    data['Odor'],
                                                    data['Fat'],
                                                    data['Turbidity'],
                                                    data['Colour']
    )
    return {
        'prediction':prediction,
        'probability':probability
    }

In [17]:
predict_quality(0,0,0,0,0,0,0)

TypeError: predict_quality() takes 1 positional argument but 7 were given