# Explore here

In [None]:
# Your code here
# If you have both numerical and categorical features in your data, there are different strategies, 
# but the best one to preserve the usefulness and suitability of this model is to transform the categorical ones into numerical ones 
# using coding techniques as we have seen above: pd.factorize of Pandas.

Take a cleaned dataset and split it

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y = True, as_frame = True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
22,4.6,3.6,1.0,0.2
15,5.7,4.4,1.5,0.4
65,6.7,3.1,4.4,1.4
11,4.8,3.4,1.6,0.2
42,4.4,3.2,1.3,0.2


Choose a model and fit it

In [3]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)

Make predictions using the model

In [4]:
y_pred = model.predict(X_test)
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

Check for the accuracy

In [5]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

1.0

In [6]:
# Super accuracy score

Save the model in case we need to use it again

In [7]:
from pickle import dump

dump(model, open("naive_bayes_default.sav", "wb"))

PROJECT

In [8]:
import pandas as pd

data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


Since we want ot work with only one categorical value, we do not do an EDA but process the text

In [9]:
def process_text(df):
    df = df.drop("package_name", axis=1) # drop feature that does not interest us
    df["review"] = df["review"].str.strip().str.lower() # remove space in the text and convert text to lowercase
                            # removing WHITE SPACES! : blank spaces, tab character, new line - remove where they are too much (3 instead of 1)
    return df

data = process_text(data)

data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [10]:
from sklearn.model_selection import train_test_split  # prepare the data to use model

X = data["review"]
y = data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer # Transform the text into a word count matrix: to obtain numerical features from the text

vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
from sklearn.naive_bayes import MultinomialNB # Use the MultinomialNB when the target is binary and the predictors are categorical numbers

model = MultinomialNB()
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [14]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8156424581005587

Also test the other models

In [15]:
from sklearn.naive_bayes import GaussianNB 

model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8044692737430168

In [16]:
from sklearn.naive_bayes import BernoulliNB 

model = BernoulliNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.770949720670391

Optimizing the model

In [17]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),    # dictionary of parameters: we choose the best parameters for the model
    # Use these two parameters to use the random search
    "fit_prior": [True, False]
}

# We initialize the random search: it selects parameters randomly, quicker than grid search
# Randomly get settings: best parameters to make the model work well
random_search = RandomizedSearchCV(model, hyperparams, n_iter = 50, scoring = "accuracy", cv = 5, random_state = 42)
random_search

In [18]:
random_search.fit(X_train, y_train)

print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'fit_prior': True, 'alpha': 0.01}


In [19]:
model = MultinomialNB(alpha = 1.917638190954774, fit_prior = False) # use the parameters given by random search in the model that interests us
model.fit(X_train, y_train)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)
# small improvement

0.8212290502793296

In [None]:
# from pickle import dump

# dump(model, open("where I want the model.modelname.sav", "wb"))