In [1]:
import pandas as pd

df = pd.read_csv('spam15.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [3]:
# Category column is text; need to convert it to numeric
df['spam'] = df.Category.apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.25)

In [5]:
# use sklearn CountVectorizer to create various features based on the unique words in column Message
from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer()
X_train_count = vector.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [6]:
# use multinomial Naive Bayes (in the previous lesson, we used Gaussian Naive Bayes)
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_count, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
# test with some text email
emails = [
    'Hey mohan, can we get together to watch football game tomorrow?',
    'Up to 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = vector.transform(emails)
model.predict(emails_count)

array([0, 1])

In [8]:
X_test_count = vector.transform(X_test)
model.score(X_test_count, y_test)

0.9885139985642498

In [10]:
# use sklearn pipeline to simplify the process above
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()), # convert text into CountVectorizer
    ('multinomial', MultinomialNB())   # apply multinomial Naive Bayes
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test) # this should print out the same score as above

0.9885139985642498

In [11]:
pipeline.predict(emails)

array([0, 1])

### Machine Learning Tutorial - Naive Bayes: Exercise

Use wine dataset from sklearn.datasets to classify wines into 3 categories. Load the dataset and split it into test and train. After that train the model using Gaussian and Multinominal classifier and post which model performs better. Use the trained model to perform some predictions on test data.

In [12]:
from sklearn.datasets import load_wine

wine = load_wine()
dir(wine)

['DESCR', 'data', 'feature_names', 'target', 'target_names']

In [13]:
wine.data.shape

(178, 13)

In [14]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [15]:
wine.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [16]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.2)

In [21]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)

1.0

In [23]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()), # convert text into CountVectorizer
    ('multinomial', MultinomialNB())   # apply multinomial Naive Bayes
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [None]:
there's some compiler error here as shown above.
In addition the model solution provider by the YouTuber himself stops after the fit() and score() with GaussianNB