In [1]:
#flatten means join multiple lists/vectors 

import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [14]:
doc = nlp("played")

doc[0].lemma_

'play'

In [15]:
import pandas as pd

In [21]:
df = pd.read_csv("spam.csv")

df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
#imbalanced dataset
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [23]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=="spam" else 0)

In [24]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["Message"], df["spam"], test_size = 0.2)

In [30]:
type(X_train)

pandas.core.series.Series

In [33]:
type(X_train.values)

numpy.ndarray

In [35]:
from sklearn.feature_extraction.text import CountVectorizer


CV = CountVectorizer()

X_train_cv = CV.fit_transform(X_train.values)

X_train_cv

<4457x7712 sparse matrix of type '<class 'numpy.int64'>'
	with 59267 stored elements in Compressed Sparse Row format>

In [37]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
X_train_cv.shape

(4457, 7712)

In [39]:
CV.get_feature_names_out()[1000:1050]

array(['anti', 'antibiotic', 'any', 'anybody', 'anymore', 'anyone',
       'anyones', 'anyplaces', 'anythiing', 'anythin', 'anything',
       'anythingtomorrow', 'anytime', 'anyway', 'anyways', 'anywhere',
       'aom', 'apart', 'apartment', 'apes', 'apeshit', 'aphex', 'apo',
       'apologetic', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appeal', 'appear', 'appendix', 'applausestore',
       'applebees', 'apples', 'application', 'apply', 'applyed',
       'applying', 'appointment', 'appointments', 'appreciate',
       'appreciated', 'approaches', 'approaching', 'approve', 'approved',
       'approx', 'apps', 'appt'], dtype=object)

In [40]:
CV.get_feature_names_out().shape

(7712,)

In [41]:
dir(CV)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_

In [42]:
CV.vocabulary_

{'claim': 1832,
 '200': 344,
 'shopping': 6083,
 'spree': 6380,
 'just': 3853,
 'call': 1618,
 '08717895698': 127,
 'now': 4812,
 'have': 3350,
 'you': 7672,
 'won': 7540,
 'mobstorequiz10ppm': 4522,
 'blank': 1393,
 'is': 3726,
 'but': 1584,
 'wat': 7357,
 'lol': 4165,
 'nervous': 4715,
 'lt': 4226,
 'gt': 3250,
 'are': 1059,
 'free': 2985,
 'can': 1643,
 'ok': 4886,
 'lor': 4183,
 'not': 4801,
 'too': 6930,
 'early': 2511,
 'me': 4384,
 'still': 6451,
 'having': 3355,
 'project': 5415,
 'meeting': 4403,
 'if': 3587,
 'havent': 3353,
 'collected': 1896,
 'the': 6772,
 'dough': 2430,
 'pls': 5229,
 'let': 4068,
 'know': 3944,
 'so': 6247,
 'go': 3157,
 'to': 6892,
 'place': 5199,
 'sent': 5991,
 'it': 3737,
 'get': 3115,
 'control': 1995,
 'number': 4827,
 'good': 3179,
 'morning': 4557,
 'my': 4636,
 'boytoy': 1487,
 'how': 3510,
 'those': 6818,
 'yummy': 7694,
 'lips': 4119,
 'where': 7447,
 'sexy': 6020,
 'buns': 1569,
 'what': 7436,
 'do': 2376,
 'think': 6802,
 'of': 4861,
 'crave

In [65]:
CV.get_feature_names_out()[344]

'200'

In [53]:
import numpy as np
X_train_np = X_train_cv.toarray()

X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [54]:
np.where(X_train_np[0] != 0)

(array([ 127,  344, 1618, 1832, 3350, 3853, 4522, 4812, 6083, 6380, 7540,
        7672], dtype=int64),)

In [57]:
X_train[:4][3780]

'Claim a 200 shopping spree, just call 08717895698 now! Have you won! MobStoreQuiz10ppm'

In [58]:
X_train_np[:4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [67]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train_cv, y_train)

In [68]:
X_test_cv = CV.transform(X_test)

In [72]:
from sklearn.metrics import classification_report

y_hat = model.predict(X_test_cv)

In [74]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.97      0.91      0.94       151

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [81]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_cv = CV.transform(emails)

model.predict(emails_cv)

array([0, 1], dtype=int64)

In [82]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [83]:
clf.fit(X_train, y_train)

In [85]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.97      0.91      0.94       151

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

