<a href="https://colab.research.google.com/github/olinyoder2534/NLP_practice/blob/main/BoW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import pandas as pd
import numpy as np
import spacy

In [57]:
df = pd.read_csv('/content/spam.csv')

In [58]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [59]:
df.shape

(5572, 2)

In [60]:
df.dtypes

Category    object
Message     object
dtype: object

In [61]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [62]:
#1 for spam, 0 for non spam
df['Spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [63]:
df = df.drop('Category', axis = 1)
df.head()

Unnamed: 0,Message,Spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [64]:
#resample data
from imblearn.under_sampling import RandomUnderSampler

In [65]:
rus = RandomUnderSampler(random_state=42)
df_balanced, y_resampled = rus.fit_resample(df[['Message', 'Spam']], df['Spam'])

df_balanced = pd.DataFrame(df_balanced, columns=['Message', 'Spam'])
df_balanced.head()

Unnamed: 0,Message,Spam
0,If i not meeting ü all rite then i'll go home ...,0
1,"I.ll always be there, even if its just in spir...",0
2,"Sorry that took so long, omw now",0
3,I thk 50 shd be ok he said plus minus 10.. Did...,0
4,Dunno i juz askin cos i got a card got 20% off...,0


In [66]:
df_balanced['Spam'].value_counts()

Spam
0    747
1    747
Name: count, dtype: int64

In [67]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['Spam'], test_size = .2)

In [68]:
X_train.head()

1247    Someone U know has asked our dating service 2 ...
587                       Like  &lt;#&gt; , same question
1200    LIFE has never been this much fun and great un...
1240    Thanks for your ringtone order, reference numb...
711                                Meet after lunch la...
Name: Message, dtype: object

In [69]:
nlp = spacy.load("en_core_web_sm")

In [70]:
#lemmatize
def lemmatize_message(message):
    doc = nlp(message)
    return ' '.join([token.lemma_ for token in doc])

In [72]:
X_train = X_train.apply(lemmatize_message)

In [73]:
X_train.head()

1247    someone u know have ask our date service 2 con...
587                    like   & lt;#&gt ; , same question
1200    life have never be this much fun and great unt...
1240    thank for your ringtone order , reference numb...
711                               meet after lunch la ...
Name: Message, dtype: object

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<1195x3668 sparse matrix of type '<class 'numpy.int64'>'
	with 19880 stored elements in Compressed Sparse Row format>

In [75]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [76]:
X_train_cv.shape

(1195, 3668)

In [80]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

In [81]:
model.fit(X_train_cv, y_train)

In [83]:
X_test_cv = v.transform(X_test)

In [86]:
y_pred = model.predict(X_test_cv)

In [92]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97       160
           1       0.94      0.98      0.96       139

    accuracy                           0.96       299
   macro avg       0.96      0.96      0.96       299
weighted avg       0.96      0.96      0.96       299



In [93]:
emails = [
    'Congrats! You have been selected to help Okonkwo, the Nigerian Prince.',
    'See attached file',
    'You won $1000. Click the link below to enter your info to receive your reward.',
    'Hey man, are you coming for a game tomorrow',
    "Want to come over to watch a movie?"
]

In [95]:
emails_cv = v.transform(emails)

In [97]:
model.predict(emails_cv)

array([1, 0, 1, 0, 0])

In [99]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

In [100]:
clf.fit(X_train, y_train)

In [103]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       160
           1       1.00      0.95      0.97       139

    accuracy                           0.98       299
   macro avg       0.98      0.97      0.98       299
weighted avg       0.98      0.98      0.98       299

