In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=["label", "message"])

def text_process(mess):
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

messages['cleaned_message'] = messages['message'].apply(text_process)


In [3]:
w2v_model = Word2Vec(sentences=messages['cleaned_message'], vector_size=100, window=5, min_count=1, workers=4)

In [4]:
def get_average_word2vec(tokens_list, model, vector_size):
    if len(tokens_list) < 1:
        return np.zeros(vector_size)
    vectorized = [model.wv[word] for word in tokens_list if word in model.wv]
    return np.mean(vectorized, axis=0)

vector_size = 100
X = np.array([get_average_word2vec(tokens, w2v_model, vector_size) for tokens in messages['cleaned_message']])
y = messages['label']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

In [8]:
y_pred = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.93      1.00      0.96       966
        spam       1.00      0.50      0.67       149

    accuracy                           0.93      1115
   macro avg       0.96      0.75      0.82      1115
weighted avg       0.94      0.93      0.92      1115



In [9]:
import pickle

In [10]:
pickle.dump(rf_clf,open('rf.pkl','wb'))