In [2]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('SMSSpamCollection.txt',sep='\t', header=None)
dataset.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
dataset.columns = ["label", "content"]
dataset.head(5)

Unnamed: 0,label,content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
import gensim

In Gensim, simple_preprocess() is a utility function used primarily for tokenization and lowercasing of text. It is commonly used to preprocess text data before training models like Word2Vec or Doc2Vec.

In [5]:
dataset['content_list'] = dataset['content'].apply(lambda x: gensim.utils.simple_preprocess(x))
dataset.head()

Unnamed: 0,label,content,content_list
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset['content_list'],
                                                    dataset['label'], test_size=0.2)
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

In [7]:
X_train

0       [oops, sorry, just, to, check, that, you, don,...
1                   [ofcourse, also, upload, some, songs]
2                         [think, he, is, waste, for, rr]
3                           [sleeping, nt, feeling, well]
4       [naughty, little, thought, its, better, to, fl...
                              ...                        
4452    [thanks, for, being, there, for, me, just, to,...
4453    [no, worries, hope, photo, shoot, went, well, ...
4454    [hi, ibh, customer, loyalty, offer, the, new, ...
4455    [hope, you, are, having, good, week, just, che...
4456    [exactly, anyways, how, far, is, jide, her, to...
Name: content_list, Length: 4457, dtype: object

In [8]:
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=1)

In [9]:
words = set(w2v_model.wv.index_to_key)

In [10]:
w2v_model.wv['sms']

array([-0.15142632,  0.21867478,  0.05885001,  0.12663871,  0.15928213,
       -0.53906465,  0.07473314,  0.6517776 , -0.3603537 , -0.12470852,
       -0.19115774, -0.4507057 , -0.02539421,  0.18959759,  0.12222846,
       -0.29978102,  0.10197094, -0.53295   , -0.06405067, -0.6666478 ,
        0.14740033,  0.07682581,  0.17461619, -0.16603442, -0.086747  ,
       -0.00962673, -0.272106  , -0.16067363, -0.36208418,  0.00834167,
        0.36958823, -0.04969084,  0.07407066, -0.35968542, -0.15670985,
        0.31081018, -0.04028242, -0.27017716, -0.26608014, -0.4661021 ,
       -0.02053764, -0.35705453, -0.13470306,  0.10903416,  0.2841492 ,
       -0.04548687, -0.25264126, -0.04296437,  0.07542148,  0.26872188,
        0.20428345, -0.14496394, -0.13589919, -0.04947977, -0.17465486,
        0.05001741,  0.22373298, -0.00931252, -0.2620245 ,  0.11640106,
        0.0711076 ,  0.1306734 , -0.00418927,  0.05813746, -0.2940321 ,
        0.41918877,  0.19666962,  0.2583003 , -0.45738348,  0.53

In [11]:
x_train = X_train.tolist()
x_train

[['oops',
  'sorry',
  'just',
  'to',
  'check',
  'that',
  'you',
  'don',
  'mind',
  'picking',
  'me',
  'up',
  'tomo',
  'at',
  'half',
  'eight',
  'from',
  'station',
  'would',
  'that',
  'be',
  'ok'],
 ['ofcourse', 'also', 'upload', 'some', 'songs'],
 ['think', 'he', 'is', 'waste', 'for', 'rr'],
 ['sleeping', 'nt', 'feeling', 'well'],
 ['naughty',
  'little',
  'thought',
  'its',
  'better',
  'to',
  'flirt',
  'flirt',
  'flirt',
  'rather',
  'than',
  'loving',
  'someone',
  'gettin',
  'hurt',
  'hurt',
  'hurt',
  'gud',
  'nyt'],
 ['no', 'yes', 'please', 'been', 'swimming'],
 ['yeah', 'confirmed', 'for', 'you', 'staying', 'at', 'that', 'weekend'],
 ['hello',
  'how',
  'doing',
  'what',
  'been',
  'up',
  'when',
  'will',
  'moving',
  'out',
  'of',
  'the',
  'flat',
  'cos',
  'will',
  'need',
  'to',
  'arrange',
  'to',
  'pick',
  'up',
  'the',
  'lamp',
  'etc',
  'take',
  'care',
  'hello',
  'caroline'],
 ['how',
  'are',
  'you',
  'doing',
  'h

In [12]:
x_test = X_test.tolist()
x_test

[['this',
  'is',
  'the',
  'nd',
  'time',
  'we',
  'have',
  'tried',
  'to',
  'contact',
  'have',
  'won',
  'the',
  'prize',
  'claim',
  'is',
  'easy',
  'just',
  'call',
  'now',
  'only',
  'per',
  'minute',
  'bt',
  'national',
  'rate'],
 ['sorry', 'in', 'meeting', 'll', 'call', 'later'],
 ['if', 'you', 'home', 'then', 'come', 'down', 'within', 'min'],
 ['we',
  'know',
  'someone',
  'who',
  'you',
  'know',
  'that',
  'fancies',
  'you',
  'call',
  'to',
  'find',
  'out',
  'who',
  'pobox',
  'ls',
  'hb'],
 ['our',
  'mobile',
  'number',
  'has',
  'won',
  'to',
  'claim',
  'calls',
  'us',
  'back',
  'or',
  'ring',
  'the',
  'claims',
  'hot',
  'line',
  'on'],
 ['talk',
  'sexy',
  'make',
  'new',
  'friends',
  'or',
  'fall',
  'in',
  'love',
  'in',
  'the',
  'worlds',
  'most',
  'discreet',
  'text',
  'dating',
  'service',
  'just',
  'text',
  'vip',
  'to',
  'and',
  'see',
  'who',
  'you',
  'could',
  'meet'],
 ['yes',
  'he',
  'is',


## Applying Word2Vec for training set

In [13]:
x_train_vect = []
temp1=[]

#print(len(X_train))
for ls in x_train:
    for i in ls:
        if i in words:
            temp1.append(np.array(w2v_model.wv[i]))
    x_train_vect.append(np.mean(np.array(temp1), axis = 0))

In [14]:
x_train_vect = np.array(x_train_vect)
x_train_vect.shape

(4457, 100)

## Applying Word2Vec for Test Set

In [15]:
x_test_vect = []
temp1=[]

for ls in x_test:
    for i in ls:
        if i in words:
            temp1.append(np.array(w2v_model.wv[i]))
    x_test_vect.append(np.mean(np.array(temp1), axis = 0))

In [16]:
x_test_vect = np.array(x_test_vect)
x_test_vect.shape

(1115, 100)

## Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc_classifier = rfc.fit(x_train_vect, y_train)

### Accuracy for Training Set

In [23]:
y_pred = rfc_classifier.predict(x_train_vect)

In [24]:
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
accuracy = (y_pred==y_train).sum()/len(y_pred)

precision = precision_score(y_train, y_pred, pos_label='ham')
recall = recall_score(y_train, y_pred, pos_label='ham')

print('Accuracy: {} , Precision: {} , Recall: {}'.format(accuracy, precision, recall))
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

Accuracy: 0.9991025353376711 , Precision: 0.9989645353352317 , Recall: 1.0
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3859
        spam       1.00      0.99      1.00       598

    accuracy                           1.00      4457
   macro avg       1.00      1.00      1.00      4457
weighted avg       1.00      1.00      1.00      4457

[[3859    0]
 [   4  594]]


### Accuracy for Test Set

In [25]:
y_pred = rfc_classifier.predict(x_test_vect)

In [26]:
precision = precision_score(y_test, y_pred, pos_label='ham')
recall = recall_score(y_test, y_pred, pos_label='ham')
accuracy = (y_pred==y_test).sum()/len(y_pred)

print('Accuracy: {} , Precision: {} , Recall: {}'.format(accuracy, precision, recall))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8663677130044843 , Precision: 0.8663677130044843 , Recall: 1.0
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       966
        spam       0.00      0.00      0.00       149

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115

[[966   0]
 [149   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
spam_detect_model = LogisticRegression().fit(x_train_vect, y_train)

### Predicting for Training Set

In [34]:
y_pred = spam_detect_model.predict(x_train_vect)

In [35]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_train,y_pred)
print(score)

0.8658290329818263


### Predicting for Test Set

In [36]:
y_pred = spam_detect_model.predict(x_test_vect)

In [37]:
score=accuracy_score(y_test,y_pred)
print(score)

0.8663677130044843
