In [1]:
# pip install scikit-learn
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated SMS spam collection into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='./UPDATED_NLP_COURSE/TextFiles/smsspamcollection.tsv',
    sep='\t',
)

In [3]:
# Peek at the first five rows to see the available columns.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
# Count missing values per column (isna is the alias of isnull).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
# Number of rows (messages) in the dataset.
df.shape[0]

5572

In [9]:
# Preview the two numeric feature columns. Limit the display to the first
# ten rows instead of asking the notebook to render the whole frame.
df[['length', 'punct']].head(10)

Unnamed: 0,length,punct
0,111,9
1,29,6
2,155,6
3,49,6
4,61,2
5,147,8
6,77,2
7,160,6
8,157,6
9,154,2


In [10]:
# Distinct class labels present in the 'label' column.
df.loc[:, 'label'].unique()

array(['ham', 'spam'], dtype=object)

In [11]:
# Class balance: how many messages carry each label, most frequent first.
df['label'].value_counts(sort=True, ascending=False)

ham     4825
spam     747
Name: label, dtype: int64

In [12]:
### Splitting the data into training and test sets

In [13]:
from sklearn.model_selection import train_test_split

In [19]:
# Feature matrix: the two numeric columns 'length' and 'punct'.
X = df.loc[:, ['length', 'punct']]

# Target vector: the class label of each message.
y = df['label']

# Hold out 30% of the rows as test data. Fixing random_state seeds the
# shuffle, so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [20]:
# (rows, columns) of the training feature matrix.
X_train.shape

(3900, 2)

In [23]:
# (rows, columns) of the held-out test feature matrix.
X_test.shape

(1672, 2)

In [24]:
# Number of training labels; should match the row count of X_train.
y_train.shape

(3900,)

In [25]:
# Number of test labels; should match the row count of X_test.
y_test.shape

(1672,)

In [26]:
# Preview the test features. The index shows the original row numbers,
# confirming the split shuffled the data. Only the first ten rows are shown
# rather than asking the notebook to render the whole frame.
X_test.head(10)

Unnamed: 0,length,punct
3245,147,14
944,116,1
1044,102,3
2484,45,0
812,112,4
2973,65,2
2991,126,7
2942,95,3
230,34,0
1181,26,3


In [27]:
### Training the model

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
# Logistic-regression classifier configured to use the 'lbfgs' solver.
lr_model = LogisticRegression(solver='lbfgs')

In [30]:
# Train on the training split; fit() returns the estimator, whose repr is shown.
lr_model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
### Testing our model

In [32]:
from sklearn import metrics

In [35]:
# Predict a label for every row of the held-out test features.
predictions = lr_model.predict(X=X_test)

In [36]:
# Show the predicted labels (NumPy elides the middle of long arrays).
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [39]:
# Raw confusion matrix: rows are the true labels, columns the predicted ones.
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1404   44]
 [ 219    5]]


In [40]:
df = pd.DataFrame (metrics.confusion_matrix (y_test, predictions), index=['ham', 'spam'], columns=['ham', 'spam'])

In [41]:
df

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [42]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [45]:
# Overall accuracy: the fraction of test messages labelled correctly.
# Read it alongside the per-class report, since 'ham' makes up most of the test set.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.8427033492822966


In [54]:
## Trying out another model: multinomial naive Bayes
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)

# Rows are the true labels, columns the predicted ones.
print(metrics.confusion_matrix(y_true=y_test, y_pred=nb_predictions))

[[1438   10]
 [ 224    0]]


In [58]:
### Trying out a support vector machine classifier
from sklearn.svm import SVC

# Construct the classifier with gamma='auto' and train it in one step.
svc_model = SVC(gamma='auto').fit(X_train, y_train)
svc_predictions = svc_model.predict(X_test)

# Rows are the true labels, columns the predicted ones.
print(metrics.confusion_matrix(y_true=y_test, y_pred=svc_predictions))

[[1373   75]
 [ 121  103]]
