In [26]:
%matplotlib qt
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

In [10]:
# Given
dir_data = os.path.expanduser('~/data/ml/smsspamcollection/')

In [12]:
df = pd.read_csv(os.path.join(dir_data, 'SMSSpamCollection'), delimiter='\t',
                 header=None)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])

print('Number of spam messages:', df[df[0] == 'spam'][0].count())
print('Number of ham messages:', df[df[0] == 'ham'][0].count())

Number of spam messages: 747
Number of ham messages: 4825


In [19]:
X_test_raw.count()

1393

In [15]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

In [16]:
X_test

<1393x7457 sparse matrix of type '<class 'numpy.float64'>'
	with 16992 stored elements in Compressed Sparse Row format>

In [30]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
for msg, prediction in zip(X_test_raw[:5], predictions[:5]):
    print("Prediction: %s. Message: %s" % (prediction, msg))

Prediction: ham. Message: I am taking you for italian food. How about a pretty dress with no panties? :)
Prediction: ham. Message: No break time one... How... I come out n get my stuff fr ü?
Prediction: ham. Message: You'll never believe this but i have actually got off at taunton. Wow
Prediction: ham. Message: Siva is in hostel aha:-.
Prediction: ham. Message: Just woke up. Yeesh its late. But I didn't fall asleep til &lt;#&gt; am :/


In [27]:
cnfsn_mat = confusion_matrix(y_test, predictions)
print(cnfsn_mat)
plt.figure()
plt.matshow(cnfsn_mat)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

[[1203    2]
 [  43  145]]
