# Imports

In [2]:
import numpy as np
import pandas as pd

# Load the data

In [3]:
# Read the tab-separated SMS spam collection into a DataFrame.
# pd.read_table uses a tab delimiter by default, so no explicit sep is needed.
df = pd.read_table('TextFiles/smsspamcollection.tsv')

# Preview the first five rows as the cell's output.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


# Split the data

In [4]:
from sklearn.model_selection import train_test_split

# Features are the raw message texts; targets are the corresponding labels.
X, y = df['message'], df['label']

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# TfidfVectorizer

In [5]:
# TfidfVectorizer combines CountVectorizer and TfidfTransformer into a single model.

In [14]:
# CountVectorizer:
    # Builds a vocabulary of words from a set of documents, then represents each
    # document as a numerical vector counting how many times each word appears in it.


In [17]:
# TfidfVectorizer: Tells how important a word in a document is relative to
    # a collection of documents.
    # Tf = Term Frequency (e.g. the term "dog" appears 1 time in document #1)
    # Idf = Inverse Document Frequency: Decreases the weight of terms that appear
        # frequently across the collection (e.g. "the", "to", "and" etc.) and
        # increases the weight of terms that appear infrequently
        # (e.g. "car", "blue", "book" etc.)

In [19]:
# Import TfidfVectorizer and create a standalone instance with default settings.
# NOTE(review): `vectorizer` is not referenced by the later cells shown here --
# the pipeline cell constructs its own TfidfVectorizer() -- but this import is
# what brings the TfidfVectorizer name into scope for that cell.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# LinearSVC

In [8]:
# I chose LinearSVC because it handles the sparse TF-IDF matrix produced by the
    # vectorizer better than SVC would, and it also scales well to large numbers of samples.

In [20]:
# Import LinearSVC and create a standalone classifier with default settings.
# NOTE(review): `clf` is not referenced by the later cells shown here -- the
# pipeline cell constructs its own LinearSVC() -- but this import is what
# brings the LinearSVC name into scope for that cell.
from sklearn.svm import LinearSVC
clf = LinearSVC()

# Build a Pipeline

In [21]:
# Chain the vectorizer and the classifier into a single estimator, so raw
# message text can be passed straight to fit() and predict().
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # LinearSVC's underlying solver uses a pseudo-random number generator while
    # fitting; pin the seed (same value as the train/test split) so that
    # re-running the notebook reproduces the same model.
    ('clf', LinearSVC(random_state=42)),
])

# Feed the training data into the pipeline: the vectorizer is fitted on the
# raw training messages and the classifier is then trained on its output.
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

# Test the classifier 

In [22]:
# Generate predictions for the held-out test messages.
# The fitted pipeline vectorizes the raw text itself before classifying, so
# X_test is passed in untransformed.
predictions = text_clf.predict(X_test)

In [23]:
# View the confusion matrix for the test-set predictions.
# Per scikit-learn's convention, rows are the true labels and columns are the
# predicted labels, with classes in sorted order (presumably ham first, then
# spam, given the label values shown in the data preview).
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[1586    7]
 [  12  234]]


In [25]:
# Sanity check: classify a single hand-written message. The one string is
# wrapped in a list, so predict() receives a collection of documents.
text_clf.predict(["You have WON a once in a lifetime SWEEPSTAKES! visit www.getrichTODAY.com to qualify!!"])

array(['spam'], dtype=object)