In [44]:
import numpy as np
import pandas as pd
import os
import re

###  Read the file 

In [108]:
input_data = pd.read_csv("SMSSpamCollection.csv", header=None)
input_data.head()
input_data.columns = ["text"]

col1 = input_data.text.str.split("\t").str[0]  ## Extract Target Variable
df_col1 = pd.DataFrame(col1) ## Convert Into data frame
df_col1.columns = ["type"] ### Change the column name
df_col1.shape ## Checking The total no of records


col2 = input_data.text.str.split("\t").str[1] ## Content column
df_col2 = pd.DataFrame(col2)## Convert Into data frame
df_col2.shape

## Add temporary columns ID for merging the two data frame
df_col1["id"] = df_col1.index 
df_col2["id"] = df_col2.index
new_data = df_col1.merge(df_col2)
new_data

## Drop The Temporary column from the new data frame
new_data.drop(new_data.columns[1], axis = 1, inplace = True)
new_data.head()

new_data.shape

(5574, 2)

In [109]:
text_data = new_data["text"]
text_data.head()

0                                Go until jurong point
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4                     Nah I don't think he goes to usf
Name: text, dtype: object

In [110]:
text1 = text_data.tolist()

## Preprocessing Text Mining 

for i in range(len(text1)):
    text1[i] = re.sub(r'\s+', ' ', text1[i])
    text1[i] = re.sub('[\d]', ' ', text1[i])
    

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer(ngram_range = (1,1), stop_words = 'english')
tfidf = tfidf_vector.fit_transform(text1)
tfidf.shape

dense_mat = tfidf.todense()
dense_mat
a = pd.DataFrame(dense_mat, columns= tfidf_vector.get_feature_names())
a.shape

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [121]:
from sklearn.naive_bayes import MultinomialNB

classification = MultinomialNB().fit(tfidf, new_data["type"])
pred = classification.predict(tfidf)
print new_data["type"].value_counts()

## Build Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report
prediction = confusion_matrix(new_data["type"], pred)
prediction

print (classification_report(new_data["type"], pred))

ham     4827
spam     747
Name: type, dtype: int64
             precision    recall  f1-score   support

        ham       0.98      1.00      0.99      4827
       spam       1.00      0.84      0.91       747

avg / total       0.98      0.98      0.98      5574



### Logistic Regression

In [126]:
from sklearn import linear_model
logreg = linear_model.LogisticRegression()

log_model = logreg.fit(tfidf, new_data["type"])
pred_log = log_model.predict(tfidf)
print(classification_report(new_data["type"], pred_log))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      4827
       spam       0.99      0.74      0.85       747

avg / total       0.97      0.96      0.96      5574



###  Spliting The Data into train and test, on top of that we build Naive Bayes and Logistic Model

In [141]:
from sklearn.cross_validation import train_test_split

x_train, x_test, y_train, y_test = train_test_split(dense_mat, new_data["type"],test_size = 0.3, random_state = 5211)


###  Build Logistic Regression Model

In [145]:
from sklearn import linear_model
logreg_train = linear_model.LogisticRegression()
clr_log = logreg_train.fit(x_train, y_train)

###  Predict on train and validation data

In [148]:
pred_train = clr_log.predict(x_train)
pred_test = clr_log.predict(x_test)

print(classification_report(y_train, pred_train))
print(classification_report(y_test, pred_test))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.98      3364
       spam       0.99      0.69      0.81       537

avg / total       0.96      0.96      0.95      3901

             precision    recall  f1-score   support

        ham       0.94      1.00      0.97      1463
       spam       0.99      0.57      0.72       210

avg / total       0.95      0.95      0.94      1673



###  Build Naive Bayes Model 

In [150]:
naive_model = MultinomialNB().fit(x_train, y_train)
naive_train = naive_model.predict(x_train)
naive_test = naive_model.predict(x_test)

###  Predict On Train and Test Data

In [151]:
print(classification_report(y_train, naive_train))
print(classification_report(y_test, naive_test))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      3364
       spam       1.00      0.79      0.88       537

avg / total       0.97      0.97      0.97      3901

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1463
       spam       1.00      0.69      0.81       210

avg / total       0.96      0.96      0.96      1673

