# Text classification

https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Reading  Data

In [33]:
sms = pd.read_table('../SMSSpamCollection', header=None, names=['label', 'message'])
sms.shape
sms.groupby('label').describe()

In [3]:
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms['length'] = sms['message'].apply(len)

In [5]:
sms.head()

Unnamed: 0,label,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [6]:
sms.shape

(5572, 3)

In [8]:
sms.groupby('label').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4825.0,71.482487,58.440652,2.0,33.0,52.0,93.0,910.0
spam,747.0,138.670683,28.873603,13.0,133.0,149.0,157.0,223.0


In [155]:
sms.describe()

Unnamed: 0,label_num
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


# label converter

In [88]:
sms['label_num'] = sms['label'].map({'ham' :0, 'spam': 1})

In [10]:
sms.head()

Unnamed: 0,label,message,length,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",111,0
1,ham,Ok lar... Joking wif u oni...,29,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,1
3,ham,U dun say so early hor... U c already then say...,49,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,0


# Data Pre Processing

* Remove punctuation
* lower case
* tokenize
* remove stop words

In [13]:
sms.iloc[0]['message']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [14]:
temp = 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [15]:
import nltk

In [17]:
import string


In [22]:
removepunc = "".join([c for c in temp if c not in string.punctuation])

In [23]:
lower = "".join([c for c in temp if c not in string.punctuation]).lower()

In [24]:
tokenize = nltk.word_tokenize("".join([c for c in temp if c not in string.punctuation]).lower())

In [25]:
from nltk.corpus import stopwords

In [28]:
remove_stop = [word for word in tokenize if word not in stopwords.words('english')]

In [64]:
def preprocessText(msg):
    removepunc = [c for c in msg if c not in string.punctuation]
    removepunc = "".join(removepunc)
    lower = removepunc.lower()
    tokenize = nltk.word_tokenize(lower)
    remove_stop = [word for word in tokenize if word not in stopwords.words('english')]
    remove_digit = [word for word in remove_stop if not word.isdigit()]
    joinagain = " ".join(remove_digit)
    return joinagain

In [65]:
sms['pre_process'] = sms['message'].apply(preprocessText)

In [112]:
sms.head(10)

Unnamed: 0,label,message,pre_process,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts 21s...,1
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though,0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darling weeks word back id like fu...,1
6,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,per request melle melle oru minnaminunginte nu...,0
8,spam,WINNER!! As a valued network customer you have...,winner valued network customer selected receiv...,1
9,spam,Had your mobile 11 months or more? U R entitle...,mobile months u r entitled update latest colou...,1


In [133]:
sms.iloc[9]['message']

'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030'

In [68]:
sms.iloc[2]['pre_process']

'free entry wkly comp win fa cup final tkts 21st may text fa receive entry questionstd txt ratetcs apply 08452810075over18s'

# Feature Extraction

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [128]:
cv = CountVectorizer(analyzer='word')

In [129]:
feature_text = cv.fit_transform(sms['pre_process'])

In [None]:
cv.get_feature_names()

In [131]:
feature_text.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [132]:
feature_text.shape

(5572, 8904)

# Train and Test Splitting

In [85]:
from sklearn.model_selection import train_test_split

In [90]:
X_train, X_test, y_train, y_test = train_test_split(feature_text.toarray(), sms['label_num'], test_size = 0.3, random_state = 123)

In [91]:
X_train.shape

(3900, 8904)

In [92]:
X_test.shape

(1672, 8904)

In [93]:
y_train.shape

(3900,)

In [94]:
y_test.shape

(1672,)

# Apply Classification Algorithm

### Logistic Regression

In [95]:
from sklearn.linear_model import LogisticRegression

In [96]:
lr = LogisticRegression()

In [97]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [98]:
y_pred = lr.predict(X_test)

In [99]:
from sklearn.metrics import confusion_matrix

In [100]:
confusion_matrix(y_test, y_pred)

array([[1436,    6],
       [  32,  198]])

## Apply Model on new Test Data

In [149]:
testdata = 'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030'
#testdata = 'Hi How are you'
testdata = 'HDFC personal loan rate is 11.25%,  U R entitled to Update to the latest colour mobiles'

In [150]:
t = preprocessText(testdata)

In [151]:
t

'hdfc personal loan rate u r entitled update latest colour mobiles'

In [152]:
cv.transform([t])

<1x8904 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [153]:
lr.predict(cv.transform([t]))

array([0], dtype=int64)