# TextClassification

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
sms = pd.read_table('sms.tsv', header=None, names=['label', 'message'])

In [3]:
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms.tail()

Unnamed: 0,label,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [5]:
sms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
sms.shape

(5572, 2)

In [7]:
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

### changing the categorical column  into integer type

In [8]:
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})

In [9]:
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Spliting the data into target variable and Independent variable 

In [10]:
X= sms['message']

In [11]:
y= sms['label_num']

In [12]:
X.shape

(5572,)

In [13]:
y.shape

(5572,)

### Spliting the data into train and test 

In [14]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.25, random_state=42)

### Vectorizing our dataset

In [15]:
vect= CountVectorizer()

In [16]:
X_train_trans= vect.fit_transform(X_train)

In [17]:
X_train_trans

<4179x7490 sparse matrix of type '<class 'numpy.int64'>'
	with 55879 stored elements in Compressed Sparse Row format>

In [18]:
X_test_trans= vect.transform(X_test)

In [19]:
X_test_trans

<1393x7490 sparse matrix of type '<class 'numpy.int64'>'
	with 16940 stored elements in Compressed Sparse Row format>

### Building and Evaluating our models 

### 1. NaiveBayes

In [20]:
nb = MultinomialNB()

In [21]:
nb.fit(X_train_trans, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
y_pred= nb.predict(X_test_trans)

In [23]:
nb.score(X_test_trans, y_test)

0.98851399856424982

In [24]:
confusion_matrix(y_test, y_pred)

array([[1203,    4],
       [  12,  174]])

In [25]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1207
          1       0.98      0.94      0.96       186

avg / total       0.99      0.99      0.99      1393



In [26]:
from sklearn.metrics import roc_auc_score

In [27]:
roc_auc_score(y_test, y_pred)

0.9660849346553706

### 2. LogisticRegression

In [28]:
log= LogisticRegression()

In [29]:
log.fit(X_train_trans, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
y_pred1 = log.predict(X_test_trans)

In [31]:
log.score(X_test_trans, y_test)

0.98492462311557794

In [32]:
print(confusion_matrix(y_test, y_pred1))

[[1207    0]
 [  21  165]]


In [33]:
print(classification_report(y_test, y_pred1))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1207
          1       1.00      0.89      0.94       186

avg / total       0.99      0.98      0.98      1393



In [34]:
roc_auc_score(y_test, y_pred1)

0.94354838709677424

### Conclusion:

** 1. Here We can clearly see that the naive bayes is having more accuracy than logistic regression model.**

** 2. In terms of predicting the False negatives, again naive bayes did very well with roc score of 0.966 **