## Text data using sklearn


In [107]:
simple_train=['call you tonight','Call me a cab','please call me...PLEASE!']

In [108]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [109]:
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [110]:
vect.get_feature_names()

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [111]:
simple_train_dtm = vect.transform(simple_train)

In [112]:
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [113]:
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [114]:
import pandas as pd

In [115]:
pd.DataFrame(simple_train_dtm.toarray(), columns = vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [116]:
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [117]:
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [118]:
simple_test = ["Please dont call me"]

In [119]:
simple_test_dtm = vect.transform(simple_test)

In [120]:
pd.DataFrame(simple_test_dtm.toarray(), columns = vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


In [121]:
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
sms = pd.read_table(url, header=None , names=['label','messages'])

In [122]:
sms.shape

(5572, 2)

In [123]:
sms.head(10)

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [124]:
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [125]:
sms['label_num']=sms.label.map({'ham':0, 'spam':1})

In [126]:
sms.head()

Unnamed: 0,label,messages,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [127]:
X = sms.messages
y = sms.label_num
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [128]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [129]:
X_train.shape

(4179,)

In [130]:
X_test.shape

(1393,)

In [131]:
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [132]:
X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [133]:
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [134]:
from sklearn.naive_bayes import MultinomialNB 
nb = MultinomialNB()

In [135]:
%time nb.fit(X_train_dtm,y_train)

Wall time: 375 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [136]:
y_pred_class = nb.predict(X_test_dtm)

In [137]:
from sklearn import metrics 
metrics.accuracy_score(y_test, y_pred_class)

0.9885139985642498

In [138]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[1203,    5],
       [  11,  174]], dtype=int64)

In [139]:
X_test[y_pred_class > y_test] 

574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: messages, dtype: object

In [140]:
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]

In [141]:
y_pred_prob

array([2.87744864e-03, 1.83488846e-05, 2.07301295e-03, ...,
       1.09026171e-06, 1.00000000e+00, 3.98279868e-09])

In [142]:
metrics.roc_auc_score(y_test, y_pred_prob)

0.9866431000536962

In [143]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

%time logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)


Wall time: 31.2 ms




In [144]:
y_pred_prob = logreg.predict_proba(X_test_dtm)[:,1]
y_pred_prob

array([0.01269556, 0.00347183, 0.00616517, ..., 0.03354907, 0.99725053,
       0.00157706])

In [145]:
metrics.accuracy_score(y_test,y_pred_class)

0.9877961234745154

In [146]:
metrics.roc_auc_score(y_test,y_pred_prob)

0.9936817612314301

In [147]:
X_train_tokens= vect.get_feature_names()
len(X_train_tokens)

7456

In [148]:
print(X_train_tokens[0:50])

['00', '000', '008704050406', '0121', '01223585236', '01223585334', '0125698789', '02', '0207', '02072069400', '02073162414', '02085076972', '021', '03', '04', '0430', '05', '050703', '0578', '06', '07', '07008009200', '07090201529', '07090298926', '07123456789', '07732584351', '07734396839', '07742676969', '0776xxxxxxx', '07781482378', '07786200117', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '07880867867', '0789xxxxxxx', '07946746291', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '08', '0800', '08000407165', '08000776320', '08000839402', '08000930705']


In [149]:
print(X_train_tokens[-50:])

['yer', 'yes', 'yest', 'yesterday', 'yet', 'yetunde', 'yijue', 'ym', 'ymca', 'yo', 'yoga', 'yogasana', 'yor', 'yorge', 'you', 'youdoing', 'youi', 'youphone', 'your', 'youre', 'yourjob', 'yours', 'yourself', 'youwanna', 'yowifes', 'yoyyooo', 'yr', 'yrs', 'ything', 'yummmm', 'yummy', 'yun', 'yunny', 'yuo', 'yuou', 'yup', 'zac', 'zaher', 'zealand', 'zebra', 'zed', 'zeros', 'zhong', 'zindgi', 'zoe', 'zoom', 'zouk', 'zyada', 'èn', '〨ud']


In [150]:
nb.feature_count_

array([[ 0.,  0.,  0., ...,  1.,  1.,  1.],
       [ 5., 23.,  2., ...,  0.,  0.,  0.]])

In [151]:
nb.feature_count_.shape

(2, 7456)

In [152]:
ham_token_count = nb.feature_count_[0,:]
spam_token_count = nb.feature_count_[1,:]
print(ham_token_count)
print(spam_token_count
     )

[0. 0. 0. ... 1. 1. 1.]
[ 5. 23.  2. ...  0.  0.  0.]


In [153]:
tokens = pd.DataFrame({'token':X_train_tokens,'ham':ham_token_count,'spam':spam_token_count})
tokens.head()

Unnamed: 0,token,ham,spam
0,0,0.0,5.0
1,0,0.0,23.0
2,8704050406,0.0,2.0
3,121,0.0,1.0
4,1223585236,0.0,1.0


In [154]:
tokens.sample(5, random_state=6)

Unnamed: 0,token,ham,spam
6977,very,64.0,2.0
4521,nasty,1.0,1.0
6999,villa,0.0,1.0
1266,beloved,1.0,0.0
6522,textoperator,0.0,2.0


In [155]:
nb.class_count_

array([3617.,  562.])

In [170]:
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1

In [171]:
tokens.sample(5 , random_state=6)

Unnamed: 0,token,ham,spam,spam_ratio
6977,very,2.000558,2.003559,6.381252
4521,nasty,2.000553,2.003556,6.431106
6999,villa,2.000553,2.003556,6.431995
1266,beloved,2.000553,2.003552,6.425379
6522,textoperator,2.000553,2.003559,6.437723


In [172]:
tokens['ham'] = tokens.ham / nb.class_count_[0]
tokens['spam'] = tokens.spam / nb.class_count_[1]
tokens.sample(5 , random_state=6)

Unnamed: 0,token,ham,spam,spam_ratio
6977,very,0.000553,0.003565,6.381252
4521,nasty,0.000553,0.003565,6.431106
6999,villa,0.000553,0.003565,6.431995
1266,beloved,0.000553,0.003565,6.425379
6522,textoperator,0.000553,0.003565,6.437723


In [173]:
tokens['spam_ratio'] = tokens.spam / tokens.ham
tokens.sample(5 , random_state=6)

Unnamed: 0,token,ham,spam,spam_ratio
6977,very,0.000553,0.003565,6.445598
4521,nasty,0.000553,0.003565,6.445603
6999,villa,0.000553,0.003565,6.445603
1266,beloved,0.000553,0.003565,6.445593
6522,textoperator,0.000553,0.003565,6.445613


In [174]:
tokens.sort_values('spam_ratio', ascending=False)


Unnamed: 0,token,ham,spam,spam_ratio
6656,to,0.000553,0.003568,6.450491
1552,call,0.000553,0.003567,6.448311
7424,your,0.000553,0.003566,6.447533
7420,you,0.000553,0.003566,6.447458
2864,free,0.000553,0.003566,6.447191
2821,for,0.000553,0.003566,6.447114
4778,or,0.000553,0.003566,6.446989
4662,now,0.000553,0.003566,6.446985
6542,the,0.000553,0.003566,6.446984
6805,txt,0.000553,0.003566,6.446751
