# Introduzione, concetti basici

In [1]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [2]:
X.shape

(150, 4)

In [3]:
y.shape

(150,)

In [4]:
import pandas as pd
pd.DataFrame(X, columns = iris.feature_names).head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [6]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X,y) # learning the relationships between features and response
knn.predict([[3,4,5,2]])

array([1])

# representing text as numerical data

In [7]:
simple_train = ['call you tonight', 'call me a cab', 'please call me ...PLEASE']

In [8]:
# convert text into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [9]:
# learn the vocabulary of training data
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
# examine fitted vocabulary, no duplicates. 
vect.get_feature_names()

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [11]:
# document-term matrix
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [12]:
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [13]:
pd.DataFrame(simple_train_dtm.toarray(), columns = vect.get_feature_names())
# one row per document
# one column per token
# vectorization: turning collection of documents into feature matrix (bag of words, no keep track of order)


Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [14]:
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [15]:
print(simple_train_dtm)
# left: coordinates of non vero values
# right: values at that point
# sparse matrixec: all you store is: locations and value (non zero)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [16]:
# we use this because most of values are zero, so... we save memory.

In [17]:
# test document
simple_test = ['Please don\'t call me']
simple_test

["Please don't call me"]

In [18]:
# to make predictions: train/test must have same features...
# so we need to use transform method
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()


array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [19]:
pd.DataFrame(simple_test_dtm.toarray(), columns= vect.get_feature_names())
# 'dont' was not seen in training --> it was dropped

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


we are pretending that we trained our model on that 6x3 matrix, 
and now we are ready to make predictionof something. for example... nice vs mean messages.
label as nice or mean. 
we pass our new document (must be numeric, and features must be same as in training)
so if i use .predict... we need to pass this array up here. 
why are we ok with dropping the word 'dont'? >> because we dont know anything about the relationship
trining goal: learn relationship feature - response. 
since we didnt use the word dont in training, we dont know what to do with that. 
it's like using a new feature only in predictions step... wouldnt know what to do with it man

so:
vect.fit learns the vocabulary
vect.transform   uses  vocabulary to build matrix
vect.transform(test)   same, drops terms not seen before. 


# reading text based dataset into pandas

In [20]:
file_path = r'C:\Users\Mfornaroli\Desktop\data_spam.tsv'
sms = pd.read_table(file_path, encoding = "ISO-8859-1", header=None, names=['label', 'messages'])
sms.head(10)

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [21]:
sms.shape

(5572, 2)

In [22]:
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [23]:
# convert labels to numerical variable
# create new column, map values to numbers
sms['label_num'] = sms.label.map(dict(zip(['ham', 'spam'], [0, 1])))
sms.head()

Unnamed: 0,label,messages,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [24]:
X = sms.messages
y = sms.label_num
print(X.shape)  # it's important that its 1D now. we will transform it to matrix late
print(y.shape)


(5572,)
(5572,)


In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
# we split before, so that we will have new words in the testing set
# real world: feature get dropped! not seen in training. 
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


# vectorizing the dataset

In [26]:
vect = CountVectorizer()   # istantiate the vectorizer
vect.fit(X_train)    # learn the vocabulary of training data
X_train_dtm = vect.transform(X_train)  # returns matrix

In [27]:
# we can do it at the same time actually, slightly faster. 
X_train_dtm = vect.fit_transform(X_train)

In [28]:
X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [29]:
X_test_dtm = vect.transform(X_test)
# no fitting: we would get a different set of tokens, probably smaller
# you can see: vocabulary is the same, when transforming testing data in doc term matrix (dropping new tokens!)
X_test_dtm

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

# Multinomial Naive Bayes classifier

In [30]:
# suitable for classification with discrete - features (integer values)
# tf - idf would work too
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [31]:
%time nb.fit(X_train_dtm, y_train)
# naive bayes is very fast

Wall time: 126 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
y_pred_class = nb.predict(X_test_dtm) # must have same number of columns!

In [33]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9885139985642498

In [34]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[1203,    5],
       [  11,  174]], dtype=int64)

In [35]:
# false positives
y_pred_class.shape
print(type(y_pred_class))
y_serie_preds = pd.Series(y_pred_class)
print(type(y_series))

<class 'numpy.ndarray'>


NameError: name 'y_series' is not defined

In [None]:
print(X_test[(y_pred_class == 1 ) & (y_test == 0)])
# boolean series
# really easy! works as long as they have the same shape, right?
print(X_test[1988])

In [36]:
a = pd.DataFrame(y_test)
a['new_col'] = list(range(1393))
for k in range(1393):
    predicted_value = int(y_pred_class[k])
    true_value = int(a.loc[a.new_col == k, 'label_num'] )
    if predicted_value != true_value:
        print("error, index = ", k)
        
# sono 16! come il totale di errori effettivamente commessi

error, index =  40
error, index =  72
error, index =  206
error, index =  556
error, index =  567
error, index =  583
error, index =  585
error, index =  644
error, index =  815
error, index =  822
error, index =  884
error, index =  1149
error, index =  1169
error, index =  1266
error, index =  1319
error, index =  1381


In [37]:
print(X_test[(y_pred_class == 0 ) & (y_test == 1)])
print(X_test[(y_pred_class == 0 ) & (y_test == 1)].index)

3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - ?It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: messages, dtype: object
Int64Index([3132, 5, 3530, 684, 1875, 1893, 4298, 4949, 2821, 2247, 4514], dtype='int64')


In [38]:
print(X_test[4674])
# pandas series can be sliced with indexes, like this

I forgot 2 ask ü all smth.. There's a card on da present lei... How? Ü all want 2 write smth or sign on it?


In [39]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:,1]  # output has 2 columns, prob for each class! so 2 cols in this case
# we consider only prob of class 1
y_pred_prob


array([2.87744864e-03, 1.83488846e-05, 2.07301295e-03, ...,
       1.09026171e-06, 1.00000000e+00, 3.98279868e-09])

In [40]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.9866431000536962

# comparing models (logReg)

In [41]:
'''
Naive Bayes doesnt make good predicted probabilitie
produces extreme values, not actual probabilities!
POPULAR WITH TXT ML!
'''

'\nNaive Bayes doesnt make good predicted probabilitie\nproduces extreme values, not actual probabilities!\nPOPULAR WITH TXT ML!\n'

In [42]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
%time logreg.fit(X_train_dtm, y_train) # slower!
y_pred_class_logreg = logreg.predict(X_test_dtm)
y_pred_prob_logreg = logreg.predict_proba(X_test_dtm)[:, 1]
print(metrics.accuracy_score(y_test, y_pred_class_logreg))
print(metrics.roc_auc_score(y_test, y_pred_prob_logreg))

Wall time: 22.5 ms
0.9877961234745154
0.9936817612314301


# examine the NB model

In [43]:
# what we do: we calculate the approximate 'spamm-ness' of each token
# spammy and hammy words! 
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)   # in alphabetical order, dude!

7456

In [44]:
# NB counts the number of times each token appears in EACH CLASS
print(nb.feature_count_.shape)
nb.feature_count_
# for each token. given a class, what's the probability. 

(2, 7456)


array([[ 0.,  0.,  0., ...,  0.,  1.,  1.],
       [ 5., 23.,  2., ...,  1.,  0.,  0.]])

In [45]:
# calculate spamminess (simple hack)
ham_token_count = nb.feature_count_[0,:]
spam_token_count = nb.feature_count_[1,:]

In [46]:
tokens = pd.DataFrame({'token': X_train_tokens, 'ham': ham_token_count, 'spam': spam_token_count})
print(tokens.shape)
tokens.head()


(7456, 3)


Unnamed: 0,token,ham,spam
0,0,0.0,5.0
1,0,0.0,23.0
2,8704050406,0.0,2.0
3,121,0.0,1.0
4,1223585236,0.0,1.0


In [47]:
tokens.sample(10, random_state=6)  # cool method for random sampling

Unnamed: 0,token,ham,spam
6977,versus,1.0,0.0
4521,nasty,1.0,1.0
6999,vill,1.0,0.0
1266,beloved,1.0,0.0
6522,textoperator,0.0,2.0
1035,arng,2.0,0.0
250,1013,0.0,1.0
5703,scores,1.0,1.0
4504,nahi,2.0,0.0
4036,long,35.0,0.0


In [48]:
nb.class_count_

array([3617.,  562.])

In [49]:
# normalize counts
tokens['ham'] = tokens.ham / nb.class_count_[0] * 30000 + 1/nb.class_count_[0]
# normalize counts
tokens['spam'] = tokens.spam / nb.class_count_[1] * 30000 + 1/nb.class_count_[1]

In [50]:
tokens.sample(5, random_state=6) 

Unnamed: 0,token,ham,spam
6977,versus,8.294443,0.001779
4521,nasty,8.294443,53.382562
6999,vill,8.294443,0.001779
1266,beloved,8.294443,0.001779
6522,textoperator,0.000276,106.763345


In [51]:
tokens['spam_ratio'] = tokens.spam / tokens.ham
tokens.sample(5, random_state=6) 

Unnamed: 0,token,ham,spam,spam_ratio
6977,versus,8.294443,0.001779,0.000215
4521,nasty,8.294443,53.382562,6.435943
6999,vill,8.294443,0.001779,0.000215
1266,beloved,8.294443,0.001779,0.000215
6522,textoperator,0.000276,106.763345,386163.019573


In [52]:
tokens.sort_values('spam_ratio', ascending=False)

Unnamed: 0,token,ham,spam,spam_ratio
1766,claim,0.000276,4697.510676,1.699090e+07
5209,prize,0.000276,4003.560498,1.448088e+07
293,150p,0.000276,2562.279359,9.267764e+06
6680,tone,0.000276,2508.898577,9.074686e+06
3139,guaranteed,0.000276,2241.994662,8.109295e+06
...,...,...,...,...
4051,lor,978.711916,0.001779,1.818062e-06
5824,she,1061.653580,0.001779,1.676026e-06
3244,he,1401.714404,0.001779,1.269417e-06
4093,lt,1915.952723,0.001779,9.287074e-07


In [53]:
# look up the spam/ratio for a given token
tokens.shape
tokens=tokens.set_index('token')

In [54]:
# tokens.loc['yours', 'spam_ratio']
tokens.head()

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.000276,266.905694,965397.9
0,0.000276,1227.759786,4440807.0
8704050406,0.000276,106.763345,386163.0
121,0.000276,53.382562,193084.7
1223585236,0.000276,53.382562,193084.7


In [55]:
# let me just try something, dude....
tokens.columns

Index(['ham', 'spam', 'spam_ratio'], dtype='object')

In [56]:
tokens.columns

Index(['ham', 'spam', 'spam_ratio'], dtype='object')

In [63]:
tokens['new_column'] = (tokens['ham'] - tokens['spam'])/tokens['spam_ratio']

In [64]:
tokens.columns

Index(['ham', 'spam', 'spam_ratio', 'new_column'], dtype='object')

In [65]:
tokens.head()
# adding new columns like this, seems to work just fine. . . .

Unnamed: 0_level_0,ham,spam,spam_ratio,new_column
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.000276,266.905694,965397.9,-0.000276
0,0.000276,1227.759786,4440807.0,-0.000276
8704050406,0.000276,106.763345,386163.0,-0.000276
121,0.000276,53.382562,193084.7,-0.000276
1223585236,0.000276,53.382562,193084.7,-0.000276
