In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('./spam.csv')

In [3]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.Category.value_counts() / data.shape[0] * 100

ham     86.593683
spam    13.406317
Name: Category, dtype: float64

# Stopwords

In [5]:
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english') 

def noStopWords(text):
    
    lis = text.split(' ')
    liste = []
    for word in lis:
        # on enleve la ponctuation
        word = [char for char in word if char not in string.punctuation]
        word = ''.join(word)
        liste.append(word)

    # on enleve les stopwords
    noStopliste = [word for word in liste if word.lower() not in stop_words]

    return noStopliste

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\osman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
data['msgNoStop'] = data.Message.apply(noStopWords)
data.head()

Unnamed: 0,Category,Message,msgNoStop
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, jurong, point, crazy, Available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, dont, think, goes, usf, lives, around, t..."


In [7]:
data.msgNoStop[0]

['Go',
 'jurong',
 'point',
 'crazy',
 'Available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'Cine',
 'got',
 'amore',
 'wat']

# Stemming

In [8]:
from nltk.stem import SnowballStemmer

def stemListe(l):
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(w) for w in l]

In [9]:
data['stem'] = data.msgNoStop.apply(stemListe)
data.head()

Unnamed: 0,Category,Message,msgNoStop,stem
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, jurong, point, crazy, Available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho..."


# Lematization

In [10]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\osman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
def lemaWords(li):
    lematizer = WordNetLemmatizer()
    return [lematizer.lemmatize(w) for w in li]

In [12]:
data['lem'] = data.msgNoStop.apply(lemaWords)

In [13]:
data.head()

Unnamed: 0,Category,Message,msgNoStop,stem,lem
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, jurong, point, crazy, Available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[Go, jurong, point, crazy, Available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[Nah, dont, think, go, usf, life, around, though]"


In [14]:
' '.join(data.lem[0])

'Go jurong point crazy Available bugis n great world la e buffet Cine got amore wat'

# TF - IDF

In [15]:
data['lemString'] = data.lem.apply(lambda x: ' '.join(x))


Unnamed: 0,Category,Message,msgNoStop,stem,lem,lemString
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, jurong, point, crazy, Available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[Go, jurong, point, crazy, Available, bugis, n...",Go jurong point crazy Available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]",Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin...",Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[U, dun, say, early, hor, U, c, already, say]",U dun say early hor U c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[Nah, dont, think, go, usf, life, around, though]",Nah dont think go usf life around though


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.lemString)
print(X)

  (0, 8579)	0.198815322464582
  (0, 1167)	0.3486933052544185
  (0, 3714)	0.16363884074024213
  (0, 2158)	0.2945775133441825
  (0, 1839)	0.3328654507722598
  (0, 4682)	0.2945775133441825
  (0, 8804)	0.23696359501268566
  (0, 3752)	0.1940778445754089
  (0, 1841)	0.2945775133441825
  (0, 1393)	0.26515305693057367
  (0, 2424)	0.2700389619655321
  (0, 6189)	0.24170107290185877
  (0, 4528)	0.3486933052544185
  (0, 3660)	0.15469022055939277
  (1, 5806)	0.5462933604518558
  (1, 8705)	0.43136824573695387
  (1, 4497)	0.5233633708961773
  (1, 4720)	0.4080786340459629
  (1, 5775)	0.27394819096941153
  (2, 71)	0.2333367067135839
  (2, 1252)	0.16856224050859708
  (2, 6512)	0.2333367067135839
  (2, 8232)	0.12539002251716486
  (2, 6462)	0.2333367067135839
  (2, 6568)	0.16611007463523586
  :	:
  (5567, 6032)	0.2203260479507876
  (5568, 3084)	0.6457639024659789
  (5568, 3443)	0.5628210076825846
  (5568, 3677)	0.3636095248537397
  (5568, 4043)	0.3660732290385455
  (5569, 7300)	0.5268366486379191
  (5569,