# Working with the test data

### Installing the packages

In [3]:
#!pip install gensim
#!pip install nltk
from gensim.models import Word2Vec
from nltk import sent_tokenize
from nltk import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\macia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import numpy as np
import pandas as pd
import emoji
from nltk.corpus import stopwords
import string
from nltk.tokenize import RegexpTokenizer

### Reading in the data

I had issues with reading in the data in the traditional ways "pd.read..."

I decided to use with with open..... I used this in another project.

In [6]:
#path = "/Users/miklp/Documents/GitHub/Student-Projects/datasets_483_982_spam.csv"
path = "/Users/macia/Documents/MSIA-21/IRL-MSIA/datasets_483_982_spam.csv"

#path_1 = "https://raw.githubusercontent.com/mowgl-i/Student-Projects/master/datasets_483_982_spam.csv"
with open(path) as file:
    data = pd.read_csv(file)
data.head()

#data = pd.read_csv(path_1)


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


 Looks like only two of the columns are useful to us. 
 
 That would be the "class" of message that we get. Either "ham" or "spam"
 and the actual "message" we get. 
 
 Let's rename the columns and select only theh two columns we need. 

In [7]:
data.columns = ["class","text", "none","none","none"]

data = data[["class","text"]]

data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.shape

(5572, 2)

In [7]:
data.dtypes

class    object
text     object
dtype: object

In [8]:
data.isnull().sum()

class    0
text     0
dtype: int64

In [10]:
# Let's make sure only 2 classes exist

data.groupby("class").count()

Unnamed: 0_level_0,text
class,Unnamed: 1_level_1
ham,4825
spam,747


In [10]:
print(round(747/5572 * 100), "%  of our data is classified as spam")

13 %  of our data is classified as spam


Let's replicate the word2vec tutorial. 

One thing that I was having trouble with was removing the punctuation from the text.
I suspect that if I could traditionally read in the data, then traditional string manipulation methods would work?


# Data Preprocess

I aim to lowercase, remove punctuation, lemmatize and tokenize all emails.  

In [176]:
text = data['text']

In [181]:
tokens = list(data['text'].replace(",",'').str.lower().apply(word_tokenize))


In [202]:
tokens_1 = data['text'].tolist()
tokens_1 = str(tokens_1) 
test = sent_tokenize(tokens_1)

alpha_ = [t for t in sent_tokenize(tokens_1) if t.isalnum()]



79609

In [178]:
print(tokens[0:5])

[['go', 'until', 'jurong', 'point', ',', 'crazy..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...'], ['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...'], ['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', 'fa', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', 'c', "'s", 'apply', '08452810075over18', "'s"], ['u', 'dun', 'say', 'so', 'early', 'hor', '...', 'u', 'c', 'already', 'then', 'say', '...'], ['nah', 'i', 'do', "n't", 'think', 'he', 'goes', 'to', 'usf', ',', 'he', 'lives', 'around', 'here', 'though']]


In [180]:
from nltk.corpus import stopwords
#nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')



english_stops = set(stopwords.words('english'))

text = str(text).lower() 
alphabetical_only = [t for t in word_tokenize(text) if t.isalnum()]
No_stopwords = [t for t in alphabetical_only if t not in english_stops]

wordNlemma = WordNetLemmatizer()
lemma_text = [wordNlemma.lemmatize(t) for t in No_stopwords]

print(text[:100])
#print(alphabetical_only[:100])
#print(No_stopwords[:100])
#print(lemma_text[:100])

TypeError: 'method' object is not subscriptable

In [163]:
print(len(text))
print(len(alphabetical_only))
print(len(No_stopwords))
print(len(lemma_text))

3577
574
353
353


 In this format, the text here is just like the example word2vec tutorial. A list of lists where each list is a "message". In the tutorial, each list was a "sentence".

In [169]:
# Using the tutorial set params

test_vector = Word2Vec(tokens, min_count = 1, window = 3, workers = 1, size =  2, seed = 1, sg = 1)
print(test_vector)

Word2Vec(vocab=9861, size=2, alpha=0.025)


In [145]:
# I want to clean up the text, naturally, but spam emails contain strange words so we should keep
# if we plan to build a spam classifier.

test_vector.wv.most_similar('sex')

[('box61', 1.0),
 ('noline', 0.9999999403953552),
 ('their', 0.9999999403953552),
 ('09066612661', 0.9999999403953552),
 ('auction', 0.9999999403953552),
 ('havenåõt', 0.9999998807907104),
 ('me.need', 0.9999998807907104),
 ('ar', 0.9999998807907104),
 ('under', 0.9999998807907104),
 ('nokia6650', 0.9999998807907104)]

In [146]:
test_vector.wv.most_similar('class')

[('jerry', 1.0),
 ('look', 1.0),
 ('demand', 0.9999998807907104),
 ('fab', 0.9999998807907104),
 ('narcotics', 0.9999998807907104),
 ('82468', 0.9999998211860657),
 ('anjola', 0.9999998211860657),
 ('lodge', 0.9999997019767761),
 ('strokes', 0.9999995827674866),
 ('oz', 0.999999463558197)]

In [149]:
test_vector.wv.most_similar('home')

[('it', 1.0),
 ('not', 0.9999998807907104),
 ('gpu', 0.9999998807907104),
 ('watched', 0.9999998807907104),
 ('10th', 0.9999997615814209),
 ('skins', 0.9999997615814209),
 ('\\mix\\', 0.9999995827674866),
 ('sexiest', 0.9999995231628418),
 ('tryin', 0.9999992847442627),
 ('determine', 0.9999987483024597)]

In [150]:
test_vector[test_vector.wv.vocab][0]

# This is the position of the first vocab word in our wv space

  """Entry point for launching an IPython kernel.


array([2.0983374, 1.7074937], dtype=float32)

# Now let's do this with the 'clean' data
this data contains only alpabetical numbers, which are all lowercase without any stop words. 




In [164]:
print(lemma_text[:100])

['0', 'go', 'jurong', 'point', 'available', '1', 'ok', 'lar', 'joking', 'wif', 'u', 'oni', '2', 'free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'fina', '3', 'u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say', '4', 'nah', 'think', 'go', 'usf', 'life', 'aro', '5', 'freemsg', 'hey', 'darling', '3', 'week', 'n', '6', 'even', 'brother', 'like', 'speak', '7', 'per', 'request', 'melle', 'oru', 'minnamin', '8', 'winner', 'valued', 'network', 'customer', '9', 'mobile', '11', 'month', 'u', 'r', 'entitle', '10', 'gon', 'na', 'home', 'soon', 'want', 'tal', '11', 'six', 'chance', 'win', 'cash', '100', 'po', '12', 'urgent', '1', 'week', 'free', 'membership', '13', 'searching', 'right', 'word', 'tha', '14', 'date', 'sunday', '15', 'xxxmobilemovieclub', 'use']


In [168]:
test_vector = Word2Vec(lemma_text, min_count = 1, window = 3, workers = 1, size =  2, seed = 1, sg = 1)
print(test_vector)

Word2Vec(vocab=38, size=2, alpha=0.025)


In [167]:
test_vector.wv.most_similar('searching')

KeyError: "word 'searching' not in vocabulary"

In [140]:
test_vector.wv.most_similar('home')

KeyError: "word 'home' not in vocabulary"

In [157]:
from collections import Counter
bow = Counter(lemma_text)
bow.most_common(20)

280

### Now, I'd like to see which tokens / words are most used with each class. 

In [22]:
token_counts = data.groupby("class")['text'].apply(lambda x: nltk.FreqDist(nltk.tokenize.word_tokenize(' '.join(x))))

In [23]:
token_counts_df = token_counts.to_frame().reset_index()#sort_values(by = "text", ascending =  False)


#token_counts_df.nlargest(3, columns = "text")

print(token_counts_df[token_counts_df["class"] == "ham"].nlargest(10,columns="text"))
print(token_counts_df[token_counts_df["class"] == "spam"].nlargest(10,columns="text"))


# As you can see it'd be important to remove stop words and punctuation. 

     class level_1    text
97     ham       .  3886.0
1374   ham       I  1910.0
9295   ham     you  1697.0
8616   ham      to  1540.0
79     ham       ,  1500.0
378    ham       ?  1367.0
99     ham     ...  1223.0
8500   ham     the  1022.0
2999   ham       a   969.0
5610   ham       i   917.0
      class level_1    text
9390   spam       .  1005.0
12774  spam      to   608.0
9349   spam       !   542.0
9382   spam       ,   371.0
11571  spam       a   358.0
13005  spam     you   189.0
11721  spam    call   187.0
13006  spam    your   187.0
12396  spam      or   185.0
9353   spam       &   178.0
