# Imports

In [44]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

[nltk_data] Downloading package punkt to /Users/filipdej/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/filipdej/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data and tokenization

In [86]:
# Sample text (punctuation already removed) used by the tokenization,
# stop-word and stemming cells.
data = "In a hole in the ground there lived a hobbit Not a nasty dirty wet hole filled with the ends of worms and an oozy smell nor yet a dry bare sandy hole with nothing in it to sit down on or to eat it was a hobbit hole and that means comfort"

# Split the raw string into a list of word tokens.
words = word_tokenize(text=data)

# Show the raw text first, then the token list, separated by blank lines.
print(data, "\n\n", words)

In a hole in the ground there lived a hobbit Not a nasty dirty wet hole filled with the ends of worms and an oozy smell nor yet a dry bare sandy hole with nothing in it to sit down on or to eat it was a hobbit hole and that means comfort 

 ['In', 'a', 'hole', 'in', 'the', 'ground', 'there', 'lived', 'a', 'hobbit', 'Not', 'a', 'nasty', 'dirty', 'wet', 'hole', 'filled', 'with', 'the', 'ends', 'of', 'worms', 'and', 'an', 'oozy', 'smell', 'nor', 'yet', 'a', 'dry', 'bare', 'sandy', 'hole', 'with', 'nothing', 'in', 'it', 'to', 'sit', 'down', 'on', 'or', 'to', 'eat', 'it', 'was', 'a', 'hobbit', 'hole', 'and', 'that', 'means', 'comfort']


# Stop words

Removing stop words: very common words (such as "a", "the", "of") that carry little meaning on their own.

In [3]:
# NLTK's English stop-word list, stored as a set for fast membership tests.
stopWords = set(stopwords.words('english'))

# Tokenize again so this cell does not rely on `words` from an earlier cell.
words = word_tokenize(data)

# Keep only the tokens that are not stop words.
# NOTE: the comparison uses each token exactly as written, so capitalised
# tokens such as 'In' and 'Not' are kept even though 'in' is filtered out.
words_filtered = list(filter(lambda token: token not in stopWords, words))
print(words_filtered)

['In', 'hole', 'ground', 'lived', 'hobbit', 'Not', 'nasty', 'dirty', 'wet', 'hole', 'filled', 'ends', 'worms', 'oozy', 'smell', 'yet', 'dry', 'bare', 'sandy', 'hole', 'nothing', 'sit', 'eat', 'hobbit', 'hole', 'means', 'comfort']


In [6]:
# Distinct tokens that the stop-word filter removed.
set(words).difference(words_filtered)

{'a',
 'an',
 'and',
 'down',
 'in',
 'it',
 'nor',
 'of',
 'on',
 'or',
 'that',
 'the',
 'there',
 'to',
 'was',
 'with'}

# Stemming

Reducing words to their stems by stripping suffixes (for example "lived" → "live", "worms" → "worm").

In [7]:
# Snowball stemmer configured for English.
ps = SnowballStemmer('english')

# Stem every token that survived stop-word filtering, preserving order.
words_stemmed = list(map(ps.stem, words_filtered))
print(words_stemmed)

['in', 'hole', 'ground', 'live', 'hobbit', 'not', 'nasti', 'dirti', 'wet', 'hole', 'fill', 'end', 'worm', 'oozi', 'smell', 'yet', 'dri', 'bare', 'sandi', 'hole', 'noth', 'sit', 'eat', 'hobbit', 'hole', 'mean', 'comfort']


# Bag of Words

Simple document vectorization

https://en.wikipedia.org/wiki/Bag-of-words_model

In [83]:
# Toy corpus: each string is treated as one document.
data = ['John likes icecream', 'Sarah likes John John likes Sarah', 'John plays game']

# Learn the vocabulary and count how often each term occurs in each document.
vectorizer = CountVectorizer()
Y = vectorizer.fit_transform(data)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Densify once and convert to plain Python ints so each dict prints as
# {'game': 0, ...} regardless of how the installed NumPy formats its scalars.
for text_repr, text in zip(Y.toarray().tolist(), data):
    print(text, ":")
    print(dict(zip(feature_names, text_repr)), "\n")

John likes icecream :
{'game': 0, 'icecream': 1, 'john': 1, 'likes': 1, 'plays': 0, 'sarah': 0} 

Sarah likes John John likes Sarah :
{'game': 0, 'icecream': 0, 'john': 2, 'likes': 2, 'plays': 0, 'sarah': 2} 

John plays game :
{'game': 1, 'icecream': 0, 'john': 1, 'likes': 0, 'plays': 1, 'sarah': 0} 



# Tf-idf 

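Weighted document vectorization: term counts are scaled down for terms that appear in many documents, so distinctive terms get more weight.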

https://en.wikipedia.org/wiki/Tf–idf

In [80]:
# Toy corpus: each string is treated as one document.
data = ['John likes icecream', 'Sarah likes John John likes Sarah', 'John plays game']
stemmer = SnowballStemmer('english')

# Stop-word removal has to be configured on the analyzer that does the
# tokenizing: TfidfVectorizer ignores `stop_words` when `analyzer` is a
# callable, so passing it alongside `analyzer=stemmed_words` has no effect.
analyzer = TfidfVectorizer(stop_words='english').build_analyzer()


def stemmed_words(doc):
    """Return a generator of stems for the tokens `analyzer` extracts from `doc`.

    `analyzer` lower-cases, tokenizes and drops English stop words; each
    remaining token is then reduced to its stem with the Snowball stemmer.
    """
    return (stemmer.stem(w) for w in analyzer(doc))


vectorizer = TfidfVectorizer(analyzer=stemmed_words)
# `.toarray()` gives a regular ndarray; `.todense()` would return the legacy
# `np.matrix` type, which NumPy discourages.
Y = vectorizer.fit_transform(data).toarray()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Plain Python floats keep the printed dicts free of NumPy scalar wrappers.
for text_repr, text in zip(Y.tolist(), data):
    print(text, ":")
    print(dict(zip(feature_names, text_repr)), "\n")

John likes icecream :
{'game': 0.0, 'icecream': 0.7203334490549893, 'john': 0.4254405389711991, 'like': 0.5478321549274363, 'play': 0.0, 'sarah': 0.0} 

Sarah likes John John likes Sarah :
{'game': 0.0, 'icecream': 0.0, 'john': 0.4254405389711991, 'like': 0.5478321549274363, 'play': 0.0, 'sarah': 0.7203334490549893} 

John plays game :
{'game': 0.652490884512534, 'icecream': 0.0, 'john': 0.3853716274664007, 'like': 0.0, 'play': 0.652490884512534, 'sarah': 0.0} 

