# Imports

In [14]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/filipdej/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/filipdej/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data and tokenization

In [12]:
data = "In a hole in the ground there lived a hobbit Not a nasty dirty wet hole filled with the ends of worms and an oozy smell nor yet a dry bare sandy hole with nothing in it to sit down on or to eat it was a hobbit hole and that means comfort"
words = word_tokenize(data)
print(words)

['In', 'a', 'hole', 'in', 'the', 'ground', 'there', 'lived', 'a', 'hobbit', 'Not', 'a', 'nasty', 'dirty', 'wet', 'hole', 'filled', 'with', 'the', 'ends', 'of', 'worms', 'and', 'an', 'oozy', 'smell', 'nor', 'yet', 'a', 'dry', 'bare', 'sandy', 'hole', 'with', 'nothing', 'in', 'it', 'to', 'sit', 'down', 'on', 'or', 'to', 'eat', 'it', 'was', 'a', 'hobbit', 'hole', 'and', 'that', 'means', 'comfort']


# Stop words

In [3]:
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
words_filtered = [w for w in words if w not in stopWords]
print(words_filtered)

['In', 'hole', 'ground', 'lived', 'hobbit', 'Not', 'nasty', 'dirty', 'wet', 'hole', 'filled', 'ends', 'worms', 'oozy', 'smell', 'yet', 'dry', 'bare', 'sandy', 'hole', 'nothing', 'sit', 'eat', 'hobbit', 'hole', 'means', 'comfort']


In [6]:
set(words) - set(words_filtered)

{'a',
 'an',
 'and',
 'down',
 'in',
 'it',
 'nor',
 'of',
 'on',
 'or',
 'that',
 'the',
 'there',
 'to',
 'was',
 'with'}

# Stemming

In [7]:
ps = SnowballStemmer('english')
words_stemmed = [ps.stem(w) for w in words_filtered]
print(words_stemmed)

['in', 'hole', 'ground', 'live', 'hobbit', 'not', 'nasti', 'dirti', 'wet', 'hole', 'fill', 'end', 'worm', 'oozi', 'smell', 'yet', 'dri', 'bare', 'sandi', 'hole', 'noth', 'sit', 'eat', 'hobbit', 'hole', 'mean', 'comfort']


# Bag of Words representation

In [8]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([' '.join(words_stemmed)])
print(dict(zip(vectorizer.get_feature_names(), *X.toarray())))

{'bare': 1, 'comfort': 1, 'dirti': 1, 'dri': 1, 'eat': 1, 'end': 1, 'fill': 1, 'ground': 1, 'hobbit': 2, 'hole': 4, 'in': 1, 'live': 1, 'mean': 1, 'nasti': 1, 'not': 1, 'noth': 1, 'oozi': 1, 'sandi': 1, 'sit': 1, 'smell': 1, 'wet': 1, 'worm': 1, 'yet': 1}


# Tf-idf representation

In [16]:
# vectorizer = TfidfVectorizer(stop_words='english', analyzer=words_stemmed)
# Y = vectorizer.fit_transform(files).todense()

NameError: name 'files' is not defined