## MI-DDW - Text Mining

### NLTK

In [1]:
# Install the third-party packages this notebook needs via shell escapes.
# NOTE(review): the installs are unpinned and use `! pip`; IPython's `%pip` magic
# installs into the running kernel's environment — consider switching and pinning versions.
! pip install nltk
! pip install numpy
# Disabled install: twython is not referenced by any visible cell.
# ! pip install twython



In [1]:
import nltk
# Fetch the NLTK resources by name (calling `nltk.download()` with no arguments
# would open the interactive downloader instead).

# Corpora
nltk.download(
    [
        "brown",
        "webtext",
        "words",
        "stopwords",
    ]
)

# Models, lexicons and tagset documentation
nltk.download(
    [
        "punkt",
        "averaged_perceptron_tagger",
        "maxent_ne_chunker",
        "vader_lexicon",
        "wordnet",
        "tagsets",
    ]
)

[nltk_data] Downloading package brown to
[nltk_data]     /home/users/m/martilad/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     /home/users/m/martilad/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /home/users/m/martilad/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/m/martilad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/m/martilad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/users/m/martilad/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data

True

In [2]:
from nltk.corpus import brown
# Preview the Brown corpus.
# Only the last expression of a cell is displayed, so the two previews are printed
# explicitly; as bare expressions their values were computed and then discarded.
print(brown.words()[0:10])
print(brown.tagged_words()[0:10])

# Total number of word tokens in the corpus (shown as the cell result).
# Tip: `dir(brown)` lists the other attributes of the corpus reader.
len(brown.words())

1161192

### Text

In [4]:
# Read the whole input file into memory as one string.
# The encoding is stated explicitly because the text contains non-ASCII characters
# (curly quotes such as “ ”); relying on the platform default can fail or
# mis-decode on systems whose default is not UTF-8.
# NOTE(review): assumes text.txt is UTF-8 encoded — confirm against the data file.
with open('text.txt', 'r', encoding='utf-8') as myfile:
    text = myfile.read()

### Basic Operations

#### Counting

In [5]:
from collections import Counter
tokens = nltk.word_tokenize(text)
def tokenCounts(tokens):
    """Count token occurrences and rank them by frequency.

    Args:
        tokens: an iterable of hashable tokens (e.g. a flat list of strings).

    Returns:
        A list of ``(token, count)`` tuples ordered from the most to the
        least frequent token. Tokens with equal counts keep the order in
        which the Counter yields them.
    """
    return Counter(tokens).most_common()

tokenCounts(tokens)[:10]

[(',', 9132),
 ('to', 4086),
 ('.', 4085),
 ('the', 4058),
 ('of', 3596),
 ('and', 3423),
 ('her', 2107),
 ('I', 2065),
 ('a', 1897),
 ('was', 1837)]

#### Remove Punctuation

In [9]:
from string import punctuation
# Extend the ASCII punctuation string with curly quotes and a double dash.
punctuation = punctuation + '“”--'
tokens = nltk.word_tokenize(text)

# `punctuation` is a str, so `in` is a substring test: a token is dropped when it
# occurs anywhere inside that string (single marks, and runs such as '--').
nopunc_tokens = list(filter(lambda token: token not in punctuation, tokens))
tokenCounts(nopunc_tokens)[:10]

[('to', 4086),
 ('the', 4058),
 ('of', 3596),
 ('and', 3423),
 ('her', 2107),
 ('I', 2065),
 ('a', 1897),
 ('was', 1837),
 ('in', 1779),
 ('not', 1496)]

#### Stopwords

In [10]:
from nltk.corpus import stopwords
stops = stopwords.words('english')
# Set copy of the stopword list: membership tests are O(1) instead of a linear
# scan of the list for every token.
stop_set = set(stops)
tokens = nltk.word_tokenize(text)

# Drop punctuation tokens and stopwords in a single pass.
# NOTE: the stopword comparison is case-sensitive, so capitalised tokens such as
# 'I' or 'She' are not removed.
filtered_tokens = [
    token
    for token in tokens
    if token not in punctuation and token not in stop_set
]
tokenCounts(filtered_tokens)[:10]

[('I', 2065),
 ('Mr.', 768),
 ('Elizabeth', 631),
 ("'s", 574),
 ('could', 513),
 ('would', 465),
 ('said', 401),
 ('Darcy', 401),
 ('Mrs.', 337),
 ('She', 324)]

### Text Processing techniques

#### Sentence Splitting

In [12]:
# Split the raw text into a list of sentence strings.
sentences = nltk.sent_tokenize(text)

# Print a small window of sentences to inspect the splitter's output.
print(sentences[100:105])

['“Oh!', 'She is the most beautiful creature I ever beheld!', 'But there is one\nof her sisters sitting down just behind you, who is very pretty, and I\ndare say very agreeable.', 'Do let me ask my partner to introduce you.”\n\n“Which do you mean?” and turning round he looked for a moment at\nElizabeth, till catching her eye, he withdrew his own and coldly said:\n“She is tolerable, but not handsome enough to tempt _me_; I am in no\nhumour at present to give consequence to young ladies who are slighted\nby other men.', 'You had better return to your partner and enjoy her\nsmiles, for you are wasting your time with me.”\n\nMr. Bingley followed his advice.']


#### Tokenization

In [13]:
# Sentence-split first, then word-tokenize each sentence separately, which
# yields a list of token lists (one inner list per sentence).
sentences = nltk.sent_tokenize(text)
tokens = list(map(nltk.word_tokenize, sentences))

print(tokens[100:105])

[['“', 'Oh', '!'], ['She', 'is', 'the', 'most', 'beautiful', 'creature', 'I', 'ever', 'beheld', '!'], ['But', 'there', 'is', 'one', 'of', 'her', 'sisters', 'sitting', 'down', 'just', 'behind', 'you', ',', 'who', 'is', 'very', 'pretty', ',', 'and', 'I', 'dare', 'say', 'very', 'agreeable', '.'], ['Do', 'let', 'me', 'ask', 'my', 'partner', 'to', 'introduce', 'you.', '”', '“', 'Which', 'do', 'you', 'mean', '?', '”', 'and', 'turning', 'round', 'he', 'looked', 'for', 'a', 'moment', 'at', 'Elizabeth', ',', 'till', 'catching', 'her', 'eye', ',', 'he', 'withdrew', 'his', 'own', 'and', 'coldly', 'said', ':', '“', 'She', 'is', 'tolerable', ',', 'but', 'not', 'handsome', 'enough', 'to', 'tempt', '_me_', ';', 'I', 'am', 'in', 'no', 'humour', 'at', 'present', 'to', 'give', 'consequence', 'to', 'young', 'ladies', 'who', 'are', 'slighted', 'by', 'other', 'men', '.'], ['You', 'had', 'better', 'return', 'to', 'your', 'partner', 'and', 'enjoy', 'her', 'smiles', ',', 'for', 'you', 'are', 'wasting', 'your'

#### Part-Of-Speech Tagging

In [None]:
# Tag every sentence: sentence split -> word tokenize -> POS tag.
# `tagged` is a list (one entry per sentence) of lists of (word, tag) pairs.
sentences = nltk.sent_tokenize(text)
tokens = [nltk.word_tokenize(sent) for sent in sentences]
tagged = [nltk.pos_tag(sent) for sent in tokens]

print(tagged[100:105])

# Count verbs and nouns over the whole text.
# The tagger emits Penn Treebank tags, which come in families rather than single
# letters: verb tags start with 'VB' (VB, VBD, VBG, VBN, VBP, VBZ) and noun tags
# start with 'NN' (NN, NNS, NNP, NNPS). Comparing with == 'V' / == 'N' never matches.
verbs = 0
nouns = 0
for sentence_tags in tagged:
    for _word, tag in sentence_tags:
        if tag.startswith('VB'):
            verbs += 1
        elif tag.startswith('NN'):
            nouns += 1
print('verbs', verbs)
print('noun', nouns)
    

In [11]:
# Print the tagset documentation for the tags matching 'JJ*'
# (the recorded output lists JJ, JJR and JJS).
nltk.help.upenn_tagset('JJ*')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
JJR: adjective, comparative
    bleaker braver breezier briefer brighter brisker broader bumper busier
    calmer cheaper choosier cleaner clearer closer colder commoner costlier
    cozier creamier crunchier cuter ...
JJS: adjective, superlative
    calmest cheapest choicest classiest cleanest clearest closest commonest
    corniest costliest crassest creepiest crudest cutest darkest deadliest
    dearest deepest densest dinkiest ...


#### Stemming and Lemmatization

In [22]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
tokens = nltk.word_tokenize(text)

# Map each distinct token of the text to its Porter stem.
stems = dict((token, stemmer.stem(token)) for token in tokens)

# Spot-check two words, one result per line.
print(stems['universally'], stems['remorse'], sep='\n')

univers
remors


In [23]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize(text)

# Map each distinct token of the text to the lemma returned by the lemmatizer
# (called without a part-of-speech argument).
lemmas = dict((token, lemmatizer.lemmatize(token)) for token in tokens)

# Spot-check the same two words as in the stemming cell, one result per line.
print(lemmas['universally'], lemmas['remorse'], sep='\n')

universally
remorse


#### Named Entity Recognition

In [25]:
# Tokenize and POS-tag the whole text as one flat sequence (no sentence split),
# because the chunker below is given the tagged tokens directly.
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)

# binary=True: in the recorded output, recognised entities are subtrees labelled
# with the generic 'NE' label; the other items remain (word, tag) tuples.
ne_chunked = nltk.ne_chunk(tagged, binary=True)
print(ne_chunked[100:105])

[('you', 'PRP'), ('heard', 'VBP'), ('that', 'IN'), Tree('NE', [('Netherfield', 'NNP'), ('Park', 'NNP')]), ('is', 'VBZ')]


In [30]:
def extractEntities(ne_chunked):
    """Collect the named entities found in a chunked token sequence.

    Args:
        ne_chunked: an iterable whose items are either ``nltk.tree.Tree``
            subtrees (entities) or anything else (skipped).

    Returns:
        A dict mapping each entity's text (its leaf words joined by single
        spaces) to the label of its subtree. If the same text occurs more
        than once, the label of the last occurrence is kept.
    """
    return {
        " ".join(word for word, _tag in chunk.leaves()): chunk.label()
        for chunk in ne_chunked
        if isinstance(chunk, nltk.tree.Tree)
    }

extractEntities(ne_chunked)

{'Jane Austen': 'PERSON',
 'Mr. Bennet': 'PERSON',
 'Netherfield Park': 'PERSON',
 'Long': 'PERSON',
 'Netherfield': 'ORGANIZATION',
 'England': 'GPE',
 'Mr. Morris': 'PERSON',
 'Michaelmas': 'ORGANIZATION',
 'Single': 'PERSON',
 'Mr. Bingley': 'PERSON',
 'Sir William': 'PERSON',
 'Lady Lucas': 'PERSON',
 'Lizzy': 'PERSON',
 'Jane': 'PERSON',
 'Lydia': 'PERSON',
 'Bennet': 'PERSON',
 'least.': 'ORGANIZATION',
 'Elizabeth': 'PERSON',
 'Kitty': 'PERSON',
 'Heaven': 'PERSON',
 'Mary': 'PERSON',
 'Mr. Bingley.': 'PERSON',
 'Hertfordshire': 'ORGANIZATION',
 'London': 'GPE',
 'Mr. Hurst': 'PERSON',
 'Mr. Darcy': 'PERSON',
 'Derbyshire': 'ORGANIZATION',
 'Miss Bingley': 'PERSON',
 'Elizabeth Bennet': 'PERSON',
 'Come': 'ORGANIZATION',
 'Darcy': 'PERSON',
 'Catherine': 'ORGANIZATION',
 'Longbourn': 'ORGANIZATION',
 'Everybody': 'PERSON',
 'Miss Lucas': 'PERSON',
 'Miss King': 'PERSON',
 'Maria Lucas': 'PERSON',
 'God': 'PERSON',
 'Netherfield House': 'FACILITY',
 'Bingley': 'PERSON',
 'Meryton

In [31]:
# Re-run the chunker with binary=False so that entities carry a type label;
# the recorded output shows labels such as PERSON, ORGANIZATION, GPE and FACILITY.
# This rebinds `ne_chunked`, replacing the binary result from the earlier cell.
ne_chunked = nltk.ne_chunk(tagged, binary=False)
extractEntities(ne_chunked)

{'Jane Austen': 'PERSON',
 'Mr. Bennet': 'PERSON',
 'Netherfield Park': 'PERSON',
 'Long': 'PERSON',
 'Netherfield': 'ORGANIZATION',
 'England': 'GPE',
 'Mr. Morris': 'PERSON',
 'Michaelmas': 'ORGANIZATION',
 'Single': 'PERSON',
 'Mr. Bingley': 'PERSON',
 'Sir William': 'PERSON',
 'Lady Lucas': 'PERSON',
 'Lizzy': 'PERSON',
 'Jane': 'PERSON',
 'Lydia': 'PERSON',
 'Bennet': 'PERSON',
 'least.': 'ORGANIZATION',
 'Elizabeth': 'PERSON',
 'Kitty': 'PERSON',
 'Heaven': 'PERSON',
 'Mary': 'PERSON',
 'Mr. Bingley.': 'PERSON',
 'Hertfordshire': 'ORGANIZATION',
 'London': 'GPE',
 'Mr. Hurst': 'PERSON',
 'Mr. Darcy': 'PERSON',
 'Derbyshire': 'ORGANIZATION',
 'Miss Bingley': 'PERSON',
 'Elizabeth Bennet': 'PERSON',
 'Come': 'ORGANIZATION',
 'Darcy': 'PERSON',
 'Catherine': 'ORGANIZATION',
 'Longbourn': 'ORGANIZATION',
 'Everybody': 'PERSON',
 'Miss Lucas': 'PERSON',
 'Miss King': 'PERSON',
 'Maria Lucas': 'PERSON',
 'God': 'PERSON',
 'Netherfield House': 'FACILITY',
 'Bingley': 'PERSON',
 'Meryton

#### Sentiment Analysis

In [None]:
from nltk.sentiment.util import *
from nltk.sentiment import SentimentIntensityAnalyzer

# For each VADER score component (negative / neutral / positive) remember the
# highest score seen so far and the sentence that produced it.
bneu = ""
neu = 0
bneg = ""
neg = 0
bpos = ""
pos = 0


vader_analyzer = SentimentIntensityAnalyzer()
for sentence in nltk.sent_tokenize(text):
    # Score the current sentence. Scoring the whole `text` here would yield the
    # same scores on every iteration, so only the first sentence could ever win.
    scores = vader_analyzer.polarity_scores(sentence)
    # Strict '>' keeps the earliest sentence when several share the top score.
    if scores['neg'] > neg:
        neg = scores['neg']
        bneg = sentence
    if scores['pos'] > pos:
        pos = scores['pos']
        bpos = sentence
    if scores['neu'] > neu:
        neu = scores['neu']
        bneu = sentence

print("neg:", neg, bneg)
print("neu:", neu, bneu)
print("pos:", pos, bpos)

        

### Word Embeddings - Word2Vec

In [18]:
# Install gensim, which provides the Word2Vec implementation.
# NOTE(review): unpinned install — consider pinning the version so the notebook
# keeps running against the API it was written for.
! pip install gensim



In [43]:
import gensim 
from gensim.models import Word2Vec 

# Training input: the text as a list of sentences, each a list of tokens.
sentences = nltk.sent_tokenize(text)
tokens = [nltk.word_tokenize(sent) for sent in sentences]

# Alternative corpus (Brown sentences); loaded but not used by the models below.
from nltk.corpus import brown
data = brown.sents()

# gensim 4.0 renamed Word2Vec's `size` keyword to `vector_size`. Pick the name the
# installed release understands so the cell runs on both old and new versions.
_dim_kw = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
w2v_kwargs = {"min_count": 5, _dim_kw: 100, "window": 5, "workers": 10}

# Create CBOW model (Word2Vec's default when `sg` is not set)
cbow_model = gensim.models.Word2Vec(tokens, **w2v_kwargs)

# Create Skip Gram model (sg=1)
sg_model = gensim.models.Word2Vec(tokens, sg=1, **w2v_kwargs)

In [49]:
# Five words the CBOW model considers most similar to "love", with their scores.
# NOTE(review): recorded scores are all ~0.999 for loosely related words, which
# presumably indicates an under-trained model on this small corpus — verify.
cbow_model.wv.most_similar("love", topn=5)

[('often', 0.9995832443237305),
 ('us', 0.9995463490486145),
 ('consider', 0.9995065331459045),
 ('married', 0.9993481040000916),
 ('allow', 0.9992833137512207)]

In [50]:
# Five words the skip-gram model considers most similar to "dead", with their scores.
sg_model.wv.most_similar("dead", topn=5)

[('sick', 0.9970794916152954),
 ('grave', 0.9968978762626648),
 ('sing', 0.9963023662567139),
 ('separate', 0.9946496486663818),
 ('uncivil', 0.9943731427192688)]

### Wordcloud

In [22]:
# Install the plotting dependencies for the word cloud section.
# NOTE(review): per the install log, `Image` resolves to the PyPI package
# `image` (image-1.5.27); presumably Pillow was intended — confirm whether it is needed.
# NOTE(review): unpinned install via `! pip`; consider `%pip` with pinned versions.
! pip install wordcloud Image matplotlib

Collecting Image
  Using cached https://files.pythonhosted.org/packages/0c/ec/51969468a8b87f631cc0e60a6bf1e5f6eec8ef3fd2ee45dc760d5a93b82a/image-1.5.27-py2.py3-none-any.whl
Collecting matplotlib
  Using cached https://files.pythonhosted.org/packages/83/2a/e47bbd9396af32376863a426baed62d9bf3091f81defd1fe81c5f33b11a3/matplotlib-3.0.3-cp37-cp37m-manylinux1_x86_64.whl
Collecting kiwisolver>=1.0.1 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/5c/7e/d6cae2f241ba474a2665f24b480bf4e247036d63939dda2bbc4d2ee5069d/kiwisolver-1.0.1-cp37-cp37m-manylinux1_x86_64.whl
Collecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/de/0a/001be530836743d8be6c2d85069f46fecf84ac6c18c7f5fb8125ee11d854/pyparsing-2.3.1-py2.py3-none-any.whl
Collecting cycler>=0.10 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/f7/d2/e07d3ebb2bd7af696440ce7e754c59dd546ffe1bbe732c8ab68b9c834e61/cycler-0.10.0-py2.py3

In [None]:
%matplotlib inline
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build a word cloud from the raw text using WordCloud's default settings.
wordcloud = WordCloud().generate(text)

# Draw the cloud as an image on a single axes, with the axis decorations hidden.
fig, ax = plt.subplots()
ax.imshow(wordcloud)
ax.axis("off")
plt.show()

In [25]:
# Read the whole input file into memory as one string.
# The encoding is stated explicitly because the text contains non-ASCII characters
# (curly quotes such as “ ”); relying on the platform default can fail or
# mis-decode on systems whose default is not UTF-8.
# NOTE(review): assumes text.txt is UTF-8 encoded — confirm against the data file.
with open('text.txt', 'r', encoding='utf-8') as myfile:
    book = myfile.read()

In [38]:
from collections import Counter
tokens = nltk.word_tokenize(book)
def tokenCounts(tokens):
    """Count token occurrences and rank them by frequency.

    Args:
        tokens: an iterable of hashable tokens (e.g. a flat list of strings).

    Returns:
        A list of ``(token, count)`` tuples ordered from the most to the
        least frequent token. Tokens with equal counts keep the order in
        which the Counter yields them.
    """
    return Counter(tokens).most_common()

tokenCounts(tokens)[:10]

[(',', 9132),
 ('to', 4086),
 ('.', 4085),
 ('the', 4058),
 ('of', 3596),
 ('and', 3423),
 ('her', 2107),
 ('I', 2065),
 ('a', 1897),
 ('was', 1837)]

In [40]:
from nltk.corpus import stopwords
from string import punctuation
# Extend the ASCII punctuation string with curly quotes and a double dash.
punctuation += '“”--'
stops = stopwords.words('english')
# Set copy of the stopword list: membership tests are O(1) instead of a linear
# scan of the list for every token.
stop_set = set(stops)
tokens = nltk.word_tokenize(book)

# Drop punctuation tokens and stopwords in a single pass.
# `punctuation` is a str, so its `in` test is a substring test.
# NOTE: the stopword comparison is case-sensitive, so capitalised tokens such as
# 'I' or 'She' are not removed.
filtered_tokens = [
    token
    for token in tokens
    if token not in punctuation and token not in stop_set
]
tokenCounts(filtered_tokens)[:10]

[('I', 2065),
 ('Mr.', 768),
 ('Elizabeth', 631),
 ("'s", 574),
 ('could', 513),
 ('would', 465),
 ('said', 401),
 ('Darcy', 401),
 ('Mrs.', 337),
 ('She', 324)]

In [45]:
# Sentence-split the book, then word-tokenize each sentence separately, which
# yields a list of token lists (one inner list per sentence).
sentences = nltk.sent_tokenize(book)
tokens = list(map(nltk.word_tokenize, sentences))

print(tokens[100:105])



[['“', 'Oh', '!'], ['She', 'is', 'the', 'most', 'beautiful', 'creature', 'I', 'ever', 'beheld', '!'], ['But', 'there', 'is', 'one', 'of', 'her', 'sisters', 'sitting', 'down', 'just', 'behind', 'you', ',', 'who', 'is', 'very', 'pretty', ',', 'and', 'I', 'dare', 'say', 'very', 'agreeable', '.'], ['Do', 'let', 'me', 'ask', 'my', 'partner', 'to', 'introduce', 'you.', '”', '“', 'Which', 'do', 'you', 'mean', '?', '”', 'and', 'turning', 'round', 'he', 'looked', 'for', 'a', 'moment', 'at', 'Elizabeth', ',', 'till', 'catching', 'her', 'eye', ',', 'he', 'withdrew', 'his', 'own', 'and', 'coldly', 'said', ':', '“', 'She', 'is', 'tolerable', ',', 'but', 'not', 'handsome', 'enough', 'to', 'tempt', '_me_', ';', 'I', 'am', 'in', 'no', 'humour', 'at', 'present', 'to', 'give', 'consequence', 'to', 'young', 'ladies', 'who', 'are', 'slighted', 'by', 'other', 'men', '.'], ['You', 'had', 'better', 'return', 'to', 'your', 'partner', 'and', 'enjoy', 'her', 'smiles', ',', 'for', 'you', 'are', 'wasting', 'your'

TypeError: unhashable type: 'list'

In [44]:
# Pipeline: sentence split -> word tokenize -> POS tag, one sentence at a time.
# `tagged` ends up as a list (per sentence) of lists of (word, tag) pairs.
sentences = nltk.sent_tokenize(book)
tokens = list(map(nltk.word_tokenize, sentences))
tagged = list(map(nltk.pos_tag, tokens))

print(tagged[100:105])

[[('“', 'JJ'), ('Oh', 'UH'), ('!', '.')], [('She', 'PRP'), ('is', 'VBZ'), ('the', 'DT'), ('most', 'RBS'), ('beautiful', 'JJ'), ('creature', 'NN'), ('I', 'PRP'), ('ever', 'RB'), ('beheld', 'VBD'), ('!', '.')], [('But', 'CC'), ('there', 'EX'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('her', 'PRP$'), ('sisters', 'NNS'), ('sitting', 'VBG'), ('down', 'RP'), ('just', 'RB'), ('behind', 'IN'), ('you', 'PRP'), (',', ','), ('who', 'WP'), ('is', 'VBZ'), ('very', 'RB'), ('pretty', 'RB'), (',', ','), ('and', 'CC'), ('I', 'PRP'), ('dare', 'VBP'), ('say', 'VB'), ('very', 'RB'), ('agreeable', 'JJ'), ('.', '.')], [('Do', 'VB'), ('let', 'VB'), ('me', 'PRP'), ('ask', 'VB'), ('my', 'PRP$'), ('partner', 'NN'), ('to', 'TO'), ('introduce', 'VB'), ('you.', 'JJ'), ('”', 'NNP'), ('“', 'NNP'), ('Which', 'NNP'), ('do', 'VBP'), ('you', 'PRP'), ('mean', 'VB'), ('?', '.'), ('”', 'NN'), ('and', 'CC'), ('turning', 'VBG'), ('round', 'NN'), ('he', 'PRP'), ('looked', 'VBD'), ('for', 'IN'), ('a', 'DT'), ('moment', 'NN

In [None]:
# Print the tagset documentation for the tags matching 'JJ*'.
nltk.help.upenn_tagset('JJ*')

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# NOTE(review): this cell tokenizes `text` while the neighbouring cells use `book`;
# both variables are read from 'text.txt'.
tokens = nltk.word_tokenize(text)

# Map each distinct token to its Porter stem and print the full mapping.
stems = dict((token, stemmer.stem(token)) for token in tokens)
print(stems)

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# NOTE(review): this cell tokenizes `text` while the neighbouring cells use `book`;
# both variables are read from 'text.txt'.
tokens = nltk.word_tokenize(text)

# Map each distinct token to the lemma returned by the lemmatizer (called without
# a part-of-speech argument) and print the full mapping.
lemmas = dict((token, lemmatizer.lemmatize(token)) for token in tokens)
print(lemmas)