In [41]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter 
#from gensim.models.tfidfmodel import TfidfModel
#from gensim.corpora.dictionary import Dictionary

from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

**Read the Text File**

In [61]:
f = open('data.txt', 'r')
text = f.read()

In [62]:
text[:1000]

"The Project Gutenberg EBook of Man to Man, by Jackson Gregory\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: Man to Man\n\nAuthor: Jackson Gregory\n\nRelease Date: July 29, 2006 [EBook #18933]\n\nLanguage: English\n\n\n*** START OF THIS PROJECT GUTENBERG EBOOK MAN TO MAN ***\n\n\n\n\nProduced by Al Haines\n\n\n\n\n\n\n\n\n\n\n[Frontispiece: The blazing heat was such that men and horses and steers\nsuffered terribly.]\n\n\n\n\n\n\nMAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nAUTHOR OF\n\nJUDITH OF BLUE LAKE RANCH, THE BELLS OF SAN JUAN, SIX FEET FOUR, ETC.\n\n\n\n\nILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGROSSET & DUNLAP\n\nPUBLISHERS -------- NEW YORK\n\n\n\n\nCOPYRIGHT, 1920, BY\n\nCHARLES SCRIBNER'S SONS\n\n\nPublished October, 1920\n\n\n\n\nCONTENTS\n\n\nCHAPTER

**Sentences Tokenization (Additional information, not will be used, you can skip and start from word tokens )**

In [87]:
row_split=text.split("\n")

In [88]:
row_split[:5]

['The Project Gutenberg EBook of Man to Man, by Jackson Gregory',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included']

In [85]:
sent_tokens=sent_tokenize(text)

In [86]:
sent_tokens[:3]

['The Project Gutenberg EBook of Man to Man, by Jackson Gregory\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.',
 'You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: Man to Man\n\nAuthor: Jackson Gregory\n\nRelease Date: July 29, 2006 [EBook #18933]\n\nLanguage: English\n\n\n*** START OF THIS PROJECT GUTENBERG EBOOK MAN TO MAN ***\n\n\n\n\nProduced by Al Haines\n\n\n\n\n\n\n\n\n\n\n[Frontispiece: The blazing heat was such that men and horses and steers\nsuffered terribly.]',
 'MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nAUTHOR OF\n\nJUDITH OF BLUE LAKE RANCH, THE BELLS OF SAN JUAN, SIX FEET FOUR, ETC.']

**Word Tokenization of words and lowercase**

In [63]:
tokens=word_tokenize(text.lower())

In [64]:
tokens[:10]

['the', 'project', 'gutenberg', 'ebook', 'of', 'man', 'to', 'man', ',', 'by']

In [65]:
len(tokens)

97042

In [66]:
type(tokens)

list

**Remove Punctuation and numbers**

In [48]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\H
[nltk_data]     P\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [67]:
tokens_1=[w for w in tokens if w.isalpha()]

In [68]:
tokens_1[:10]

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'man',
 'to',
 'man',
 'by',
 'jackson']

In [69]:
len(tokens_1)

77105

**Remove Stopwords**

In [53]:
stop_words = stopwords.words('english')

In [70]:
tokens_2=[t for t in tokens_1 if t not in stop_words]

In [71]:
tokens_2[:20]

['project',
 'gutenberg',
 'ebook',
 'man',
 'man',
 'jackson',
 'gregory',
 'ebook',
 'use',
 'anyone',
 'anywhere',
 'cost',
 'almost',
 'restrictions',
 'whatsoever',
 'may',
 'copy',
 'give',
 'away',
 'terms']

In [72]:
len(tokens_2)

38117

**lemmatization**

In [73]:
lem=WordNetLemmatizer()
tokens_3=[lem.lemmatize(t) for t in tokens_2]

In [74]:
tokens_3[:20]

['project',
 'gutenberg',
 'ebook',
 'man',
 'man',
 'jackson',
 'gregory',
 'ebook',
 'use',
 'anyone',
 'anywhere',
 'cost',
 'almost',
 'restriction',
 'whatsoever',
 'may',
 'copy',
 'give',
 'away',
 'term']

**Most_common**

In [75]:
corpus=tokens_3.copy()

In [84]:
print(Counter(corpus).most_common(5))

[('steve', 542), ('packard', 541), ('blenham', 524), ('man', 445), ('terry', 409)]


**PoST (Part of Speech Tagging)**

In [94]:
tagged=nltk.pos_tag(corpus)

In [95]:
tagged[:20]

[('project', 'NN'),
 ('gutenberg', 'NN'),
 ('ebook', 'NN'),
 ('man', 'NN'),
 ('man', 'NN'),
 ('jackson', 'NN'),
 ('gregory', 'NN'),
 ('ebook', 'NN'),
 ('use', 'NN'),
 ('anyone', 'NN'),
 ('anywhere', 'RB'),
 ('cost', 'VBZ'),
 ('almost', 'RB'),
 ('restriction', 'NN'),
 ('whatsoever', 'NN'),
 ('may', 'MD'),
 ('copy', 'VB'),
 ('give', 'VB'),
 ('away', 'RP'),
 ('term', 'NN')]

In [None]:
"""
CC     coordinating conjunction
CD     cardinal digit
DT     determiner
EX     existential there (like: "there is" ... think of it like "there exists")
FW     foreign word
IN     preposition/subordinating conjunction
JJ     adjective 'big'
JJR    adjective, comparative 'bigger'
JJS    adjective, superlative 'biggest'
LS     list marker 1)
MD     modal could, will
NN     noun, singular 'desk'
NNS    noun plural 'desks'
NNP    proper noun, singular 'Harrison'
NNPS   proper noun, plural 'Americans'
PDT    predeterminer 'all the kids'
POS    possessive ending parent's
PRP    personal pronoun I, he, she
PRP$   possessive pronoun my, his, hers
RB     adverb very, silently,
RBR    adverb, comparative better
RBS    adverb, superlative best
RP     particle give up
TO     to go 'to' the store.
UH     interjection errrrrrrrm
VB     verb, base form take
VBD    verb, past tense took
VBG    verb, gerund/present participle taking
VBN    verb, past participle taken
VBP    verb, sing. present, non-3d take
VBZ    verb, 3rd person sing. present takes
WDT    wh-determiner which
WP     wh-pronoun who, what
WP$    possessive wh-pronoun whose
WRB    wh-abverb where, when
"""

**Named Entity Regocnition (Drawing a tree)**

In [109]:
named_ent=nltk.ne_chunk(tagged)

In [110]:
print(named_ent[:20])

[('project', 'NN'), ('gutenberg', 'NN'), ('ebook', 'NN'), ('man', 'NN'), ('man', 'NN'), ('jackson', 'NN'), ('gregory', 'NN'), ('ebook', 'NN'), ('use', 'NN'), ('anyone', 'NN'), ('anywhere', 'RB'), ('cost', 'VBZ'), ('almost', 'RB'), ('restriction', 'NN'), ('whatsoever', 'NN'), ('may', 'MD'), ('copy', 'VB'), ('give', 'VB'), ('away', 'RP'), ('term', 'NN')]


In [None]:
#named_ent.draw()