In [1]:
import nltk

In [2]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

In [3]:
sentence = "The Quick brown fox, Jumps over the lazy little dog. Hello World."

In [4]:
sentence.split()

['The',
 'Quick',
 'brown',
 'fox,',
 'Jumps',
 'over',
 'the',
 'lazy',
 'little',
 'dog.',
 'Hello',
 'World.']

In [5]:
import string
from string import punctuation

In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
sentence = [char for char in sentence if char not in string.punctuation]

In [8]:
sentence = ''.join(sentence)

In [9]:
sentence

'The Quick brown fox Jumps over the lazy little dog Hello World'

In [10]:
words = word_tokenize(sentence)

words

['The',
 'Quick',
 'brown',
 'fox',
 'Jumps',
 'over',
 'the',
 'lazy',
 'little',
 'dog',
 'Hello',
 'World']

In [11]:
nltk.pos_tag(words)

[('The', 'DT'),
 ('Quick', 'NNP'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('Jumps', 'NNP'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('little', 'JJ'),
 ('dog', 'NN'),
 ('Hello', 'NNP'),
 ('World', 'NNP')]

In [12]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [13]:
syn = wordnet.synsets("speak")[0]

In [14]:
syn.hypernyms()

[Synset('communicate.v.02')]

In [15]:
syn.hyponyms()

[Synset('babble.v.01'),
 Synset('bark.v.01'),
 Synset('bay.v.01'),
 Synset('begin.v.04'),
 Synset('blubber.v.02'),
 Synset('blurt_out.v.01'),
 Synset('bumble.v.03'),
 Synset('cackle.v.01'),
 Synset('chatter.v.04'),
 Synset('chatter.v.05'),
 Synset('deliver.v.01'),
 Synset('drone.v.02'),
 Synset('enthuse.v.02'),
 Synset('generalize.v.02'),
 Synset('gulp.v.02'),
 Synset('hiss.v.03'),
 Synset('lip_off.v.01'),
 Synset('mumble.v.01'),
 Synset('murmur.v.01'),
 Synset('open_up.v.07'),
 Synset('peep.v.04'),
 Synset('rant.v.01'),
 Synset('rasp.v.02'),
 Synset('read.v.03'),
 Synset('shout.v.01'),
 Synset('sing.v.02'),
 Synset('slur.v.03'),
 Synset('snap.v.01'),
 Synset('snivel.v.01'),
 Synset('speak_in_tongues.v.01'),
 Synset('speak_up.v.02'),
 Synset('swallow.v.04'),
 Synset('talk_of.v.01'),
 Synset('tone.v.01'),
 Synset('tone.v.02'),
 Synset('troll.v.07'),
 Synset('verbalize.v.01'),
 Synset('vocalize.v.05'),
 Synset('whiff.v.05'),
 Synset('whisper.v.01'),
 Synset('yack.v.01')]

In [16]:
words

['The',
 'Quick',
 'brown',
 'fox',
 'Jumps',
 'over',
 'the',
 'lazy',
 'little',
 'dog',
 'Hello',
 'World']

In [17]:
clean_words = [w.lower() for w in words if w.lower() not in stopwords.words('english')]

In [18]:
clean_words

['quick', 'brown', 'fox', 'jumps', 'lazy', 'little', 'dog', 'hello', 'world']

In [19]:
movie_reviews.categories()

['neg', 'pos']

In [20]:
all_words = movie_reviews.words()

In [21]:
freq = nltk.FreqDist(all_words)

In [22]:
freq

FreqDist({'plot': 1513,
          ':': 3042,
          'two': 1911,
          'teen': 151,
          'couples': 27,
          'go': 1113,
          'to': 31937,
          'a': 38106,
          'church': 69,
          'party': 183,
          ',': 77717,
          'drink': 32,
          'and': 35576,
          'then': 1424,
          'drive': 105,
          '.': 65876,
          'they': 4825,
          'get': 1949,
          'into': 2623,
          'an': 5744,
          'accident': 104,
          'one': 5852,
          'of': 34123,
          'the': 76529,
          'guys': 268,
          'dies': 104,
          'but': 8634,
          'his': 9587,
          'girlfriend': 218,
          'continues': 88,
          'see': 1749,
          'him': 2633,
          'in': 21822,
          'her': 4522,
          'life': 1586,
          'has': 4719,
          'nightmares': 26,
          'what': 3322,
          "'": 30585,
          's': 18513,
          'deal': 219,
          '?': 3771,
          'wa

In [23]:
freq.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

In [24]:
def create_word_features(w):
    useful_words = [w.lower() for w in words if w.lower() not in stopwords.words('english')]
    my_dict = dict([(word, True) for word in useful_words])
    
    return my_dict


In [25]:
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    print(words)
    #neg_reviews.append((create_word_features(words), "negative"))


['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...]
['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...]
['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...]
['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...]
['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]
['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...]
['that', "'", 's', 'exactly', 'how', 'long', 'the', ...]
['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...]
['plot', ':', 'a', 'young', 'french', 'boy', 'sees', ...]
['best', 'remembered', 'for', 'his', 'understated', ...]
['janeane', 'garofalo', 'in', 'a', 'romantic', ...]
['and', 'now', 'the', 'high', '-', 'flying', 'hong', ...]
['a', 'movie', 'like', 'mortal', 'kombat', ':', ...]
['she', 'was', 'the', 'femme', 'in', '"', 'la', ...]
['john', 'carpenter', 'makes', 'b', '-', 'movies', '.', ...]
['i', "'", 'm', 'really', 'starting', 'to', 'wonder', ...]
['so', 'what', 'do',

['robin', 'williams', 'is', 'a', 'comedic', 'genus', ...]
['ever', 'watch', 'a', 'very', 'young', 'child', 'try', ...]
['what', 'are', 'we', 'going', 'to', 'do', 'with', ...]
['there', 'are', 'some', 'pretty', 'impressive', ...]
['so', ',', 'it', "'", 's', 'thirty', 'years', 'later', ...]
['except', 'for', 'a', 'few', 'bright', 'moments', ...]
['maybe', 'this', 'mission', 'should', 'have', 'been', ...]
['mulholland', 'drive', 'did', 'very', 'well', 'at', ...]
['ingredients', ':', 'possessed', 'plastic', 'dolls', ...]
['watching', 'the', 'movie', ',', 'i', 'vowed', 'to', ...]
['i', 'was', 'going', 'to', 'see', 'ram', 'shrasta', ...]
['starship', 'troopers', 'is', 'a', 'bad', 'movie', ...]
['capsule', ':', 'one', 'of', 'the', 'ten', 'worst', ...]
['hello', 'kids', '.', 'today', 'the', 'movie', ...]
['deserves', 'recognition', 'for', ':', 'achieving', ...]
['starring', 'ben', 'stiller', ',', 'elizabeth', ...]
['the', 'camera', 'zooms', 'in', 'incredibly', 'close', ...]
['"', 'goodbye', ',

['capsule', ':', 'a', 'wild', 'jungle', 'woman', 'and', ...]
['`', 'the', 'bachelor', "'", 'is', 'one', 'of', 'the', ...]
['topless', 'women', 'talk', 'about', 'their', 'lives', ...]
['beware', 'of', 'movies', 'with', 'the', 'director', ...]
['august', 'and', 'september', 'are', 'a', 'wasteland', ...]
['the', 'event', 'horizon', 'is', 'the', 'boundary', ...]
['there', 'is', 'a', 'rule', 'when', 'it', 'comes', ...]
['as', 'a', 'hot', '-', 'shot', 'defense', 'attorney', ...]
['i', "'", 'm', 'not', 'sure', 'i', 'should', 'be', ...]
['the', 'most', 'interesting', 'thing', 'about', ...]
['an', 'american', 'werewolf', 'in', 'paris', 'is', ...]
['depending', 'on', 'who', 'you', 'ask', ',', 'the', ...]
['wizards', 'is', 'an', 'animated', 'feature', 'that', ...]
['inspired', 'by', 'the', '1958', 'film', 'house', ...]
['capsule', ':', 'liebes', 'meets', 'tod', '.', 'this', ...]
['you', 'know', 'that', 'a', 'movie', 'has', 'issues', ...]
['adam', 'sandler', 'vehicles', 'are', 'never', ...]
['dese

['the', 'general', "'", 's', 'daughter', 'will', ...]
['cradle', 'will', 'rock', 'is', 'the', 'latest', ...]
['disillusioned', 'and', 'trying', 'to', 'find', 'the', ...]
['kate', '(', 'jennifer', 'aniston', ')', 'is', ...]
['this', 'review', 'contains', 'spoilers', ',', 'but', ...]
['"', 'have', 'you', 'ever', 'heard', 'the', 'one', ...]
['"', 'we', 'are', 'grateful', 'that', 'we', 'have', ...]
['what', 'happens', 'when', 'you', 'put', 'martin', ...]
['blatantly', 'borrowing', 'elements', 'from', '1993', ...]
['i', 'won', '\x12', 't', 'even', 'pretend', 'that', ...]
['synopsis', ':', 'lifelong', 'friends', 'rafe', '(', ...]
['this', 'is', 'a', 'film', 'that', 'i', 'was', ...]
['i', 'love', 'movies', '.', 'i', 'really', 'do', '.', ...]
['i', 'looked', 'at', 'the', '"', 'internet', 'movie', ...]
['the', 'formula', 'is', 'simple', '.', 'trap', 'a', ...]
['the', 'thirteenth', 'floor', 'is', 'a', 'bland', ',', ...]
['my', 'giant', 'begins', 'with', 'a', 'monologue', ...]
['alexandre', 'duma

['when', 'considering', 'david', 'fincher', "'", 's', ...]
['weighed', 'down', 'by', 'tired', 'plot', 'lines', ...]
['if', 'you', "'", 're', 'going', 'to', 'make', 'a', ...]
['it', 'used', 'to', 'be', 'that', 'not', 'just', ...]
['hav', 'plenty', ',', 'as', 'we', 'are', 'told', 'in', ...]
['it', 'seems', 'that', 'i', "'", 've', 'stopped', ...]
['note', ':', 'some', 'may', 'consider', 'portions', ...]
['everything', 'about', 'this', 'ninth', 'trek', ...]
['this', 'is', 'one', 'of', 'the', 'worst', 'big', '-', ...]
['phew', ',', 'what', 'a', 'mess', '!', 'for', 'his', ...]
['in', 'french', ',', 'the', 'phrase', '"', 'film', ...]
['plot', ':', 'lara', 'croft', 'is', 'british', ',', ...]
['it', 'happens', 'every', 'year', '--', 'the', 'days', ...]
['i', 'heard', 'actor', 'skeet', 'ulrich', ...]
['it', 'is', 'with', 'some', 'sad', 'irony', 'that', ...]
['sometimes', 'a', 'stellar', 'cast', 'can', ...]
['the', 'most', 'depressing', 'thing', 'about', 'the', ...]
['when', 'walt', 'disney', 'pi

In [26]:
neg_reviews

[]