In [1]:
import nltk
from nltk.corpus import movie_reviews

In [3]:
# Build one (tokens, label) pair per review file, walking the corpus
# category by category.
documents = [
    (list(movie_reviews.words(fileid)), label)
    for label in movie_reviews.categories()
    for fileid in movie_reviews.fileids(label)
]

In [4]:
import random
# Shuffle with a dedicated, seeded generator so the train/test split (and every
# accuracy figure derived from it) is reproducible after Restart & Run All.
random.Random(42).shuffle(documents)

In [5]:
# Sanity check: how many (tokens, category) pairs were collected.
len(documents)

2000

In [6]:
# Count case-folded tokens over the whole corpus; a generator avoids building
# a throwaway list of every token first.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Keep the 1000 most frequent tokens as the vocabulary. most_common() states the
# frequency ordering explicitly instead of relying on FreqDist iteration order.
word_features = [word for word, _ in all_words.most_common(1000)]

In [10]:
def document_features(document, vocabulary=None):
    """Return bag-of-words presence features for one tokenised document.

    Args:
        document: iterable of word tokens.
        vocabulary: iterable of words to test for. When omitted, the
            module-level ``word_features`` list is used, so existing
            one-argument calls behave as before.

    Returns:
        dict mapping ``'contains(<word>)'`` to a bool that is True when the
        word occurs in ``document``; keys follow the vocabulary's order.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each membership test independent of document length.
    document_words = set(document)
    return {f'contains({w})': w in document_words for w in vocabulary}

In [11]:
# Turn every labelled document into a (feature dict, label) pair.
features = [(document_features(tokens), label) for tokens, label in documents]

In [12]:
# Inspect the first (feature dict, category) pair.
features[0]

({'contains(,)': True,
  'contains(the)': True,
  'contains(.)': True,
  'contains(a)': True,
  'contains(and)': True,
  'contains(of)': True,
  'contains(to)': True,
  "contains(')": False,
  'contains(is)': True,
  'contains(in)': True,
  'contains(s)': False,
  'contains(")': True,
  'contains(it)': True,
  'contains(that)': True,
  'contains(-)': True,
  'contains())': True,
  'contains(()': True,
  'contains(as)': True,
  'contains(with)': True,
  'contains(for)': True,
  'contains(his)': True,
  'contains(this)': True,
  'contains(film)': True,
  'contains(i)': True,
  'contains(he)': True,
  'contains(but)': True,
  'contains(on)': True,
  'contains(are)': True,
  'contains(t)': False,
  'contains(by)': True,
  'contains(be)': True,
  'contains(one)': True,
  'contains(movie)': True,
  'contains(an)': True,
  'contains(who)': True,
  'contains(not)': True,
  'contains(you)': True,
  'contains(from)': True,
  'contains(at)': True,
  'contains(was)': True,
  'contains(have)': True

In [13]:
# Hold out the first 100 examples for evaluation and train on the rest.
test_set = features[:100]
train_set = features[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Fraction of held-out examples labelled correctly.
nltk.classify.accuracy(classifier, test_set)

0.83

In [15]:
# Print the 10 features the classifier ranks as most informative.
classifier.show_most_informative_features(10)

Most Informative Features
         contains(worst) = True              neg : pos    =      4.4 : 1.0
          contains(mess) = True              neg : pos    =      4.1 : 1.0
        contains(stupid) = True              neg : pos    =      3.8 : 1.0
     contains(memorable) = True              pos : neg    =      3.6 : 1.0
        contains(boring) = True              neg : pos    =      3.5 : 1.0
     contains(excellent) = True              pos : neg    =      3.2 : 1.0
     contains(perfectly) = True              pos : neg    =      3.0 : 1.0
         contains(fails) = True              neg : pos    =      2.8 : 1.0
     contains(wonderful) = True              pos : neg    =      2.8 : 1.0
     contains(effective) = True              pos : neg    =      2.6 : 1.0


In [35]:
import re
# most_informative_features() yields (feature name, feature value) pairs; strip
# the 'contains(...)' wrapper to recover the bare word. The pattern is a raw
# string so '\(' reaches the regex engine without an invalid-escape warning.
most_informative_features = [
    re.findall(r'contains\((.*)\)', name)[0]
    for name, _value in classifier.most_informative_features(10)
]

In [36]:
# Show the words extracted from the feature names.
most_informative_features

['worst',
 'mess',
 'stupid',
 'memorable',
 'boring',
 'excellent',
 'perfectly',
 'fails',
 'wonderful',
 'effective']

In [41]:
def document_features(document, vocabulary=None):
    """Return presence features restricted to a small word list.

    Args:
        document: iterable of word tokens.
        vocabulary: iterable of words to test for. When omitted, the
            module-level ``most_informative_features`` list is used, so
            existing one-argument calls behave as before.

    Returns:
        dict mapping ``'contains(<word>)'`` to a bool that is True when the
        word occurs in ``document``; keys follow the vocabulary's order.
    """
    if vocabulary is None:
        vocabulary = most_informative_features
    # A set makes each membership test independent of document length.
    document_words = set(document)
    return {f'contains({w})': w in document_words for w in vocabulary}

In [42]:
# Rebuild the (feature dict, label) pairs with the current document_features.
features = [(document_features(tokens), label) for tokens, label in documents]

In [43]:
# Same split as before: first 100 examples held out, the rest used for training.
test_set = features[:100]
train_set = features[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Fraction of held-out examples labelled correctly.
nltk.classify.accuracy(classifier, test_set)

0.65

In [44]:
# Print the 10 features the retrained classifier ranks as most informative.
classifier.show_most_informative_features(10)

Most Informative Features
         contains(worst) = True              neg : pos    =      4.4 : 1.0
          contains(mess) = True              neg : pos    =      4.1 : 1.0
        contains(stupid) = True              neg : pos    =      3.8 : 1.0
     contains(memorable) = True              pos : neg    =      3.6 : 1.0
        contains(boring) = True              neg : pos    =      3.5 : 1.0
     contains(excellent) = True              pos : neg    =      3.2 : 1.0
     contains(perfectly) = True              pos : neg    =      3.0 : 1.0
         contains(fails) = True              neg : pos    =      2.8 : 1.0
     contains(wonderful) = True              pos : neg    =      2.8 : 1.0
     contains(effective) = True              pos : neg    =      2.6 : 1.0


In [59]:
def document_features(document):
    """Return character-class ratio features for one tokenised document.

    The tokens are concatenated without separators, and each ratio is the
    share of characters in that string belonging to one character class.

    Args:
        document: iterable of string tokens.

    Returns:
        dict with float values for 'punc_ratio' (regex non-word characters),
        'alphabet_ratio' (ASCII letters) and 'digit_ratio' (regex digits).
        All three are 0.0 when the document contains no characters.
    """
    # NOTE(review): the classifier output recorded in this notebook lists one
    # entry per exact float value, so these raw ratios appear to be treated as
    # discrete labels; consider binning them before training -- TODO confirm.
    text = ''.join(document)
    total = len(text)
    if total == 0:
        # Nothing to measure; avoids ZeroDivisionError on an empty document.
        return {'punc_ratio': 0.0, 'alphabet_ratio': 0.0, 'digit_ratio': 0.0}
    # Raw-string patterns: '\W' and '\d' in a plain literal are invalid string
    # escapes and trigger a warning on current Python versions.
    return {
        'punc_ratio': len(re.findall(r'\W', text)) / total,
        'alphabet_ratio': len(re.findall(r'[a-zA-Z]', text)) / total,
        'digit_ratio': len(re.findall(r'\d', text)) / total,
    }

In [60]:
# Rebuild the (feature dict, label) pairs with the ratio-based document_features.
features = [(document_features(tokens), label) for tokens, label in documents]

In [61]:
# Same split again: first 100 examples held out, the rest used for training.
test_set = features[:100]
train_set = features[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Fraction of held-out examples labelled correctly.
nltk.classify.accuracy(classifier, test_set)

0.54

In [62]:
# Print the 10 (feature, value) entries the classifier ranks as most informative.
classifier.show_most_informative_features(10)

Most Informative Features
             digit_ratio = 0.0032502708559046588    neg : pos    =      1.7 : 1.0
              punc_ratio = 0.03959731543624161    neg : pos    =      1.7 : 1.0
          alphabet_ratio = 0.9523809523809523    neg : pos    =      1.7 : 1.0
          alphabet_ratio = 0.9583333333333334    neg : pos    =      1.7 : 1.0
             digit_ratio = 0.0026666666666666666    pos : neg    =      1.7 : 1.0
             digit_ratio = 0.0               pos : neg    =      1.1 : 1.0
             digit_ratio = 0.0005936479667557139    neg : pos    =      1.0 : 1.0
             digit_ratio = 0.0008269018743109151    neg : pos    =      1.0 : 1.0
             digit_ratio = 0.000864304235090752    neg : pos    =      1.0 : 1.0
             digit_ratio = 0.0010224948875255625    neg : pos    =      1.0 : 1.0


In [49]:
# Look at the token list of the first document.
documents[0][0]

['teenagers',
 'have',
 'a',
 'lot',
 'of',
 'power',
 'in',
 'hollywood',
 '.',
 'every',
 'year',
 'countless',
 'films',
 'will',
 'be',
 'made',
 'targeting',
 'that',
 'audience',
 'in',
 'particular',
 ',',
 'and',
 'rely',
 'on',
 'the',
 'entire',
 'teenage',
 'population',
 'to',
 'turn',
 'out',
 'on',
 'friday',
 'and',
 'saturday',
 'nights',
 ',',
 'wallets',
 'in',
 'hand',
 '.',
 'the',
 'formula',
 'is',
 'very',
 'simple',
 ',',
 'you',
 'make',
 'a',
 'film',
 'with',
 'a',
 'big',
 'name',
 'young',
 'actor',
 'or',
 'actress',
 'with',
 'sex',
 'appeal',
 '.',
 'you',
 'add',
 'a',
 'high',
 'school',
 'environment',
 'that',
 'features',
 'everyone',
 'from',
 'prom',
 'queens',
 'to',
 'math',
 'club',
 'nerds',
 ',',
 'and',
 'then',
 'a',
 'very',
 'simple',
 'relationship',
 'conflict',
 'that',
 'can',
 'be',
 'worked',
 'out',
 'in',
 '90',
 'minutes',
 ',',
 'the',
 'typical',
 'teenage',
 'attention',
 'span',
 '.',
 'the',
 'response',
 'is',
 'enormous',


In [50]:
# Compute the features for the first document.
# NOTE(review): the recorded output has punc_ratio + alphabet_ratio > 1, which
# the ratio-based document_features in this notebook cannot produce (its three
# character classes are disjoint); the output looks stale -- re-run this cell.
document_features(documents[0][0])

{'punc_ratio': 0.23153049482163407,
 'alphabet_ratio': 0.9670886075949368,
 'digit_ratio': 0.004142692750287687}

In [52]:
# Concatenate the first document's tokens with no separator, the same way the
# ratio-based document_features builds its text.
p = ''.join(documents[0][0])

In [64]:
# List every digit character in the joined text. The pattern is a raw string
# because '\d' in a plain literal is an invalid string escape.
re.findall(r'\d', p)

['9',
 '0',
 '9',
 '2',
 '9',
 '2',
 '9',
 '2',
 '9',
 '2',
 '9',
 '2',
 '9',
 '2',
 '1',
 '9',
 '8',
 '3']