In [1]:
import nltk
import random
import pyprind
import pandas as pd
import os

basepath = './aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        # for example, './aclImdb/train/pos/'
        path = os.path.join(basepath, s, l)
        # open all files in that path
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            # append review text and sentiment to dataframe
            df = df.append([[txt, l]], ignore_index = True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:21


In [2]:
df.head(10)

Unnamed: 0,review,sentiment
0,"Based on an actual story, John Boorman shows t...",pos
1,This is a gem. As a Film Four production - the...,pos
2,"I really like this show. It has drama, romance...",pos
3,This is the best 3-D experience Disney has at ...,pos
4,"Of the Korean movies I've seen, only three had...",pos
5,this movie is funny funny funny my favorite qu...,pos
6,I'm just starting to explore the so far wonder...,pos
7,There is no need for me to repeat the synopsis...,pos
8,"I got this movie with my BBC ""Jane Austen Coll...",pos
9,"This was a great movie, I would compare it to ...",pos


In [3]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    import string
    # replacing the punctuation with no space, which in effect deletes the punctuation marks
    translator = str.maketrans('', '', string.punctuation)
    # return the text stripped of punctuation marks
    return text.translate(translator)

df['review'] = df['review'].apply(remove_punctuation)

df.head(10)

Unnamed: 0,review,sentiment
0,Based on an actual story John Boorman shows th...,pos
1,This is a gem As a Film Four production the a...,pos
2,I really like this show It has drama romance a...,pos
3,This is the best 3D experience Disney has at t...,pos
4,Of the Korean movies Ive seen only three had r...,pos
5,this movie is funny funny funny my favorite qu...,pos
6,Im just starting to explore the so far wonderf...,pos
7,There is no need for me to repeat the synopsis...,pos
8,I got this movie with my BBC Jane Austen Colle...,pos
9,This was a great movie I would compare it to t...,pos


In [4]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

def removestopwords(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in stop]
    # joining the list of words with space separator
    return " ".join(text)

df['review'] = df['review'].apply(removestopwords)
df.head(10)

Unnamed: 0,review,sentiment
0,based actual story john boorman shows struggle...,pos
1,gem film four production anticipated quality i...,pos
2,really like show drama romance comedy rolled o...,pos
3,best 3d experience disney themeparks certainly...,pos
4,korean movies ive seen three really stuck firs...,pos
5,movie funny funny funny favorite quote movie t...,pos
6,im starting explore far wonderful world monsie...,pos
7,need repeat synopsis rendered glenn black whit...,pos
8,got movie bbc jane austen collection 5 dvds ol...,pos
9,great movie would compare movie game get end f...,pos


In [5]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def lemmatizewords(text):
    #WordNet Lemmatization
    text = [WordNetLemmatizer().lemmatize(word) for word in text.split()]
    return " ".join(text)

df['review'] = df['review'].apply(lemmatizewords)
df.head(10)

[nltk_data] Downloading package wordnet to /Users/patshih/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Unnamed: 0,review,sentiment
0,based actual story john boorman show struggle ...,pos
1,gem film four production anticipated quality i...,pos
2,really like show drama romance comedy rolled o...,pos
3,best 3d experience disney themeparks certainly...,pos
4,korean movie ive seen three really stuck first...,pos
5,movie funny funny funny favorite quote movie t...,pos
6,im starting explore far wonderful world monsie...,pos
7,need repeat synopsis rendered glenn black whit...,pos
8,got movie bbc jane austen collection 5 dvd old...,pos
9,great movie would compare movie game get end f...,pos


In [6]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def tokenizeword(text):
    text = word_tokenize(text)
    return text

df['tokenized'] = df['review'].apply(tokenizeword)
df.head(10)

[nltk_data] Downloading package punkt to /Users/patshih/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,review,sentiment,tokenized
0,based actual story john boorman show struggle ...,pos,"[based, actual, story, john, boorman, show, st..."
1,gem film four production anticipated quality i...,pos,"[gem, film, four, production, anticipated, qua..."
2,really like show drama romance comedy rolled o...,pos,"[really, like, show, drama, romance, comedy, r..."
3,best 3d experience disney themeparks certainly...,pos,"[best, 3d, experience, disney, themeparks, cer..."
4,korean movie ive seen three really stuck first...,pos,"[korean, movie, ive, seen, three, really, stuc..."
5,movie funny funny funny favorite quote movie t...,pos,"[movie, funny, funny, funny, favorite, quote, ..."
6,im starting explore far wonderful world monsie...,pos,"[im, starting, explore, far, wonderful, world,..."
7,need repeat synopsis rendered glenn black whit...,pos,"[need, repeat, synopsis, rendered, glenn, blac..."
8,got movie bbc jane austen collection 5 dvd old...,pos,"[got, movie, bbc, jane, austen, collection, 5,..."
9,great movie would compare movie game get end f...,pos,"[great, movie, would, compare, movie, game, ge..."


In [7]:
nltk.download('averaged_perceptron_tagger')

# j is adject, r is adverb, and v is verb
# allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

def pos(text):
    # parts of speech tagging for each word 
    pos = nltk.pos_tag(text)
    
    adjective = []
    
    # make a list of  all adjectives identified by the allowed word types list above
    for w in pos:
        if w[1][0] in allowed_word_types:
            adjective.append(w[0])
    
    return adjective

df['tokenized'] = df['tokenized'].apply(pos)
df.head(10)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/patshih/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0,review,sentiment,tokenized
0,based actual story john boorman show struggle ...,pos,"[actual, struggle, american, good, id, america..."
1,gem film four production anticipated quality i...,pos,"[shot, great, know, bad, see, slow, conceivabl..."
2,really like show drama romance comedy rolled o...,pos,"[lorelei, repeat, uptodate, think, know, th, f..."
3,best 3d experience disney themeparks certainly...,pos,"[original, squeak, original, greatest, musical..."
4,korean movie ive seen three really stuck first...,pos,"[korean, first, excellent, sister, second, thi..."
5,movie funny funny funny favorite quote movie t...,pos,"[funny, funny, funny, favorite, naked, indian,..."
6,im starting explore far wonderful world monsie...,pos,"[wonderful, ive, last, single, female, bring, ..."
7,need repeat synopsis rendered glenn black whit...,pos,"[glenn, black, white, powerful, bleakness, dee..."
8,got movie bbc jane austen collection 5 dvd old...,pos,"[old, first, different, thought, austen, cheer..."
9,great movie would compare movie game get end f...,pos,"[great, cant, shaken, happenedbr, come, matrix..."


In [8]:
all_words =[]
for x in df['tokenized']:
    all_words.extend(x)

print(len(all_words))

# creating a frequency distribution of each word.
freqdist = nltk.FreqDist(all_words)
# listing the 5000 most frequent words
frequent_features = list(freqdist.keys())[:5000]

print(len(frequent_features))

1286242
5000


In [9]:
def find_features(tokenized_review):
    features = {}
    for w in frequent_features:
        features[w] = (w in tokenized_review)
    return features

# Creating features for each review
# This can take a while...
featuresets = [(find_features(review), sentiment) for (review, sentiment) in zip(df['tokenized'], df['sentiment'])]

print(len(featuresets))
print(featuresets[0])

50000
({'actual': True, 'struggle': True, 'american': True, 'good': True, 'id': True, 'loose': True, 'caught': True, 'political': True, 'shot': True, 'split': True, 'spontaneous': True, 'beautiful': True, 'unforgettable': True, 'great': False, 'know': False, 'bad': False, 'see': False, 'slow': False, 'conceivable': False, 'documentary': False, 'grimmer': False, 'guilt': False, 'didnt': False, 'stop': False, 'lorelei': False, 'repeat': False, 'uptodate': False, 'think': False, 'th': False, 'fly': False, 'give': False, 'favorite': False, 'mainly': False, 'long': False, 'guess': False, 'br': False, 'happy': False, 'original': False, 'squeak': False, 'greatest': False, 'musical': False, 'aladdin': False, 'little': False, 'smile': False, 'entire': False, 'spectacular': False, 'korean': False, 'first': False, 'excellent': False, 'sister': False, 'second': False, 'third': False, 'fourth': False, 'oldboy': False, 'sympathy': False, 'thirst': False, 'quentin': False, 'gratuitous': False, 'u': F

In [10]:
from nltk.classify.naivebayes import NaiveBayesClassifier

# Shuffling the documents 
random.shuffle(featuresets)

trainng_set = featuresets[:40000]
testing_set = featuresets[10000:]

# This can take a while...
classifier = NaiveBayesClassifier.train(trainng_set)

print(nltk.classify.accuracy(classifier, testing_set))
print(classifier.show_most_informative_features(15))

0.816225
Most Informative Features
             unwatchable = True              neg : pos    =     15.9 : 1.0
                 rickman = True              pos : neg    =     14.3 : 1.0
                   dvdbr = True              pos : neg    =     11.7 : 1.0
               laughable = True              neg : pos    =     11.2 : 1.0
                 unfunny = True              neg : pos    =     11.1 : 1.0
              triumphant = True              pos : neg    =      9.7 : 1.0
                  wilder = True              pos : neg    =      9.7 : 1.0
                   worst = True              neg : pos    =      9.7 : 1.0
                 garbage = True              neg : pos    =      9.4 : 1.0
                bogosian = True              pos : neg    =      9.0 : 1.0
               engrossed = True              pos : neg    =      9.0 : 1.0
                flawless = True              pos : neg    =      9.0 : 1.0
              impeccable = True              pos : neg    =      

In [11]:
ground_truth = [test_case[1] for test_case in testing_set]
predictions = [classifier.classify(test_case[0]) for test_case in testing_set]

In [12]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

print(confusion_matrix(ground_truth, predictions))
print(classification_report(ground_truth, predictions))

[[16828  3221]
 [ 4130 15821]]
              precision    recall  f1-score   support

         neg       0.80      0.84      0.82     20049
         pos       0.83      0.79      0.81     19951

    accuracy                           0.82     40000
   macro avg       0.82      0.82      0.82     40000
weighted avg       0.82      0.82      0.82     40000



In [13]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(["neg", "pos"])
ground_truth_enc = le.transform(ground_truth)
predictions_enc = le.transform(predictions)

auc = roc_auc_score(ground_truth_enc, predictions_enc)

print("AUC: {:.2f}".format(auc))

AUC: 0.82


In [14]:
# pip install -U textblob
from textblob import TextBlob
nltk.download('brown')

blob = TextBlob("Hello! Today is a good day!")
blob.tags
blob.noun_phrases

for sentence in blob.sentences:
        print(sentence, ": ", sentence.sentiment)
        
blob.translate(to="es")

[nltk_data] Downloading package brown to /Users/patshih/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Hello! :  Sentiment(polarity=0.0, subjectivity=0.0)
Today is a good day! :  Sentiment(polarity=0.875, subjectivity=0.6000000000000001)


TextBlob("¡Hola! ¡Hoy es un buen día!")

In [15]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(max_features=5000, max_df=.15)
X = vect.fit_transform(df['review'])

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, learning_method="batch", max_iter=25, random_state=0)

document_topics = lda.fit_transform(X)

print(lda.components_.shape)
document_topics

(10, 5000)


array([[0.4645126 , 0.27462099, 0.00128224, ..., 0.00128236, 0.20278419,
        0.00128234],
       [0.15382375, 0.00217439, 0.00217431, ..., 0.00217464, 0.62764491,
        0.00217505],
       [0.00263206, 0.4069163 , 0.00263228, ..., 0.00263207, 0.00263233,
        0.00263246],
       ...,
       [0.003334  , 0.00333389, 0.00333398, ..., 0.00333409, 0.00333371,
        0.00333391],
       [0.06243173, 0.0006995 , 0.00069948, ..., 0.00069956, 0.00069951,
        0.00069952],
       [0.00400035, 0.0040011 , 0.14120838, ..., 0.0573253 , 0.00400075,
        0.00400148]])

In [16]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
print(len(sorting))
print(sorting)

10
[[2004  501 2064 ... 2764 3810 1873]
 [4930 1653 1675 ... 1116 1129 1848]
 [ 114 1711  250 ... 1905  479 1873]
 ...
 [1439 1868 2882 ... 3810  479 1116]
 [4947 4816  228 ... 4053 3837 1129]
 [2935 4108 3230 ... 3868 4729 1087]]


In [17]:
feature_names = np.array(vect.get_feature_names())
print(len(feature_names))
print(feature_names)

5000
['10' '100' '1000' ... 'zero' 'zombie' 'zone']


In [18]:
def print_topics(topics, feature_names, sorting, topics_per_chunk, n_words):
    for i in range(0, len(topics), topics_per_chunk):
        # for each chunk:
        these_topics = topics[i: i + topics_per_chunk]
        # maybe we have less than topics_per_chunk left
        len_this_chunk = len(these_topics)
        print(these_topics)
        print(*these_topics)
        print(len_this_chunk)
        # print topic headers
        print(("topic {:<8}" * len_this_chunk).format(*these_topics))
        print(("-------- {0:<5}" * len_this_chunk).format(""))
        # print top n_words frequent words
        for i in range(n_words):
            try:
                print(("{:<14}" * len_this_chunk).format(*feature_names[sorting[these_topics, i]]))
            except:
                pass
        print("\n")
        
print_topics(topics=range(10), feature_names=feature_names, sorting=sorting, topics_per_chunk=5, n_words=10)

range(0, 5)
0 1 2 3 4
5
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
guy           woman         action        horror        role          
black         family        fight         killer        performance   
he            father        animation     zombie        play          
back          girl          cartoon       gore          cast          
police        young         hero          blood         john          
around        wife          villain       pretty        version       
woman         mother        series        house         robert        
car           friend        disney        vampire       star          
kill          child         original      there         mr            
cop           son           war           flick         played        


range(5, 10)
5 6 7 8 9
5
topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------    