In [1]:
import os
from pathlib import Path
from sklearn.utils import shuffle

def load_Reviews(filepath, random_state=None):
  """Read the labelled reviews stored under ``filepath`` and shuffle them.

  ``filepath`` is expected to contain a ``neg`` and a ``pos`` sub-directory.
  Every regular file inside them is read as UTF-8 text and becomes one
  review, labelled ``'negative'`` or ``'positive'`` respectively.

  Args:
    filepath: Directory of one dataset split, e.g. ``'aclImdb/train/'``.
      The trailing slash is optional because paths are joined with pathlib
      rather than by string concatenation.
    random_state: Forwarded to ``sklearn.utils.shuffle``. Pass an int to get
      the same order on every run; the default ``None`` leaves the order
      random, as before.

  Returns:
    The result of ``shuffle(reviews, sentiment)``: the review texts and their
    labels, permuted consistently so that position ``i`` of one still
    corresponds to position ``i`` of the other.
  """
  root = Path(filepath)

  def read_all(label_dir):
    # Directory listing order is arbitrary; sorting makes the pre-shuffle
    # order stable so that a fixed random_state yields a reproducible result.
    # Sub-directories are skipped because they cannot be read as text.
    return [entry.read_text(encoding='utf-8')
            for entry in sorted((root / label_dir).iterdir())
            if entry.is_file()]

  neg = read_all('neg')
  pos = read_all('pos')

  # Labels are built in the same order as the concatenated reviews.
  sentiment = ['negative'] * len(neg) + ['positive'] * len(pos)
  reviews = neg + pos

  # Shuffle both sequences with the same permutation.
  return shuffle(reviews, sentiment, random_state=random_state)

# Read both dataset splits from disk; each call returns the shuffled review
# texts together with their 'negative'/'positive' labels.
train_reviews, train_sentiment=load_Reviews('aclImdb/train/')
test_reviews, test_sentiment=load_Reviews('aclImdb/test/')


In [24]:
# Display entry 6 of the raw training reviews as a quick look at the loaded text.
train_reviews[6]

'Earlier today I got into an argument on why so many people complain about modern films in which I encountered a curious statement: "the character development in newer movies just isn\'t nearly as good or interesting as it used to be." Depending on the film(s) in question, this can be attributed to a number of things, sometimes generic special effects and plot-driven Hollywood garbage like War Of The Worlds, but in the case of over-the-top, uninteresting attempts at social commentary and a desperate struggle to put "art" back into cinema, it\'s movies like Dog Days that are to blame.<br /><br />I normally have a very high tolerance for movies, no matter how dull or pointless I find them (ranging from good, long ones like Andrei Rublev and Dogville, to ones I\'ve considered painful to sit through a la Alpha Dog and Wild Wild West). I shut this movie off 45 minutes in, which is 30 minutes more than I actually should have. I wasn\'t interested in any of the characters whatsoever and found

In [4]:
# Save the raw training and test reviews (with their labels) to a JSON file,
# since producing them takes a very long time.
import json

raw_reviews = {
    'train': (train_reviews, train_sentiment),
    'test': (test_reviews, test_sentiment),
}
with open('raw_reviews.json', 'w') as outfile:
    # Serialize first, then write the resulting text in one go.
    outfile.write(json.dumps(raw_reviews))

In [11]:
# Display entry 9 of the raw training reviews (before any normalization).
train_reviews[9]

"I enjoyed this film. It was a joy to see a version so close to the vision of Peter O'Donnell.<br /><br />A number of people have disliked the film, but it has to be seen in context of the origin story that it is. The film uses flashback to show the young Modesty and the events that shaped her into the woman that she became. Before the Network. Before Willie Garvin.<br /><br />The pace is a trifle slow, and for my taste not enough tension is developed in the present day scenes. However this is acceptable just to get such a faithful version.<br /><br />If you like Modesty Blaise, you will enjoy it even with its faults, if you just want an action flick with car chases - forget it.<br /><br />It has the feeling of being the first of a franchise, but as I have never seen it promoted anywhere, I suspect there will be no more to follow. Sadly."

In [22]:
import text_normalizer as tn

# Light normalization of both splits: stop words are kept and lemmatization
# is switched off; all other normalize_corpus options are left at their
# defaults.
norm_train_reviews = tn.normalize_corpus(
    train_reviews,
    stopword_removal=False,
    text_lemmatization=False,
)
norm_test_reviews = tn.normalize_corpus(
    test_reviews,
    stopword_removal=False,
    text_lemmatization=False,
)

# Signature noted here for reference (verify against text_normalizer):
# normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
#                      accented_char_removal=True, text_lower_case=True, 
#                      text_stemming=False, text_lemmatization=True, 
#                      special_char_removal=True, remove_digits=True,
#                      stopword_removal=True, stopwords=stopword_list)

In [None]:
# Save the normalized training and test reviews (with their labels) to a JSON
# file, since producing them takes a very long time.
import json

norm_reviews = {
    'train': (norm_train_reviews, train_sentiment),
    'test': (norm_test_reviews, test_sentiment),
}
with open('norm_reviews2.json', 'w') as outfile:
    # Serialize first, then write the resulting text in one go.
    outfile.write(json.dumps(norm_reviews))

In [25]:
# Display entry 9 of norm_train_reviews to inspect the normalizer's output.
norm_train_reviews[9]

'i enjoyed this film it was a joy to see a version so close to the vision of peter ofdonnell a number of people have disliked the film but it has to be seen in context of the origin story that it is the film uses flashback to show the young modesty and the events that shaped her into the woman that she became before the network before willie garvin the pace is a trifle slow and for my taste not enough tension is developed in the present day scenes however this is acceptable just to get such a faithful version if you like modesty blaise you will enjoy it even with its faults if you just want an action flick with car chases forget it it has the feeling of being the first of a franchise but as i have never seen it promoted anywhere i suspect there will be no more to follow sadly'

In [4]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu
import nltk

# Compact NumPy printing: two digits of precision, lines wrapped at 80 columns.
np.set_printoptions(linewidth=80, precision=2)

In [6]:
# Normalize both splits with normalize_corpus' default options, using NLTK's
# English stop-word list minus 'no', 'but' and 'not' so those words stay in
# the text. Note: this rebinds norm_train_reviews / norm_test_reviews.
stop_words = nltk.corpus.stopwords.words('english')
for _kept_word in ('no', 'but', 'not'):
    # list.remove raises ValueError if the word is missing from the list.
    stop_words.remove(_kept_word)

norm_train_reviews = tn.normalize_corpus(train_reviews, stopwords=stop_words)
norm_test_reviews = tn.normalize_corpus(test_reviews, stopwords=stop_words)

# Signature noted here for reference (verify against text_normalizer):
# normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
#                      accented_char_removal=True, text_lower_case=True, 
#                      text_stemming=False, text_lemmatization=True, 
#                      special_char_removal=True, remove_digits=True,
#                      stopword_removal=True, stopwords=stopword_list)

In [7]:
# Sanity check: number of normalized test reviews.
len(norm_test_reviews)

25000

In [19]:
# Save the normalized training and test reviews (with their labels) to a JSON
# file, since producing them takes a very long time.
import json

norm_reviews = {
    'train': (norm_train_reviews, train_sentiment),
    'test': (norm_test_reviews, test_sentiment),
}
with open('norm_reviews.json', 'w') as outfile:
    # Serialize first, then write the resulting text in one go.
    outfile.write(json.dumps(norm_reviews))

In [14]:
# Display entry 18224 of norm_train_reviews. load_Reviews shuffles the reviews,
# so which review sits at a given index can differ between runs.
norm_train_reviews[18224]

'enjoy film joy see version close vision peter ofdonnell number people dislike film but see context origin story film use flashback show young modesty event shape woman become network willie garvin pace trifle slow taste not enough tension develop present day scene however acceptable get faithful version like modesty blaise enjoy even fault want action flick car chase forget feeling first franchise but never see promote anywhere suspect no follow sadly'

In [10]:
# Reload the normalized training and test review sets (and their labels)
# from the JSON file written earlier.
import json

with open('norm_reviews.json') as f:
    norm_reviews = json.loads(f.read())

# Each split is stored as a [reviews, sentiment] pair.
norm_train_reviews, train_sentiment = norm_reviews['train']
norm_test_reviews, test_sentiment = norm_reviews['test']