In [194]:
import pandas
import bs4
import advertools
import itertools
import nltk

Read the database.

In [195]:
df = pandas.read_csv('IMDB-Dataset.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


Remove web tags.

In [196]:
df['review'] = df['review'].apply(lambda line: bs4.BeautifulSoup(line, 'lxml').text)
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


Some simple custom translations.

In [197]:
def simple_translate(line):
    better_line = line
    better_line = better_line.replace('+', ' plus ')
    better_line = better_line.replace('&', ' and ')
    return better_line
df['review'] = df['review'].apply(simple_translate)
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh and innovati...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [198]:
def remove_puctuation(line):
    return ''.join(map(lambda c: c if c.isalnum() or c=="'" or not c.isascii() else ' ', line))
df['review'] = df['review'].apply(remove_puctuation)
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,Petter Mattei's Love in the Time of Money is...,positive
5,Probably my all time favorite movie a story o...,positive
6,I sure would like to see a resurrection of a u...,positive
7,This show was an amazing fresh and innovati...,negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


Split into words.

In [199]:
# df['review'] = df['review'].apply(lambda line: list(itertools.chain.from_iterable(advertools.word_tokenize(line, 1))))
from nltk.tokenize import TreebankWordTokenizer
t = nltk.treebank.TreebankWordTokenizer()
df['review'] = df['review'].apply(TreebankWordTokenizer().tokenize)
df.head(10)

Unnamed: 0,review,sentiment
0,"[One, of, the, other, reviewers, has, mentione...",positive
1,"[A, wonderful, little, production, The, filmin...",positive
2,"[I, thought, this, was, a, wonderful, way, to,...",positive
3,"[Basically, there, 's, a, family, where, a, li...",negative
4,"[Petter, Mattei, 's, Love, in, the, Time, of, ...",positive
5,"[Probably, my, all, time, favorite, movie, a, ...",positive
6,"[I, sure, would, like, to, see, a, resurrectio...",positive
7,"[This, show, was, an, amazing, fresh, and, inn...",negative
8,"[Encouraged, by, the, positive, comments, abou...",negative
9,"[If, you, like, original, gut, wrenching, laug...",positive


Remove words with special characters.

In [200]:
df['review'] = df['review'].apply(lambda words: list(filter(lambda word: word.isascii(), words)))
df.head(10)

Unnamed: 0,review,sentiment
0,"[One, of, the, other, reviewers, has, mentione...",positive
1,"[A, wonderful, little, production, The, filmin...",positive
2,"[I, thought, this, was, a, wonderful, way, to,...",positive
3,"[Basically, there, 's, a, family, where, a, li...",negative
4,"[Petter, Mattei, 's, Love, in, the, Time, of, ...",positive
5,"[Probably, my, all, time, favorite, movie, a, ...",positive
6,"[I, sure, would, like, to, see, a, resurrectio...",positive
7,"[This, show, was, an, amazing, fresh, and, inn...",negative
8,"[Encouraged, by, the, positive, comments, abou...",negative
9,"[If, you, like, original, gut, wrenching, laug...",positive


Remove non-words.

In [201]:
def possibly_word(word):
    return word.upper() != word.lower()
df['review'] = df['review'].apply(lambda words: list(filter(possibly_word, words)))
df.head(10)

Unnamed: 0,review,sentiment
0,"[One, of, the, other, reviewers, has, mentione...",positive
1,"[A, wonderful, little, production, The, filmin...",positive
2,"[I, thought, this, was, a, wonderful, way, to,...",positive
3,"[Basically, there, 's, a, family, where, a, li...",negative
4,"[Petter, Mattei, 's, Love, in, the, Time, of, ...",positive
5,"[Probably, my, all, time, favorite, movie, a, ...",positive
6,"[I, sure, would, like, to, see, a, resurrectio...",positive
7,"[This, show, was, an, amazing, fresh, and, inn...",negative
8,"[Encouraged, by, the, positive, comments, abou...",negative
9,"[If, you, like, original, gut, wrenching, laug...",positive


In [202]:
# def remove_full_stop(word):
#     return word.replace('.', '')
# df['review'] = df['review'].apply(lambda words: list(map(remove_full_stop, words)))
# df.head(10)

Remove stray apostrophes.

In [203]:
def clean_apostrophes(word):
    if word == "n't":
        return "not"
    simple_word = word.replace("'", "")
    if simple_word == word:
        return word
    if len(simple_word) <=3:
        return None
    else:
        return simple_word
df['review'] = df['review'].apply(lambda words: list(filter(bool, map(clean_apostrophes, words))))
df.head(10)

Unnamed: 0,review,sentiment
0,"[One, of, the, other, reviewers, has, mentione...",positive
1,"[A, wonderful, little, production, The, filmin...",positive
2,"[I, thought, this, was, a, wonderful, way, to,...",positive
3,"[Basically, there, a, family, where, a, little...",negative
4,"[Petter, Mattei, Love, in, the, Time, of, Mone...",positive
5,"[Probably, my, all, time, favorite, movie, a, ...",positive
6,"[I, sure, would, like, to, see, a, resurrectio...",positive
7,"[This, show, was, an, amazing, fresh, and, inn...",negative
8,"[Encouraged, by, the, positive, comments, abou...",negative
9,"[If, you, like, original, gut, wrenching, laug...",positive


Transform words to normal forms.

In [204]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
# df['review'] = df['review'].apply(lambda words: list(map(stemmer.stem, words)))
# df.head(10)
df['review'] = df['review'].apply(lambda words: list(map(stemmer.stem, words)))
df.head(10)

Unnamed: 0,review,sentiment
0,"[one, of, the, other, review, has, mention, th...",positive
1,"[a, wonder, littl, product, the, film, techniq...",positive
2,"[i, thought, this, was, a, wonder, way, to, sp...",positive
3,"[basic, there, a, famili, where, a, littl, boy...",negative
4,"[petter, mattei, love, in, the, time, of, mone...",positive
5,"[probabl, my, all, time, favorit, movi, a, sto...",positive
6,"[i, sure, would, like, to, see, a, resurrect, ...",positive
7,"[this, show, was, an, amaz, fresh, and, innov,...",negative
8,"[encourag, by, the, posit, comment, about, thi...",negative
9,"[if, you, like, origin, gut, wrench, laughter,...",positive


Remove stop words.

In [205]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.remove('no')
stop_words.remove('not')
df['review'] = df['review'].apply(lambda words: list(filter(lambda word: word not in stop_words, words)))
df.head(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maksimabramov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, oz, episod, hook...",positive
1,"[wonder, littl, product, film, techniqu, veri,...",positive
2,"[thought, wonder, way, spend, time, hot, summe...",positive
3,"[basic, famili, littl, boy, jake, think, zombi...",negative
4,"[petter, mattei, love, time, money, visual, st...",positive
5,"[probabl, time, favorit, movi, stori, selfless...",positive
6,"[sure, would, like, see, resurrect, date, seah...",positive
7,"[show, amaz, fresh, innov, idea, first, air, f...",negative
8,"[encourag, posit, comment, film, look, forward...",negative
9,"[like, origin, gut, wrench, laughter, like, mo...",positive


In [206]:
small = df['review'].apply(len).min()
small, list(filter(lambda x: len(x) <= small, df['review']))

(3, [['script', 'stori', 'mess']])

In [210]:
all_words = set()
for line in df['review']:
    all_words.update(line)
print("unique words:", len(all_words))
print("max length:", max(len(line) for line in df['review']))

unique words: 41640
max length: 1430


In [217]:
d = dict()
for line in df['review']:
    for word in line:
        count = d.get(word, 0) + 1
        d[word] = count
unique_wordset = set(filter(lambda word: d[word] == 1, d.keys()))
print(len(unique_wordset))
df['review'] = df['review'].apply(lambda words: list(filter(lambda word: word not in unique_wordset, words)))

0


In [218]:
df['review'] = df['review'].apply(lambda words: ' '.join(words))
df.head(10)

Unnamed: 0,review,sentiment
0,one review mention watch oz episod hook right ...,positive
1,wonder littl product film techniqu veri unassu...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic famili littl boy jake think zombi closet...,negative
4,mattei love time money visual stun film watch ...,positive
5,probabl time favorit movi stori selfless sacri...,positive
6,sure would like see resurrect date seahunt ser...,positive
7,show amaz fresh innov idea first air first yea...,negative
8,encourag posit comment film look forward watch...,negative
9,like origin gut wrench laughter like movi youn...,positive


In [219]:
df.to_csv('IMDB-Dataset-clean-reduced.csv', index=False)