In [1]:
import os
import gensim
import numpy as np
import re
import pandas as pd
import logging
import nltk
from gensim import corpora, utils
from gensim.models import Phrases
from gensim.corpora import Dictionary
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import defaultdict
from smart_open import open

In [2]:
# Tokenizer, stop-word set and stemmer shared by every preprocess() call.
# They do not depend on the document, so they are built once (on first use)
# instead of once per review.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (tokenizer, stoplist, stemmer) triple, building it on first use."""
    if not _PREPROCESS_CACHE:
        # All three values are constructed before update() runs, so a failure
        # (e.g. the stop-word corpus is not available) leaves the cache empty
        # and the next call simply retries.
        _PREPROCESS_CACHE.update(
            tokenizer=RegexpTokenizer(r'[a-zA-Z\'\-\_]+'),
            stoplist=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
        )
    return (
        _PREPROCESS_CACHE['tokenizer'],
        _PREPROCESS_CACHE['stoplist'],
        _PREPROCESS_CACHE['stemmer'],
    )


def preprocess(document):
    """Turn one raw review into a list of stemmed tokens.

    Steps, in order: lower-case the text, split it into runs of letters,
    apostrophes, hyphens and underscores, drop the token 'br' and English
    stop words, Porter-stem what remains, and finally discard stems of
    length 1.

    Parameters
    ----------
    document : str
        Raw review text. ``None`` or a float NaN (a missing value) is treated
        as an empty document.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # Missing values have no .lower(); treat them as "no tokens" rather than
    # crashing in the middle of a DataFrame.apply.
    if document is None or (isinstance(document, float) and document != document):
        return []

    tokenizer, stoplist, stemmer = _preprocess_resources()

    # convert to lower case and tokenize
    tokens = tokenizer.tokenize(document.lower())

    # drop the 'br' token and stop words (both filters run before stemming)
    kept = [token for token in tokens if token != 'br' and token not in stoplist]

    # stem, then remove stems with length 1
    stems = (stemmer.stem(token) for token in kept)
    return [stem for stem in stems if len(stem) > 1]

In [4]:
# Build the path from its components instead of a backslash literal: '\d' and
# '\I' are invalid escape sequences in a normal string (Python warns about
# them), and a hard-coded backslash separator only resolves on Windows.
DATA_PATH = os.path.join('..', 'data', 'IMDB Dataset.csv')
df = pd.read_csv(DATA_PATH)

In [5]:
# Drop duplicate rows, then attach the token list that preprocess() produces
# for each review as the 'clean_review' column.
df = df.drop_duplicates().assign(
    clean_review=lambda frame: frame['review'].apply(preprocess)
)

# Plain Python lists for the following cells: tokenised documents and labels.
X, y = (df[column].tolist() for column in ('clean_review', 'sentiment'))

In [6]:
# Number of tokenised reviews in X (one entry per row of the de-duplicated frame).
len(X)

49582

In [7]:
# Build the token -> integer id mapping from the tokenised reviews,
# then show how many distinct tokens it contains.
dct = Dictionary(documents=X)
len(dct)

120253

In [8]:
# Vocabulary pruning thresholds:
#   MIN_DOC_COUNT    - a token must appear in at least this many reviews
#   MAX_DOC_FRACTION - a token may appear in at most this fraction of reviews
MIN_DOC_COUNT = 5
MAX_DOC_FRACTION = 0.6

# Prune the dictionary in place, then show the remaining vocabulary size.
dct.filter_extremes(no_below=MIN_DOC_COUNT, no_above=MAX_DOC_FRACTION)
len(dct)

28564

In [9]:
# Convert every tokenised review into its bag-of-words representation
# using the (already pruned) dictionary.
corpus = list(map(dct.doc2bow, X))

In [11]:
# Output files, written relative to the notebook's working directory.
DICTIONARY_PATH = 'dictionary.dict'
CORPUS_PATH = 'mycorpus.mm'

# Persist the dictionary and the bag-of-words corpus (Matrix Market format).
dct.save(DICTIONARY_PATH)
corpora.MmCorpus.serialize(CORPUS_PATH, corpus)