# Movie Review Sequencer

### Importing Modules

In [1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

import os
import pickle
import re

import time
import enchant

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

### Loading dataframe and first look

In [2]:
# Concatenating dataframes
REVIEW_DIR = '../../review_dfs'

# NOTE(review): pd.read_pickle unpickles arbitrary objects; only point
# REVIEW_DIR at pickle files produced by this project.
# sorted() keeps the row order reproducible, since os.listdir returns
# directory entries in arbitrary order.
review_frames = []
for df_file in sorted(os.listdir(REVIEW_DIR)):
    df_new = pd.read_pickle(f'{REVIEW_DIR}/{df_file}')

    # Empty frames contribute no rows, so leave them out of the concat
    if not df_new.empty:
        review_frames.append(df_new)

# Concatenate once instead of re-copying the growing frame for every file;
# fall back to an empty frame when there was nothing to load.
df = pd.concat(review_frames) if review_frames else pd.DataFrame()

In [3]:
# Dataframe head
# Renumber the rows 0..n-1; the previous index is kept as an 'index' column.
df = df.reset_index()
df.head()

Unnamed: 0,index,movie_name,movie_year,meter_score,user,post_date,verified,super_reviewer,spoilers,profanity,review,rating
0,0,antichrist,2009,53,882704800,2021-02-2,0,0,0,1,"While ""Antichrist"" is certainly a graphic and ...",4.0
1,1,antichrist,2009,53,978539678,2020-12-2,0,0,0,0,The ridiculously gigantic proportions of this ...,4.0
2,2,antichrist,2009,53,978883847,2020-12-1,0,0,0,1,What to say about this BORING load of CRAP !!!...,0.5
3,3,antichrist,2009,53,977983837,2020-11-1,0,0,0,1,Too much very graphic porn and the woman is a ...,1.0
4,4,antichrist,2009,53,978580560,2020-09-2,0,0,0,0,Descomunalmente desconfortável. Lars Von Trie...,3.5


### Manually checking for problem reviews

In [5]:
# Disabled manual inspection helper: shuffles every review (sample(frac=1)),
# prints them one at a time and waits on input() after each one, printing a
# blank line when something was typed. Presumably kept commented out because
# input() would block a non-interactive "Run All" -- confirm before removing.
# df_random = df.sample(frac=1)

# for i in range(len(df_random)):
#     review = df_random.iloc[i]['review']
#     print('- ' + review)
#     inp = input('\t')
#     if len(inp)>0:
#         print()

### Problem code examples

- [font=Century Gothic]This movie was an incredibly moving piece. The character arcs are beautiful and horrifying. It will depress you.[/font]

- [font=Impact]it was a great movie and i want to see it again!i think mel gibson did a tremendious job![/font]

- [center][font=Arial, Helvetica, sans-serif][color=#ff0000][b]"The entire artistic design of the film is so outlandish that words don't do it justice."[/b][/color][/font][/center]
[center][b][font=Arial][color=#ff0000][/color][/font][/b] [/center]
[center]Read Mike's full review by clicking below:[/center]
[url="http://www.moviepulse.net/mp_pages/dvd/page_clockworkorange.php"][img]http://www.moviepulse.net/mp_pages/dvd/screenshots/clockworkorange/clockworkorange.jpg[/img][/url]

### Removing BBCode-tagged reviews

In [18]:
# Highlighting reviews that contain a closing markup tag such as [/font]
# (the examples above use BBCode-style square-bracket tags).
# The pattern is a raw string because '\[' is not a valid escape sequence in
# a normal string literal. na=False counts a missing review as "no match" so
# the mask stays purely boolean.
df2 = df.loc[df['review'].str.contains(r'\[/', na=False), 'review']

### Removing non-English reviews

In [16]:
# Dictionary of US-english words
d = enchant.Dict("en_US")

# Checking subsection of reviews
N = 1000             # number of leading reviews to scan
WORDS_CHECKED = 20   # only this many leading words of a review are examined
NON_ENG_LIMIT = 10   # dictionary misses needed among them to flag the review

# Built once and reused: keeps runs of word characters, drops punctuation
tokenizer = RegexpTokenizer(r'\w+')


def is_non_english(review, words_checked=WORDS_CHECKED, non_eng_limit=NON_ENG_LIMIT):
    """Return True when a review looks non-English.

    The review is lower-cased and tokenized, then its first `words_checked`
    words are looked up in the dictionary `d`. The review is flagged when at
    least `non_eng_limit` of those words are not found.

    Non-string values (e.g. a missing review) are never flagged.
    """
    if not isinstance(review, str):
        return False

    leading_words = tokenizer.tokenize(review.lower())[:words_checked]
    non_eng = sum(not d.check(word) for word in leading_words)
    return non_eng >= non_eng_limit


# If 10/20 words are non-eng, flag review.
# .iloc[:N] is positional, so it also works when df has fewer than N rows.
reviews_checked = df['review'].iloc[:N]
non_english_idx = [
    idx for idx, review in reviews_checked.items() if is_non_english(review)
]

### Tokenizing and stemming a sample of reviews

In [7]:
# Timer is started here and read in the text-to-sequence cell below
start = time.time()

# Number of leading reviews to preprocess
N_REVIEWS = 10

# Replacing digits with corresponding word: translation table for
# str.translate, equivalent to replacing each digit character in turn.
# NOTE(review): the words are inserted without spaces, so "2009" becomes
# "twozerozeronine" -- confirm this is the intended vocabulary.
DIGIT_WORDS = str.maketrans({
    '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
    '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '0': 'zero',
})

# Loop-invariant helpers, built once instead of once per review
tokenizer = RegexpTokenizer(r'\w+')            # words only, punctuation dropped
stop_words = set(stopwords.words('english'))   # set for O(1) membership tests
ps = PorterStemmer()


def preprocess_review(review):
    """Turn one raw review string into a list of stemmed tokens.

    Steps: lower-case, spell out digits, tokenize on word characters,
    drop English stop words, then Porter-stem what remains.
    """
    review = review.lower().translate(DIGIT_WORDS)

    # Tokenizing into list - removing all punctuation and separating by word
    review_tokenized = tokenizer.tokenize(review)

    # Stop word filtration
    review_filtered = [w for w in review_tokenized if w not in stop_words]

    # Stemming
    return [ps.stem(w) for w in review_filtered]


# .iloc[:N_REVIEWS] is positional, so a frame shorter than N_REVIEWS is fine
processed_review = [
    preprocess_review(review) for review in df['review'].iloc[:N_REVIEWS]
]

### Text to sequence process

In [8]:
# Text to sequence
# Vocabulary cap handed to the Keras tokenizer as num_words
VOCAB_SIZE = 1000

# Learn the word -> integer index mapping from the preprocessed token lists
seq_tokenizer = Tokenizer(num_words=VOCAB_SIZE)
seq_tokenizer.fit_on_texts(processed_review)

# Encode every review as a list of integer word indices
seq = seq_tokenizer.texts_to_sequences(processed_review)
print(seq)

# Elapsed wall-clock time since `start` was set in the preprocessing cell
end = time.time()
elapsed = end - start
print(elapsed)

[[9, 18, 19, 46, 1, 20, 10, 47, 48, 49, 50, 21, 51, 52, 1, 5, 53, 4, 3, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 22, 64, 65, 22, 66, 11, 67, 68, 23, 5, 69, 2, 24, 12, 70, 13, 71, 1, 72, 73, 74, 14, 6, 1, 25, 75, 26, 27, 7, 76, 2, 9, 77, 78, 79, 80, 81, 6, 28, 29, 11, 82, 2, 10, 83, 12, 84, 85, 86, 87, 24], [88, 89, 90, 91, 92, 93, 94, 95, 3, 30, 96, 97, 26, 98, 99], [31, 100, 101, 102, 12, 103, 104, 105, 32, 106, 107, 108, 109, 110, 111, 112, 113], [33, 19, 34, 15, 114, 115, 35, 116, 117, 36, 118, 28, 119, 120, 32, 6, 121, 20, 122, 123, 37, 124, 125, 21, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 16, 31, 140, 141, 142, 143, 144, 15, 10, 34, 37, 15], [145, 146, 8, 4, 3, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 38, 167, 168, 169, 170], [171, 1, 172, 173, 174, 175, 1, 39, 176, 177, 16, 178, 18, 179, 180, 181, 40, 41, 23, 42], [182, 16, 2, 1, 183, 184, 185, 2, 186, 1, 6, 187, 188, 189, 42, 190, 8, 4, 3, 1,

### General Notes

The Keras tutorial uses the functional API for its neural networks, not the Sequential API.