# Movie Review Sequencer

### Importing Modules

In [31]:
import numpy as np
import pandas as pd
import icecream as ic

from matplotlib import pyplot as plt
import seaborn as sns

import os
import pickle
import re

import time
import enchant

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [32]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.set_option('mode.chained_assignment', None)

### Loading dataframe and first look

In [33]:
# Load every pickled review dataframe and stack them into a single frame.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only point REVIEW_DIR at files produced by a trusted source.
REVIEW_DIR = '../../review_dfs'

# Sort the listing so the row order is reproducible between runs
# (os.listdir returns entries in arbitrary order).
review_frames = [pd.read_pickle(os.path.join(REVIEW_DIR, df_file))
                 for df_file in sorted(os.listdir(REVIEW_DIR))]

# Concatenate once rather than growing the frame inside the loop, which
# re-copies every accumulated row on each iteration. An empty directory
# yields an empty DataFrame instead of an error from pd.concat([]).
df = pd.concat(review_frames) if review_frames else pd.DataFrame()

In [34]:
# Renumber the rows 0..n-1; the previous (per-file) index is kept as an 'index' column
df = df.reset_index()
df

Unnamed: 0,index,movie_name,movie_year,meter_score,user,post_date,verified,super_reviewer,spoilers,profanity,review,rating
0,0,9_songs,2004,24,978825829,2020-08-2,0,0,0,1,The only erotic movie in all movies that had a...,5.0
1,1,9_songs,2004,24,977906655,2019-11-2,0,0,0,1,I like both sex and music and yet this movie h...,1.0
2,2,9_songs,2004,24,978364060,2019-10-3,0,0,0,0,I want to watch it but I don't know how,4.0
3,3,9_songs,2004,24,977673194,2018-12-2,0,0,0,0,Margo Stilley is a goddess!,5.0
4,4,9_songs,2004,24,977165620,2018-06-0,0,0,0,0,I really liked the movie. Best dick I have ev...,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
373788,262,year_of_the_dragon,1985,56,,2006-04-1,0,0,0,0,I just rewatched this movie after not having s...,2.0
373789,263,year_of_the_dragon,1985,56,,2006-03-1,0,0,0,0,Now this is a Mickey Rourke movie! A good chin...,5.0
373790,264,year_of_the_dragon,1985,56,,2006-02-1,0,0,0,0,more like year of the spenceman...am i right? ...,5.0
373791,265,year_of_the_dragon,1985,56,,2006-02-1,0,0,0,0,RENT THIS!!!!! UNDERRATED!!!!!!!!!!!!,4.0


### Manually checking for problem reviews

In [5]:
# Disabled manual inspection loop: shuffles the dataframe, prints one review at
# a time prefixed with '- ' and waits for keyboard input; a non-empty input
# prints a blank line. If re-enabled it blocks on input() for every row.
# df_random = df.sample(frac=1)

# for i in range(len(df_random)):
#     review = df_random.iloc[i]['review']
#     print('- ' + review)
#     inp = input('\t')
#     if len(inp)>0:
#         print()

### Problem code examples

- [font=Century Gothic]This movie was an incredibly moving piece. The character arcs are beautiful and horrifying. It will depress you.[/font]

- [font=Impact]it was a great movie and i want to see it again!i think mel gibson did a tremendious job![/font]

- [center][font=Arial, Helvetica, sans-serif][color=#ff0000][b]"The entire artistic design of the film is so outlandish that words don't do it justice."[/b][/color][/font][/center]
[center][b][font=Arial][color=#ff0000][/color][/font][/b] [/center]
[center]Read Mike's full review by clicking below:[/center]
[url="http://www.moviepulse.net/mp_pages/dvd/page_clockworkorange.php"][img]http://www.moviepulse.net/mp_pages/dvd/screenshots/clockworkorange/clockworkorange.jpg[/img][/url]

## Filtering reviews
### Stripping BBCode (forum-style) markup from tagged reviews

In [6]:
# Reviews pasted from forums carry BBCode-style markup such as [b]...[/b] or
# [url=...]...[/url]. Every review containing a closing tag ("[/") is cleaned.
# Raw string: '\[' is not a valid escape in a normal Python string literal.
has_markup = df['review'].str.contains(r'\[/', na=False)
df_html = df.loc[has_markup]

# Plain tags removed verbatim (matched after the review is lower-cased)
str_remove   = ['[b]','[/b]',
                '[i]','[/i]',
                '[u]','[/u]',
                '[indent]','[/indent]',
                '[center]','[/center]',
                '[left]','[/left]',
                '[right]','[/right]',
                '[spoiler]','[/spoiler]',
                '[quote]','[/quote]',
                '[list]','[/list]',
                '[/size]','[/font]','[/color]']

# Tags that carry an attribute, or whose wrapped content should be discarded.
# - Patterns are lower-case only because they run on lower-cased text, so a
#   separate upper-case [IMG] pattern could never match.
# - Quantifiers are non-greedy so a match stops at the nearest closing tag
#   instead of swallowing all review text up to the last one.
# - color/font accept any attribute value up to the closing bracket, so values
#   such as "#ff0000" or "arial, helvetica, sans-serif" are removed as well.
regex_remove = [r'\[img\].*?\[/img\]',      # img
                r'\[url=.*?\[/url\]',       # url
                r'\[/?size=[0-9]+\]',       # size
                r'\[/?color=[^\]]+\]',      # color
                r'\[/?font=[^\]]+\]',       # font
                r'\[email=".*?\[/email\]']  # email
regex_compiled = [re.compile(pattern) for pattern in regex_remove]


def strip_bbcode(review):
    """Return `review` lower-cased with the known BBCode markup removed.

    Plain tags listed in `str_remove` are deleted first, then every pattern
    in `regex_remove` is applied in order.
    """
    review = review.lower()
    for tag in str_remove:
        review = review.replace(tag, '')
    for pattern in regex_compiled:
        review = pattern.sub('', review)
    return review


review_clean = [strip_bbcode(review) for review in df_html['review']]

# Write back through a single .loc call: the chained form
# df['review'].loc[mask] = ... may assign into a temporary copy and leave
# `df` unchanged.
df.loc[has_markup, 'review'] = review_clean

### Removing non-english reviews

In [18]:
# Dictionary of US-english words
d = enchant.Dict("en_US")

# A review is flagged as non-English when at least NONENG_THRESHOLD of its
# first NONENG_WINDOW words are missing from the dictionary.
NONENG_WINDOW = 20
NONENG_THRESHOLD = 10

# Built once; the tokenizer does not depend on the review being processed
word_tokenizer = RegexpTokenizer(r'\w+')

# Positional row numbers (0-based, in df order) of the flagged reviews.
# enumerate() advances the counter for every review, not only for flagged ones.
remove_idx_noneng = []
for idx, review in enumerate(df['review']):
    # Only the leading words are inspected, so long English reviews are not
    # flagged merely for exceeding the window length.
    leading_words = word_tokenizer.tokenize(review.lower())[:NONENG_WINDOW]
    non_eng = sum(1 for word in leading_words if not d.check(word))
    if non_eng >= NONENG_THRESHOLD:
        remove_idx_noneng.append(idx)

print(remove_idx_noneng)

KeyboardInterrupt: 

In [None]:
# Removing any reviews that still contain HTML tag variations
# (raw string: '\[' is not a valid escape in a normal string literal)
still_tagged = df['review'].str.contains(r'\[/', na=False)
df_remove  = df.loc[still_tagged]
remove_idx_html = list(df_remove.index)

# Filter with the boolean mask rather than df.index[labels]: the latter treats
# index labels as positions, which selects the wrong rows (or raises) as soon
# as the index is no longer a gap-free 0..n-1 range, e.g. when this cell is
# run a second time after rows have been dropped.
df = df.loc[~still_tagged]

### Tokenizing and stemming each review

In [9]:
# Timer start; the elapsed time is reported by the text-to-sequence cell
start = time.time()

# Everything below is identical for every review, so it is built once here
# instead of once per loop iteration.

# Replacing digits with corresponding word: a single translate() pass gives the
# same result as ten successive str.replace calls, because none of the
# replacement words contains a digit.
numbers = [1,2,3,4,5,6,7,8,9,0]
num_words = ['one','two','three','four','five','six','seven','eight','nine','zero']
digit_to_word = str.maketrans({str(number): number_word
                               for number, number_word in zip(numbers, num_words)})

# Tokenizer keeps runs of word characters, which removes all punctuation
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# One list of stemmed tokens per review, in df order
processed_review = []
for review in df['review']:
    # NOTE: adjacent digits are spelled out back to back without a separator
    # (e.g. "10" becomes "onezero"), exactly as with the chained replaces.
    review = review.lower().translate(digit_to_word)

    # Tokenizing into list - removing all punctuation and separating by word
    review_tokenized = tokenizer.tokenize(review)

    # Stop word filtration
    review_filtered = [w for w in review_tokenized if w not in stop_words]

    # Stemming
    review_stemmed = [ps.stem(w) for w in review_filtered]

    processed_review.append(review_stemmed)

### Text to sequence process

In [10]:
# Text to sequence
# Vocabulary cap handed to the Keras Tokenizer as num_words; words outside the
# most frequent ones are left out of the generated sequences.
VOCAB_SIZE = 1000
# Number of sequences shown as a preview
N_PREVIEW = 5

seq_tokenizer = Tokenizer(num_words = VOCAB_SIZE)
seq_tokenizer.fit_on_texts(processed_review)

# One list of integer word ids per review, in the same order as processed_review
seq = seq_tokenizer.texts_to_sequences(processed_review)

# Show only a preview: printing every sequence floods the cell output
print(seq[:N_PREVIEW])

# `start` is set at the top of the tokenizing/stemming cell, so this is the
# elapsed time of both cells together.
end = time.time()
print(end-start)

[[120, 1, 47, 121, 122, 6, 21, 123, 124, 125, 126, 127, 48, 128, 5, 129, 130, 131, 49, 132, 133, 5, 1], [50, 134, 51, 2, 135, 3, 136, 52, 1], [137, 53, 6, 1, 54, 138, 8, 139], [55, 140, 22, 56, 141, 23, 24, 142, 1, 24, 25, 57, 26, 7, 58, 23, 143, 144, 145, 146, 27, 47, 147, 148, 28, 149, 150, 151, 29, 152, 59, 153, 1], [3, 9, 10, 11, 4, 154, 1, 26, 7, 58, 155, 30, 31, 1, 156, 157, 158, 23, 8, 8, 60, 12, 159, 4, 160, 161, 61, 3, 162, 163, 62, 164, 1, 63, 165, 166, 3, 60, 4, 14, 2, 32, 64, 15, 16, 167, 3, 168, 33, 65, 169, 4, 5, 4, 15, 16, 66, 170, 171, 67, 2, 172, 68, 69, 173, 17, 174, 175, 70, 176, 66, 49, 18, 177, 2, 33, 71, 3, 69, 19, 72, 13, 73, 61, 68, 178, 179, 180, 181, 182, 74, 183, 75, 25, 184, 185, 64, 186, 62, 76, 187, 34, 73, 188, 13, 74, 35, 25, 77, 50, 36, 36, 59, 3, 189, 190, 191, 37, 78, 67, 192, 2, 32, 2, 27, 27, 193, 2, 79, 194, 3, 52, 195, 196, 197, 2, 38, 80, 198, 65, 199, 6, 1, 34, 77, 81, 2, 35, 30, 31], [3, 200, 201, 202, 9, 10, 11, 1, 203, 2, 204, 205, 206, 207, 

### General Notes

Keras tutorial using functional neural networks not sequential

In [26]:
x = [1,2,3,4,5]
y = [4,5,6,7,8]

# Union of both lists without duplicates. dict.fromkeys keeps first-seen order
# (all of x, then the unseen items of y), whereas converting a set back to a
# list gives no ordering guarantee.
z = list(dict.fromkeys(x + y))

# Last expression of the cell, so the result is displayed
z

[1, 2, 3, 4, 5, 6, 7, 8]
