In [103]:
from __future__ import print_function, division
from builtins import range

In [104]:
import ast
import glob, os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from nltk.tokenize import word_tokenize
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
pm = PorterStemmer()

In [105]:
# Raw input table, read from the working directory. extract_stems_lemma()
# reads its 'full_text' column when it is called with fresh_load=True.
final_input_df = pd.read_csv("final_input_df.csv")

In [106]:
# NOTE(review): within the visible notebook `train` is first assigned in the
# train/test split cell further down, so on a fresh kernel (Restart & Run All)
# this cell raises NameError. The output recorded here is identical to the one
# shown after the split cell -- consider deleting this cell.
train

Unnamed: 0,full_text,label
15264,art raptorx joschuaknuppe image rex island scute,MonsterVerse
27836,fav godzilla,MonsterVerse
35958,creamycumshotz batman riddler,DC Extended Universe
51126,johnxuandou wow series potter queer inclusivit...,Wizarding World
14670,shitfuckers war gansters,Star Wars
...,...,...
31348,mcu marvel fan fun drama,Marvel Cinematic Universe
28404,hack stay camera woman georgia mcdonalds film ...,Star Wars
4636,part memesmonday heard darth jar jar iron star...,Star Wars
25655,number people wandavision hawkeye anyone moon ...,Marvel Cinematic Universe


In [107]:
#train = pd.read_csv('r8-train-all-terms.txt', header=None, sep='\t')
#test = pd.read_csv('r8-test-all-terms.txt', header=None, sep='\t')
#train.columns = ['label', 'content']
#test.columns = ['label', 'content']

In [108]:
def clean_tokens(token_list):
    """Filter out noise tokens, keeping the survivors in their original order.

    A token is kept only when every one of these holds:
      * it is at least three characters long, or is exactly "dc";
      * it is made of more than one distinct character (drops "zzzzz", "kkk");
      * no word character occurs three or more times in a row (drops "aaaanndd");
      * it does not begin with a doubled word character (drops "aab").

    Parameters
    ----------
    token_list : iterable of str
        Candidate tokens.

    Returns
    -------
    list of str
        The tokens that passed all checks.
    """
    def is_noise_free(token):
        # Too short, unless it is the whitelisted two-letter token.
        if len(token) < 3 and token != "dc":
            return False
        # A single character repeated for the whole token.
        if len(set(token)) <= 1:
            return False
        # A run of three or more identical word characters anywhere.
        if re.search(r'((\w)\2{2,})', token):
            return False
        # A doubled word character at the very start.
        if re.search(r'(^(\w)\2{1,})', token):
            return False
        return True

    return [token for token in token_list if is_noise_free(token)]

In [109]:
_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the list only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_doc(txt, stem, lemma, stop_wrds, selected_tags):
    """Turn one raw document into a list of cleaned tokens.

    Pipeline, in order: lower-case, strip URLs, replace every character that is
    not an ASCII letter or a hyphen with a space, tokenize, drop noise tokens
    with clean_tokens(), then apply the optional steps selected by the flags.

    Parameters
    ----------
    txt : str
        Raw document text.
    stem : bool
        Apply the Porter stemmer (`pm`) to every token.
    lemma : bool
        Apply the WordNet lemmatizer (`lm`) to every token.
    stop_wrds : bool
        Remove NLTK English stop words.
    selected_tags : sequence of str
        POS tags to keep. When non-empty, tokens are tagged with pos_tag() and
        only those whose tag is in this collection survive. Tagging runs after
        stemming/lemmatizing, i.e. on the already-normalised tokens.

    Returns
    -------
    list of str
        The processed tokens. If any step raises (for example `txt` is NaN and
        has no .lower()), the offending input and the exception are printed and
        an empty list is returned, so one bad row does not abort a batch.
    """
    try:
        txt = txt.lower()
        txt = re.sub(r'http\S+', '', txt) #remove URLs
        txt = re.sub('[^a-zA-Z-]', ' ', txt ) #removing punctuations numbers
        final_wrd_tkn = clean_tokens(word_tokenize(txt))
        if stop_wrds:
            # The stop-word set is built once and reused; it used to be rebuilt
            # from the corpus file for every single token.
            english_stopwords = _english_stopwords()
            final_wrd_tkn = [word for word in final_wrd_tkn if word not in english_stopwords]
        if stem:
            final_wrd_tkn = [pm.stem(word) for word in final_wrd_tkn]
        if lemma:
            final_wrd_tkn = [lm.lemmatize(word) for word in final_wrd_tkn]
        if len(selected_tags) > 0:
            tagged = pos_tag(final_wrd_tkn)
            final_wrd_tkn = [word for word, tag in tagged if tag in selected_tags]
        return final_wrd_tkn
    except Exception as e:
        # Deliberate best-effort: report the bad document and carry on.
        print(txt)
        print("Exception Caught: ", e)
        return []

In [110]:
def extract_stems_lemma(tags, final_input_df, fresh_load):
    """Build, or load from cache, the stemmed and lemmatized token columns.

    Parameters
    ----------
    tags : sequence of str
        POS tags forwarded to preprocess_doc() as `selected_tags`.
    final_input_df : pandas.DataFrame
        Frame with a 'full_text' column. Only used when `fresh_load` is True;
        in that case the two token columns are added to this frame in place.
    fresh_load : bool
        True  -> run preprocess_doc() over every row twice (stem, then lemma),
                 add 'stem_cleaned_tokens' and 'lemma_cleaned_tokens', and write
                 the frame to final_input_cleaned_stem_lemma.csv.
        False -> skip the computation and read that CSV back instead.

    Returns
    -------
    pandas.DataFrame
        The frame carrying both token columns. In the cached case the token
        columns hold the CSV's string form of each list, not list objects.
        The function used to return None, so callers that ignore the result
        are unaffected; in a notebook, end the call with `;` or assign the
        result to avoid echoing the whole frame.
    """
    cache_path = "final_input_cleaned_stem_lemma.csv"

    if not fresh_load:
        final_input_df = pd.read_csv(cache_path)
        print("Stems and Lemma loaded from final_input_cleaned_stem_lemma.csv")
        return final_input_df

    texts = final_input_df['full_text']
    final_input_df['stem_cleaned_tokens'] = [
        preprocess_doc(txt = text, stem = True, lemma = False, stop_wrds = True, selected_tags = tags)
        for text in tqdm(texts)
    ]
    final_input_df['lemma_cleaned_tokens'] = [
        preprocess_doc(txt = text, stem = False, lemma = True, stop_wrds = True, selected_tags = tags)
        for text in tqdm(texts)
    ]
    final_input_df.to_csv(cache_path, index = False)
    print("Stems and Lemma extracted to final_input_cleaned_stem_lemma.csv")
    return final_input_df

In [111]:
# Keep nouns only: singular, proper singular, proper plural, plural.
selected_tags = ['NN','NNP','NNPS','NNS']
# NOTE(review): the progress bars recorded under this cell can only come from
# the fresh_load=True path (the only branch that uses tqdm); with
# fresh_load=False the function just reads the cached CSV.
extract_stems_lemma(tags = selected_tags, final_input_df = final_input_df, fresh_load = False);

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51462/51462 [06:52<00:00, 124.72it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51462/51462 [06:43<00:00, 127.51it/s]


In [112]:
# Reload the cached token columns and attach an integer code per label.
# The codes come from pandas' categorical encoding of the 'label' column.
final_input_df = (
    pd.read_csv("final_input_cleaned_stem_lemma.csv")
    .assign(label_id = lambda frame: frame['label'].astype("category").cat.codes)
)
final_input_df

Unnamed: 0,full_text,label,stem_cleaned_tokens,lemma_cleaned_tokens,label_id
0,‘#GuardiansoftheGalaxy 3’ Star #KarenGillan Sa...,Marvel Cinematic Universe,"['guardiansofthegalaxi', 'star', 'karengillan'...","['guardiansofthegalaxy', 'star', 'karengillan'...",1
1,My friend recommend me a few shows and I have ...,DC Extended Universe,"['friend', 'show', 'idea', 'flash', 'guy', 'sh...","['friend', 'show', 'idea', 'flash', 'guy', 'sh...",0
2,@Papa__Drago Godzilla would win easily but jus...,MonsterVerse,"['papa', 'drago', 'godzilla', 'hundr', 'titan'...","['papa', 'drago', 'godzilla', 'titan', 'omg']",2
3,Batman &amp; Robin...It's so camp https://t.co...,DC Extended Universe,"['batman', 'robin', 'camp']","['batman', 'robin', 'camp']",0
4,why is huge ant man moving so slow,Marvel Cinematic Universe,"['ant', 'man', 'move']","['ant', 'man']",1
...,...,...,...,...,...
51457,@tortoisethatwon @aightmoe @Sarcasm_bender End...,DC Extended Universe,"['tortoisethatwon', 'aightmo', 'sarcasm', 'ben...","['tortoisethatwon', 'aightmoe', 'sarcasm', 'be...",0
51458,@iHrtProngs SIRIUS BLACK DOES NOT HAVE A BAD H...,Wizarding World,"['hair', 'day']","['ihrtprongs', 'hair', 'day']",4
51459,@aNorthernGarden @fiona_skywalker Agreed. She ...,Star Wars,"['fiona', 'skywalk', 'liter', 'ask', 'comment'...","['fiona', 'skywalker', 'input', 'earns', 'hous...",3
51460,And so my long Marvel-watching journey comes t...,Marvel Cinematic Universe,"['marvel-watch', 'journey', 'caught', 'endless...","['journey', 'end', 'franchise', 'eon', 'rest',...",1


In [113]:
# Stemmed tokens plus both label representations.
df_vectorization = final_input_df.loc[:, ['stem_cleaned_tokens', 'label', 'label_id']]
df_vectorization.head()

Unnamed: 0,stem_cleaned_tokens,label,label_id
0,"['guardiansofthegalaxi', 'star', 'karengillan'...",Marvel Cinematic Universe,1
1,"['friend', 'show', 'idea', 'flash', 'guy', 'sh...",DC Extended Universe,0
2,"['papa', 'drago', 'godzilla', 'hundr', 'titan'...",MonsterVerse,2
3,"['batman', 'robin', 'camp']",DC Extended Universe,0
4,"['ant', 'man', 'move']",Marvel Cinematic Universe,1


In [114]:
dfvg = df_vectorization.groupby(['label', 'label_id'])
# Every group key is a (label, label_id) pair; flip it into an id -> name lookup.
classes = {label_id: label for (label, label_id), _ in dfvg}
print(classes)

{0: 'DC Extended Universe', 1: 'Marvel Cinematic Universe', 2: 'MonsterVerse', 3: 'Star Wars', 4: 'Wizarding World'}


In [115]:
# Lemmatized tokens plus both label representations (input for the embedding models).
df_embedding = final_input_df.loc[:, ['lemma_cleaned_tokens', 'label', 'label_id']]
df_embedding.head()

Unnamed: 0,lemma_cleaned_tokens,label,label_id
0,"['guardiansofthegalaxy', 'star', 'karengillan'...",Marvel Cinematic Universe,1
1,"['friend', 'show', 'idea', 'flash', 'guy', 'sh...",DC Extended Universe,0
2,"['papa', 'drago', 'godzilla', 'titan', 'omg']",MonsterVerse,2
3,"['batman', 'robin', 'camp']",DC Extended Universe,0
4,"['ant', 'man']",Marvel Cinematic Universe,1


In [116]:
# Row count of the embedding frame (one row per document).
len(df_embedding)

51462

In [117]:
%%time
# The token columns were read back from CSV, so every cell holds the text form
# of a Python list (e.g. "['batman', 'robin']") rather than a list object.
# ast.literal_eval parses such a literal without executing anything; the
# earlier exec("lst = " + cell) would have run any code present in the CSV.
# Each parsed list is filtered through clean_tokens() again and joined into a
# single space-separated string per document.
list_stemmed_data = [
    " ".join(clean_tokens(ast.literal_eval(serialized_tokens)))
    for serialized_tokens in df_vectorization['stem_cleaned_tokens']
]
list_embedding_data = [
    " ".join(clean_tokens(ast.literal_eval(serialized_tokens)))
    for serialized_tokens in df_embedding['lemma_cleaned_tokens']
]

Wall time: 7.47 s


In [118]:
# Features are the joined lemma strings; targets are the label names.
X = pd.Series(list_embedding_data)
y = df_embedding['label']

# 70/30 stratified split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, shuffle = True, stratify = y, random_state = 3
)

# Re-assemble each side as a two-column frame: full_text, label.
train = pd.concat([X_train.rename('full_text'), y_train.rename('label')], axis = 1)
test = pd.concat([X_test.rename('full_text'), y_test.rename('label')], axis = 1)

In [119]:
# Show the training frame built by the split above (columns: full_text, label).
train

Unnamed: 0,full_text,label
15264,art raptorx joschuaknuppe image rex island scute,MonsterVerse
27836,fav godzilla,MonsterVerse
35958,creamycumshotz batman riddler,DC Extended Universe
51126,johnxuandou wow series potter queer inclusivit...,Wizarding World
14670,shitfuckers war gansters,Star Wars
...,...,...
31348,mcu marvel fan fun drama,Marvel Cinematic Universe
28404,hack stay camera woman georgia mcdonalds film ...,Star Wars
4636,part memesmonday heard darth jar jar iron star...,Star Wars
25655,number people wandavision hawkeye anyone moon ...,Marvel Cinematic Universe


In [120]:
class GloveVectorizer:
    """Represent a sentence as the mean of the GloVe vectors of its words.

    Attributes set by __init__:
        word2vec  -- dict mapping word -> float32 vector
        embedding -- array of shape (V, D) holding all vectors in file order
        word2idx  -- dict mapping word -> row index into `embedding`
        V, D      -- vocabulary size and vector dimensionality
    """

    def __init__(self, path=r'G:\spark_big_files\glove.6B\glove.6B.50d.txt'):
        """Load pre-trained vectors from a GloVe text file.

        Parameters
        ----------
        path : str, optional
            Location of the vectors file. Each line is
            "word v0 v1 v2 ...", separated by whitespace. The default is the
            original hard-coded location, now written as a raw string so the
            backslashes are not treated as (invalid) escape sequences.
        """
        # load in pre-trained word vectors
        print('Loading word vectors...')
        word2vec = {}
        embedding = []
        idx2word = []
        with open(path, encoding="utf-8") as f:
            for line in f:
                values = line.split()
                if not values:
                    # tolerate blank lines (e.g. a trailing newline at EOF)
                    continue
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)
        print('Found %s word vectors.' % len(word2vec))

        # save for later
        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {v:k for k,v in enumerate(idx2word)}
        self.V, self.D = self.embedding.shape

    def fit(self, data):
        """No-op: the vectors are pre-trained, nothing is learned from `data`."""
        pass

    def transform(self, data):
        """Average the known word vectors of every sentence.

        Parameters
        ----------
        data : sized iterable of str
            Sentences; each is lower-cased and split on whitespace.

        Returns
        -------
        numpy.ndarray of shape (len(data), D)
            Row i is the mean vector of the in-vocabulary words of sentence i,
            or all zeros when none of its words is known. The number of such
            all-zero rows is printed.
        """
        X = np.zeros((len(data), self.D))
        emptycount = 0
        for n, sentence in enumerate(data):
            vecs = [
                self.word2vec[word]
                for word in sentence.lower().split()
                if word in self.word2vec
            ]
            if len(vecs) > 0:
                X[n] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1
        print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        """Call fit() then transform(); provided for scikit-learn-style usage."""
        self.fit(data)
        return self.transform(data)

In [121]:
class Word2VecVectorizer:
    """Represent a sentence as the mean of the word2vec vectors of its words.

    Attributes:
        word_vectors -- gensim KeyedVectors loaded in __init__
        D            -- vector dimensionality, set on the first transform() call
    """

    def __init__(self, path=r'G:\spark_big_files\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin'):
        """Load pre-trained vectors stored in the binary word2vec format.

        Parameters
        ----------
        path : str, optional
            Location of the binary vectors file. The default is the original
            hard-coded location, now written as a raw string so the backslashes
            are not treated as (invalid) escape sequences.
        """
        print("Loading in word vectors...")
        self.word_vectors = KeyedVectors.load_word2vec_format(
            path, encoding="utf-8",
            binary=True
        )
        print("Finished loading in word vectors")

    def fit(self, data):
        """No-op: the vectors are pre-trained, nothing is learned from `data`."""
        pass

    def transform(self, data):
        """Average the known word vectors of every sentence.

        Parameters
        ----------
        data : sized iterable of str
            Sentences; each is split on whitespace. No lower-casing is applied
            here, so tokens are looked up exactly as given.

        Returns
        -------
        numpy.ndarray of shape (len(data), D)
            Row i is the mean vector of the in-vocabulary words of sentence i,
            or all zeros when none of its words is known. The number of such
            all-zero rows is printed.
        """
        # Ask the model for its dimensionality instead of probing a specific
        # word, which would raise KeyError on a model that lacks that word.
        self.D = self.word_vectors.vector_size

        X = np.zeros((len(data), self.D))
        emptycount = 0
        for n, sentence in enumerate(data):
            vecs = []
            for word in sentence.split():
                try:
                    # throws KeyError if word not found
                    vecs.append(self.word_vectors.get_vector(word))
                except KeyError:
                    # out-of-vocabulary word: leave it out of the average
                    pass
            if len(vecs) > 0:
                X[n] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1
        print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X


    def fit_transform(self, data):
        """Call fit() then transform(); provided for scikit-learn-style usage."""
        self.fit(data)
        return self.transform(data)

In [122]:
# Mean-of-GloVe features for the tweet train/test split; targets are label names.
vectorizer = GloveVectorizer()
Xtrain, Ytrain = vectorizer.fit_transform(train['full_text']), train['label']
Xtest, Ytest = vectorizer.transform(test['full_text']), test['label']

Loading word vectors...
Found 400000 word vectors.
Numer of samples with no words found: 302 / 36023
Numer of samples with no words found: 123 / 15439


In [123]:
%%time
# create the model, train it, print scores
# random_state fixes the forest's internal randomness so that the scores below
# can be reproduced on a re-run; without it every run gives slightly different numbers.
model = RandomForestClassifier(n_estimators=200, random_state=3)
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

train score: 0.9857590983538295
test score: 0.7575620182654317
Wall time: 50.7 s


In [124]:
%%time
# Mean-of-word2vec features for the same tweet train/test split.
vectorizer_wv = Word2VecVectorizer()
Xtrain_wv, Ytrain_wv = vectorizer_wv.fit_transform(train['full_text']), train['label']
Xtest_wv, Ytest_wv = vectorizer_wv.transform(test['full_text']), test['label']

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 627 / 36023
Numer of samples with no words found: 287 / 15439


In [125]:
%%time
# create the model, train it, print scores
# random_state fixes the forest's internal randomness so that the scores below
# can be reproduced on a re-run; without it every run gives slightly different numbers.
model_wv = RandomForestClassifier(n_estimators=200, random_state=3)
model_wv.fit(Xtrain_wv, Ytrain_wv)
print("train score:", model_wv.score(Xtrain_wv, Ytrain_wv))
print("test score:", model_wv.score(Xtest_wv, Ytest_wv))

train score: 0.9752935624462149
test score: 0.7699980568689682
Wall time: 1min 58s


In [126]:
# Candidate review files in the working directory, ordered by modification
# time, oldest first.
list_f = sorted(glob.glob("df_rottentomatoes_reviews_predicted*"), key=os.path.getmtime)
if not list_f:
    # Fail with a message that names the missing input instead of a bare IndexError.
    raise FileNotFoundError(
        "No file matching 'df_rottentomatoes_reviews_predicted*' found in " + os.getcwd()
    )
# NOTE(review): index 0 selects the OLDEST matching file. If the most recent
# predictions are wanted, this should be list_f[-1] -- confirm the intent.
output_file_name = list_f[0]
df_rottentomatoes_reviews = pd.read_csv(output_file_name)

In [127]:
%%time
# Same id -> franchise mapping that was printed earlier from the groupby.
classes = {0: 'DC Extended Universe', 1: 'Marvel Cinematic Universe', 2: 'MonsterVerse', 3: 'Star Wars', 4: 'Wizarding World'}

# Run every review through the lemma pipeline used for the training text
# (stop words removed, lemmatized, filtered to the selected POS tags).
X_valid_emb = [
    preprocess_doc(review, stem = False, lemma = True, stop_wrds = True, selected_tags = selected_tags)
    for review in df_rottentomatoes_reviews["review"]
]

Wall time: 10.3 s


In [128]:
# Join each token list into one space-separated string. Entries that are
# already strings are left untouched, so re-running this cell is harmless;
# previously a second run would have joined the individual characters.
X_valid_emb = [doc if isinstance(doc, str) else " ".join(doc) for doc in X_valid_emb]

In [129]:
# Peek at the first two preprocessed reviews.
X_valid_emb[:2]

['godzilla monster thing people love see child godzilla improvement term sheer entertainment movie cgi action story fun dialogue fun set-pieces budget make godzilla fan movie movie movie godzilla character movie lizard intelligence rodan mothra ghidorah head personality thing godzilla film fun fan showa heisei godzilla movie',
 'movie horror maestro comedy script way movie opinion fan boy']

In [130]:
# Vectorize the review texts with the word2vec vectorizer.
# NOTE(review): this reassigns Xtest_wv / Ytest_wv, which previously held the
# tweet test split. After this cell runs, re-executing the model_wv scoring
# cell would report the score on the reviews instead. Distinct names (e.g.
# Xvalid_wv / Yvalid_wv) would avoid the clash, but need a matching change in
# the final scoring cell.
X_valid = pd.Series(X_valid_emb)
Xtest_wv = vectorizer_wv.transform(X_valid)
# presumably 'label_name' uses the same franchise names as the training labels -- TODO confirm
Ytest_wv = df_rottentomatoes_reviews['label_name']

Numer of samples with no words found: 0 / 265


In [132]:
# Vectorize the same review texts with the GloVe vectorizer.
Xtest_gv = vectorizer.transform(X_valid)
Ytest_gv = df_rottentomatoes_reviews['label_name']

Numer of samples with no words found: 0 / 265


In [133]:
# Accuracy of both forests on the review set. Xtest_wv / Ytest_wv here are the
# review features and labels assigned in the validation cell above, not the
# tweet test split.
print("word vector test score:", model_wv.score(Xtest_wv, Ytest_wv))
print("Glove Vectorizer test score:", model.score(Xtest_gv, Ytest_gv))

word vector test score: 0.7471698113207547
Glove Vectorizer test score: 0.6490566037735849
