The first step in our code is to load packages:

In [1]:
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import fasttext # for learning on text data
from tqdm import tqdm # for loops monitoring
import re # for regular expressions
import string
from nltk.corpus import PlaintextCorpusReader
from sklearn.utils import shuffle

# Multiclass classification

## Loading Data

In [2]:
def sorter(paths): # функция сортирующая отзывы в папках в словарь reviews
    reviews = {'1-2':[], "3-4":[], "7-8":[], "9-10":[]}
    for path in paths:
        if path == 'neg_train' or path == 'neg':
            for rating in range(1, 5):
                corpus = PlaintextCorpusReader(path, '.*\_{}.txt'.format(rating), encoding='utf-8')
                names = corpus.fileids()
                for name in names:
                    file = open(path + f"/{name}")
                    text = file.read()
                    if rating == 1 or rating == 2:
                        reviews["1-2"].append(text)
                    if rating == 3 or rating == 4:
                        reviews["3-4"].append(text)
        elif path == "pos_train" or path == 'pos':
            for rating in range(7, 11):
                corpus = PlaintextCorpusReader(path, '.*\_{}.txt'.format(rating), encoding='utf-8')
                names = corpus.fileids()
                for name in names:
                    file = open(path + f"/{name}")
                    text = file.read()
                    if rating == 7 or rating == 8:
                        reviews["7-8"].append(text)
                    if rating == 9 or rating == 10:
                        reviews["9-10"].append(text)
                    
    return reviews

In [3]:
paths = ['train_neg', 'train_neg', 'pos', 'neg']
reviews_rating = sorter(paths)

## Data Preprocess

In [4]:
df_rating = shuffle(pd.DataFrame([(key, var) for (key, L) in reviews_rating.items() for var in L], 
                 columns=['rating', 'review']))

In [5]:
df_rating.to_csv('df_rating_doubled.csv')

In [6]:
import re
def preprocessor(text):
    text =re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

preprocessor("This is a :) test :-( !")

'this is a test :) :('

In [7]:
df_rating['preprocessed_review'] = df_rating.review.apply(preprocessor)

In [8]:
data_text_fast = df_rating.apply(lambda x: '__label__' + str(x['rating']) + ' ' + ''.join([str(x['review'])]), axis = 1)

In [9]:
data_text_fast

15601    __label__7-8 **SPOILER ALERT** W. Somerset Mau...
307      __label__1-2 There is a remark that one of her...
12962    __label__7-8 This 1947 film stars and was dire...
14912    __label__7-8 As someone else commented, this i...
23341    __label__9-10 Best Stephen King film alongside...
                               ...                        
19758    __label__9-10 Just finished watching the movie...
13640    __label__7-8 I like this episode quite a bit, ...
15056    __label__7-8 I'm grateful to Cesar Montano and...
1102     __label__1-2 Oh, brother...after hearing about...
23288    __label__9-10 I have to say that this is one o...
Length: 25000, dtype: object

In [10]:
np.savetxt('data_fast_text.txt', data_text_fast, delimiter = ' ', fmt = '%s')

In [11]:
test = data_text_fast[:int(len(data_text_fast)*.1)]
train = data_text_fast[int(len(data_text_fast)*.1):]
np.savetxt('train_fasttext.txt', train, delimiter = ' ', fmt = '%s')
np.savetxt('test_fasttext.txt', test, delimiter = ' ', fmt = '%s')

In [12]:
train

21395    __label__9-10 I've watched this documentary tw...
16544    __label__7-8 "Anchors Aweigh" is the product o...
6598     __label__1-2 One of the worst movies I've ever...
10052    __label__3-4 Well, no, not really. Its not rea...
15723    __label__7-8 Faithful adaptation of witty and ...
                               ...                        
19758    __label__9-10 Just finished watching the movie...
13640    __label__7-8 I like this episode quite a bit, ...
15056    __label__7-8 I'm grateful to Cesar Montano and...
1102     __label__1-2 Oh, brother...after hearing about...
23288    __label__9-10 I have to say that this is one o...
Length: 22500, dtype: object

## Algorithm Training

In [13]:
model = fasttext.train_supervised(input='train_fasttext.txt', epoch=15, wordNgrams=2, bucket=300000, dim=13, loss='ova')    
print(model.test("test_fasttext.txt"))

(2500, 0.63, 0.63)


In [14]:
model.save_model("model_review_fasttext.bin")

## Model Evaluation

In [15]:
review_5 = "Good production, quite intense, very interesting reenactment of a battle at sea, although I am not sure how realistic and accurate it is (those submarines liked to fight a lot in plain sight over the water, right?).My main issue? It was like a documentary, lots of battle time but no much human drama. No time given to develop any characters or even make us feel something about these people. Ships were sinking in the middle of the Atlantic and we never had a look in the horror of trying to survive this. Even Hank's character felt flat. The few scenes with his love interest were rather awkward and didn't contribute much. At one point we were wondering if the captain was in the spectrum or something..Still, if you are a fan of WWII movies I suggest to watch it for the unique perspective of a battle at sea."
review_5 = preprocessor(review_5)
print(model.predict(review_5))

review = "Very bad and disgusting film!"
review = preprocessor(review)
print(model.predict(review))

review_10 = "This movie may not be for everyone, but as a Navy veteran who has stood watch in CIC this movie is very realistic. I loved that there wasn't a lot of back story. It was more about how lonely and arduous the job of a Navy captain is. Must watch!"
review_10 = preprocessor(review_10)
print(model.predict(review_10))

review_6 = "It was just OK which is reflected in my 6 star rating, which I think is generous. They made little effort at any sort of character development. Hanks has some sort of love interest that is hinted at, but that's it. I thought they were going to build some sort of bond between the captain and his Black cook, but that never got off the ground. We know almost nothing more about the man at the end than in the beginning.It's a difficult challenge to portray on film the battle between a destroyer and a submarine. This movie never really pulls that off, it's like listening to one side of a telephone conversation. The German U-boats never seem to be part of the narrative. They try to bring the U-boats in with their radio broadcasts which come across more as obscene phone calls than viable dialogue.Almost exactly half of the dialogue is sailors repeating orders. It got very tedious, very quickly. I thought they went too far in the whole \"navy talk\" department.I found the U-boat attack theme music to be mostly bothersome and heavy-handed, like death wail of a fat man, or a runner-up in Dumb and Dumber's most annoying sound in the world.The sea burial onboard was a moving tribute to our military dead. I couldn't imagine a better resting place than the open sea. I was air force and not really sure what we did. Threw bodies out the back of a C130? That'd be cool with me."
review_6 = preprocessor(review_6)
print(model.predict(review_6))

review_1 = "A typical US movie boring would a German u-boat contact a us war ship ??? and all the rest again private ( hanks) movie not worth a second to watch ok maybe now with the virus nothing better on"
review_1 = preprocessor(review_1)
print(model.predict(review_1)[0][0][9:13])

review_2 = "This movie was very disappointing, I really wanted to like it. But it was monotonous with 99% of the movie on the fighter ship fighting with a fake sounding loud sound effects. Gave me a headache, take a couple of aspirins if planing to watch it."
review_2 = preprocessor(review_2)
print(model.predict(review_2)[0][0][9:13])

review_b = "movie is disgusting"
review_b = preprocessor(review_b)
print(model.predict(review_b)[0][0][9:13])

(('__label__3-4',), array([0.32767832]))
(('__label__1-2',), array([0.99938369]))
(('__label__9-10',), array([0.9659096]))
(('__label__3-4',), array([0.32767832]))
1-2
3-4
1-2


In [16]:
model.test("test_fasttext.txt", k=4)

(2500, 0.25, 1.0)

# Binary Classification

## Loading Data

In [17]:
def sorter_sentiment(paths):
    reviews_sentiment = {'__label__0': [], '__label__1': []}
    for path in paths:
        if path == 'neg_train' or path == 'neg':
            for rating in range(1, 5):
                corpus = PlaintextCorpusReader(path, '.*\_{}.txt'.format(rating), encoding='utf-8')
                names = corpus.fileids()
                for name in names:
                    file = open(path + f"/{name}")
                    text = file.read()
                    reviews_sentiment['__label__0'].append(text)
        elif path == "pos_train" or path == 'pos':
            for rating in range(7, 11):
                corpus = PlaintextCorpusReader(path, '.*\_{}.txt'.format(rating), encoding='utf-8')
                names = corpus.fileids()
                for name in names:
                    file = open(path + f"/{name}")
                    text = file.read()
                    reviews_sentiment['__label__1'].append(text)
    return reviews_sentiment

In [19]:
paths = ['train_neg', 'train_pos', 'pos', 'neg']
reviews_sentiment = sorter_sentiment(paths)

## Data Preprocess

In [20]:
df_sentiment = shuffle(pd.DataFrame([(key, var) for (key, L) in reviews_sentiment.items() for var in L], 
                 columns=['sentiment', 'review']))

In [21]:
df_sentiment

Unnamed: 0,sentiment,review
2941,__label__0,Ok so I was bored and I watched it all the way...
9986,__label__0,When my wife and I decided to watch this movie...
10377,__label__0,"Ben Masters,(Kyd Thomas),""Dream Lover"",'86 pla..."
19533,__label__1,This is a really great short from Hal Roach. T...
6707,__label__0,I put this movie on in the hotel room to enter...
...,...,...
21549,__label__1,this movie was rather awful Vipul Shah's last ...
11087,__label__0,"I'm a big mark for the music of Neil Young, an..."
163,__label__0,This show is a perfect example of how the CBC ...
2915,__label__0,This movie is about a female rape victim/comic...


In [22]:
df_sentiment.to_csv('df_sentiment.csv')

In [23]:
df_sentiment['preprocessed_review'] = df_sentiment.review.apply(preprocessor)

In [24]:
df_sentiment

Unnamed: 0,sentiment,review,preprocessed_review
2941,__label__0,Ok so I was bored and I watched it all the way...,ok so i was bored and i watched it all the way...
9986,__label__0,When my wife and I decided to watch this movie...,when my wife and i decided to watch this movie...
10377,__label__0,"Ben Masters,(Kyd Thomas),""Dream Lover"",'86 pla...",ben masters kyd thomas dream lover 86 plays a ...
19533,__label__1,This is a really great short from Hal Roach. T...,this is a really great short from hal roach th...
6707,__label__0,I put this movie on in the hotel room to enter...,i put this movie on in the hotel room to enter...
...,...,...,...
21549,__label__1,this movie was rather awful Vipul Shah's last ...,this movie was rather awful vipul shah s last ...
11087,__label__0,"I'm a big mark for the music of Neil Young, an...",i m a big mark for the music of neil young and...
163,__label__0,This show is a perfect example of how the CBC ...,this show is a perfect example of how the cbc ...
2915,__label__0,This movie is about a female rape victim/comic...,this movie is about a female rape victim comic...


In [25]:
data_text_fast_sentiment = df_sentiment.apply(lambda x: x['sentiment'] + ' ' + ''.join([str(x['preprocessed_review'])]), axis = 1)

In [26]:
data_text_fast_sentiment

2941     __label__0 ok so i was bored and i watched it ...
9986     __label__0 when my wife and i decided to watch...
10377    __label__0 ben masters kyd thomas dream lover ...
19533    __label__1 this is a really great short from h...
6707     __label__0 i put this movie on in the hotel ro...
                               ...                        
21549    __label__1 this movie was rather awful vipul s...
11087    __label__0 i m a big mark for the music of nei...
163      __label__0 this show is a perfect example of h...
2915     __label__0 this movie is about a female rape v...
12247    __label__0 i rented this on dvd yesterday and ...
Length: 25000, dtype: object

In [27]:
np.savetxt('data_fast_text_sentiment.txt', data_text_fast_sentiment, delimiter = ' ', fmt = '%s')

In [28]:
test_sentiment = data_text_fast_sentiment[:int(len(data_text_fast_sentiment)*.1)]
train_sentiment = data_text_fast_sentiment[int(len(data_text_fast_sentiment)*.1):]
np.savetxt('train_fasttext_sentiment.txt', train_sentiment, delimiter = ' ', fmt = '%s')
np.savetxt('test_fasttext_sentiment.txt', test_sentiment, delimiter = ' ', fmt = '%s')

In [29]:
test_sentiment

2941     __label__0 ok so i was bored and i watched it ...
9986     __label__0 when my wife and i decided to watch...
10377    __label__0 ben masters kyd thomas dream lover ...
19533    __label__1 this is a really great short from h...
6707     __label__0 i put this movie on in the hotel ro...
                               ...                        
10872    __label__0 awesomely improbable and foolish po...
17942    __label__1 i enjoyed it in general i m not a f...
11770    __label__0 damp telling of the american revolu...
16718    __label__1 jazz aficionados will treasure this...
8875     __label__0 they re showing this on some off ne...
Length: 2500, dtype: object

In [30]:
train_sentiment

289      __label__0 this is a piece of s t this looks w...
24651    __label__1 who ever came up with story is one ...
15429    __label__1 this movie is awesome for three mai...
23056    __label__1 if ever anyone queries whether cine...
6967     __label__0 to anyone not familiar with c s for...
                               ...                        
21549    __label__1 this movie was rather awful vipul s...
11087    __label__0 i m a big mark for the music of nei...
163      __label__0 this show is a perfect example of h...
2915     __label__0 this movie is about a female rape v...
12247    __label__0 i rented this on dvd yesterday and ...
Length: 22500, dtype: object

## Algorithm Training

In [31]:
model_sentiment = fasttext.train_supervised(input='train_fasttext_sentiment.txt', epoch=26, wordNgrams=2, bucket=200000, dim=17, loss='ova')    
print(model_sentiment.test("test_fasttext_sentiment.txt"))

(2500, 0.8988, 0.8988)


In [32]:
model_sentiment.save_model("model_review_fasttext_sentiment.bin")

## Model Evaluation

In [33]:
rev_pos = 'What an excellent film by Rian Johnson; definitely feels like the film he was destined to make. Writing that is slick as hell, sublime performances (most notably Daniel Craig who brings his A-game in a wonderfully charismatic turn), superb editing and wonderfully atmospheric music - all tied together by masterful direction. Will probably be among the most fun you have at a theatre this year and fans of Agatha Christie and old murder mystery stories will have plenty to love here - a nostalgically entertaining time!'
rev_pos = preprocessor(rev_pos)
print(model_sentiment.predict(rev_pos))

rev_neg_1 = "Although this show is very highly rated I couldn't force myself to keep watching after a few episodes simply because of the way the actors talk. I wouldn't even say the acting is bad, the characters just can't keep up with all this dark, cold and mysterious plot. The people are all equally dark, cold and mysterious and that's just not the way people are or at least behave. For Non-German audience it is probably easier to get into but the show is set in Germany in German language. And no one here talks that way, like an emotionless robot (maybe that's the way we are seen by the world but it's not true). So every dialogue of the show reminded me just how different and stylish it wants to be, completely forgetting human depth or even basic human feelings (when you begin to ask yourself if there is any character capable of loving, even if it's just their family members, you know there is something wrong)."
rev_neg_1 = preprocessor(rev_neg_1)
print(model_sentiment.predict(rev_neg_1))

rev_pos_1 = "This feedback is currently based on the first 10 episodes of season one (still watching the series as we speak), and to be honest it made me downgrade my rating for 'Stranger Things' quite a bit. It's clear that this series is the child of a genius - everything just works. The acting is great, the cinematography is great, the soundtrack is awesome (reminds me of Klaus Schulze at times), and the story-line is mind-boggling. This series is just miles above anything that leaves Hollywood. Wish that all productions could be like this. Thanks Netflix for bringing this awesome series to me screen. Say bye-bye to public television!"
rev_pos_1 = preprocessor(rev_pos_1)
print(model_sentiment.predict(rev_pos_1))

rev_pos_2 = "This movie is good"
rev_pos_2 = preprocessor(rev_pos_2)
print(model_sentiment.predict(rev_pos_2))

rev_neg_2 = "This is truly the most garbage movie I've ever seen. As a film student, I have no idea what this director was doing as he put this maseceure together. The shots are random and unmotivated, even for a comedy. I love comedies and Andy Samberg, but the writing is so awful. I don't see how a writer could be happy showing this to anyone, and the producers must of been so eager for a script that they couldn't recognize how terrible it is. The editing and sound mixing look like they've been done by amateurs. Besides the few creative cinematic shots and the three or four decent lines of dialogue, there's absolutely nothing good about this film and anyone who things this is great must have a very low intellect and appreciation of good films."
rev_neg_2 = preprocessor(rev_neg_2)
print(model_sentiment.predict(rev_neg_2))

(('__label__1',), array([1.00001001]))
(('__label__0',), array([0.91491097]))
(('__label__1',), array([1.00001001]))
(('__label__1',), array([1.00001001]))
(('__label__0',), array([1.00001001]))
