In [None]:
# First, load the data for sentiment task and prepare the data.
import pandas as pd
import numpy as np

data = pd.read_csv('https://github.com/mbburova/MDS/raw/main/sentiment.csv', index_col=0)
data.head()

In [None]:
import re
tag_regexp = re.compile("<[^>]*>")
regex = re.compile("[A-Za-z-]+")

def words_only(text, regex=regex):
    text = re.sub(tag_regexp, '', text)
    text = re.sub('\s+', ' ',text)
    text = re.sub(r'\\','', text)
    text = text.lower().strip()
    try:
        return " ".join(regex.findall(text))
    except:
        return ""

data['cleaned_review'] = data['review'].apply(words_only)
data['tokenized'] = data['cleaned_review'].apply(lambda x: x.split())
data.head()


In [None]:
# Split the data on train and test
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

X_train, X_test, y_train, y_test = train_test_split(data.tokenized,data.sentiment, test_size=0.2, random_state = 5)
X_train[:5]

In [None]:
#!pip install nltk

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

In [None]:
data.head()

In [None]:
DICT_SIZE = 500
from collections import Counter

counter = Counter(X_train.sum())
for word in list(counter):
    if word in STOPWORDS:
        del counter[word]

In [None]:
words_counts =  counter
WORDS_TO_INDEX = [word[0] for word in counter.most_common(DICT_SIZE)]

def BoW(words, words_to_index, dict_size):
    """
        words: a list of words
        dict_size: size of the dictionary
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.array([0 for i in range(dict_size)])
    wti = np.array(words_to_index)
    
    for word in words:
        ind = np.where(wti == word)
        if len (ind) == 1 :
            result_vector[ind[0]] +=1
    return result_vector

In [None]:
from scipy import sparse as sp_sparse
X_train_bow = sp_sparse.vstack([sp_sparse.csr_matrix(BoW(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_test_bow = sp_sparse.vstack([sp_sparse.csr_matrix(BoW(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_bow.shape)
print('X_test shape ', X_test_bow.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 300, random_state=5, max_depth = 5)

rfc = rfc.fit(X_train_bow, y_train)

pred = rfc.predict(X_test_bow)

accuracy = accuracy_score(y_test, pred)
accuracy

In [None]:
X_test_bow

In [None]:
X_test

In [None]:
X_sample = [['i','must','say','it','s', 'perfect'],['i','feel','horrible']]
X_sample_bow = sp_sparse.vstack([sp_sparse.csr_matrix(BoW(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_sample])

In [None]:
rfc.predict(X_sample_bow)

In [1]:
%%writefile SentimentClassifier.py
from scipy import sparse as sp_sparse
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import pandas as pd
import numpy as np
import re

def words_only(text):
    tag_regexp = re.compile("<[^>]*>")
    regex = re.compile("[A-Za-z-]+")
    text = re.sub(tag_regexp, '', text)
    text = re.sub('\s+', ' ',text)
    text = re.sub(r'\\','', text)
    text = text.lower().strip()
    try:
        return " ".join(regex.findall(text))
    except:
        return ""

def BoW(words, words_to_index, dict_size):
    """
        words: a list of words
        dict_size: size of the dictionary
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.array([0 for i in range(dict_size)])
    wti = np.array(words_to_index)
    
    for word in words:
        ind = np.where(wti == word)
        if len (ind) == 1 :
            result_vector[ind[0]] +=1
    return result_vector

class SentimentClassifier:
    def __init__(self):
        self._data = []
        self._datapath = 'https://github.com/mbburova/MDS/raw/main/sentiment.csv'
        self._model = RandomForestClassifier(n_estimators = 300, random_state=5, max_depth = 5)
        self._X_train = []
        self._X_test = []
        self._y_train = []
        self._y_test = []
        self._DICT_SIZE = 500
        self._WORDS_TO_INDEX = []
        self._accuracy = 0
    
    def fit_model(self, datapath = 'https://github.com/mbburova/MDS/raw/main/sentiment.csv') :
        self._data = pd.read_csv(datapath, index_col=0)
        self._datapath = datapath
        tag_regexp = re.compile("<[^>]*>")
        regex = re.compile("[A-Za-z-]+")
        self._data['cleaned_review'] = self._data['review'].apply(words_only)
        self._data['tokenized'] = self._data['cleaned_review'].apply(lambda x: x.split())
        self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(self._data['tokenized'],self._data['sentiment'], test_size=0.2, random_state = 5)
        
        nltk.download('stopwords')
        STOPWORDS = set(stopwords.words('english'))
        
        counter = Counter(self._X_train.sum())
        for word in list(counter):
            if word in STOPWORDS:
                del counter[word]
        
        words_counts =  counter
        self._WORDS_TO_INDEX = [word[0] for word in counter.most_common(self._DICT_SIZE)]
        
        X_train_bow = sp_sparse.vstack([sp_sparse.csr_matrix(BoW(text, self._WORDS_TO_INDEX, self._DICT_SIZE)) for text in self._X_train])
        X_test_bow = sp_sparse.vstack([sp_sparse.csr_matrix(BoW(text, self._WORDS_TO_INDEX, self._DICT_SIZE)) for text in self._X_test])
        
        self._model = RandomForestClassifier(n_estimators = 300, random_state=5, max_depth = 5)
        self._model = self._model.fit(X_train_bow, self._y_train)
        pred = self._model.predict(X_test_bow)
        self._accuracy = accuracy_score(self._y_test, pred)
        
        return self._model
    
    def predict(self, sample):
        X_sample = [words_only(sample).split()]
        #X_sample = [['i','must','say','it','s', 'perfect'],['i','feel','horrible']]
        X_sample_bow = sp_sparse.vstack([sp_sparse.csr_matrix(BoW(text, self._WORDS_TO_INDEX, self._DICT_SIZE)) for text in X_sample])
        pred = self._model.predict(X_sample_bow)
        return pred[0]

Overwriting SentimentClassifier.py


In [None]:
from SentimentClassifier import SentimentClassifier

sc = SentimentClassifier()

In [None]:
sc.fit_model()

In [None]:
sc._accuracy

In [None]:
sc._accuracy

In [None]:
sc.predict("I'm disapointed by the horrible plot")

In [None]:
sc.predict("I liked the scene with a car crash")

In [None]:
sc.predict("You'll have to have your wits about you and your brain fully switched on watching Oppenheimer as it could easily get away from a nonattentive viewer. This is intelligent filmmaking which shows it's audience great respect. It fires dialogue packed with information at a relentless pace and jumps to very different times in Oppenheimer's life continuously through it's 3 hour runtime. There are visual clues to guide the viewer through these times but again you'll have to get to grips with these quite quickly. This relentlessness helps to express the urgency with which the US attacked it's chase for the atomic bomb before Germany could do the same. An absolute career best performance from (the consistenly brilliant) Cillian Murphy anchors the film. This is a nailed on Oscar performance. In fact the whole cast are fantastic (apart maybe for the sometimes overwrought Emily Blunt performance). RDJ is also particularly brilliant in a return to proper acting after his decade or so of calling it in. The screenplay is dense and layered (I'd say it was a thick as a Bible), cinematography is quite stark and spare for the most part but imbued with rich, lucious colour in moments (especially scenes with Florence Pugh), the score is beautiful at times but mostly anxious and oppressive, adding to the relentless pacing. The 3 hour runtime flies by. All in all I found it an intense, taxing but highly rewarding watch. This is film making at it finest. A really great watch.")

In [None]:
sc.predict("A word of advice. Don't waste your money and time going to see the Barbie movie. Went with the whole family (including my 12 year old daughter and wife) and they asked to leave before it was over, it was that bad. I think it's been 30 years since I walked out of a movie theater because the movie was so, so bad. I don't know how it's been so successful. Marketing at its best (or worst). It's boring, disjointed, and sexist (to both men and women). It's an unwatchable mess, and by the last 20 minutes we cared so little about the characters, that we just left. It's horrendous. Save two hours of your life and don't bother watching.")

In [26]:
%%writefile server.py
from flask import Flask, request
import json
import pickle
import re
from SentimentClassifier import SentimentClassifier

app = Flask(__name__)

model = SentimentClassifier()
model.fit_model()

@app.route('/')
def help_message():
    message = "Use /sentiment path to classify the sentiment of the movie review.\n" 
    message += "Post json format {'review': 'text of review for classification'}.\n" 
    message += (" Current model accuracy: " + str(model._accuracy))
    return message

@app.route('/sentiment', methods=["GET", "POST"])
def sentim_classifier():
    if request.method == 'POST':
        rq = request.get_json(force=True)
        review = rq['review']
        result = rewiew
        #result = model.predict(review)
        response = {
            "result": result
        }
        return json.dumps(response)
    else:
        return "You should use only POST query"

if __name__ == '__main__':
    app.run("0.0.0.0", 8000)

Overwriting server.py


In [27]:
! launch-server.sh server.py

Success!


In [19]:
! curl http://localhost:8000/

Use /sentiment path to classify the sentiment of the movie review.


In [20]:
! curl http://localhost:8000/sentiment

You should use only POST query

In [21]:
data = {
    'review': "You'll have to have your wits about you and your brain fully switched on watching Oppenheimer as it could easily get away from a nonattentive viewer. This is intelligent filmmaking which shows it's audience great respect. It fires dialogue packed with information at a relentless pace and jumps to very different times in Oppenheimer's life continuously through it's 3 hour runtime. There are visual clues to guide the viewer through these times but again you'll have to get to grips with these quite quickly. This relentlessness helps to express the urgency with which the US attacked it's chase for the atomic bomb before Germany could do the same. An absolute career best performance from (the consistenly brilliant) Cillian Murphy anchors the film. This is a nailed on Oscar performance. In fact the whole cast are fantastic (apart maybe for the sometimes overwrought Emily Blunt performance). RDJ is also particularly brilliant in a return to proper acting after his decade or so of calling it in. The screenplay is dense and layered (I'd say it was a thick as a Bible), cinematography is quite stark and spare for the most part but imbued with rich, lucious colour in moments (especially scenes with Florence Pugh), the score is beautiful at times but mostly anxious and oppressive, adding to the relentless pacing. The 3 hour runtime flies by. All in all I found it an intense, taxing but highly rewarding watch. This is film making at it finest. A really great watch."
}

In [22]:
import requests 

In [23]:
r = requests.post("http://localhost:8000/sentiment", json=data)

In [24]:
r.status_code

200

In [25]:
r.json()

{'result': {'review': "You'll have to have your wits about you and your brain fully switched on watching Oppenheimer as it could easily get away from a nonattentive viewer. This is intelligent filmmaking which shows it's audience great respect. It fires dialogue packed with information at a relentless pace and jumps to very different times in Oppenheimer's life continuously through it's 3 hour runtime. There are visual clues to guide the viewer through these times but again you'll have to get to grips with these quite quickly. This relentlessness helps to express the urgency with which the US attacked it's chase for the atomic bomb before Germany could do the same. An absolute career best performance from (the consistenly brilliant) Cillian Murphy anchors the film. This is a nailed on Oscar performance. In fact the whole cast are fantastic (apart maybe for the sometimes overwrought Emily Blunt performance). RDJ is also particularly brilliant in a return to proper acting after his decad