### Imports

In [92]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import numpy as np
import pandas as pd
import os
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import string
import re
import joblib


# SpaCy Imports for text preprocessing
# import spacy
# from spacy.tokenizer import Tokenizer
# from spacy.lang.en import English, STOP_WORDS, TOKENIZER_EXCEPTIONS, BASE_EXCEPTIONS
# from util import update_exc



### Constants

In [101]:
DATASET_PATH = "ReviewsDataset.csv"
SAVED_MODEL_FILE = "SavedModel.sav"

### Reading the data and displaying it

In [102]:
curr_dir = os.getcwd()
complete_data = pd.read_csv(os.path.join(curr_dir, DATASET_PATH), header=0)
input_reviews = complete_data.iloc[0:, 0]
labels_senti = complete_data.iloc[0:, 1]
print(complete_data.shape)
print(input_reviews.shape)
print(labels_senti.shape)
#print(input_reviews)
#print(labels_senti)

(2999, 2)
(2999,)
(2999,)


In [106]:
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_reviews, labels_senti, test_size=0.25, random_state=0)
# print(train_inputs)
# print(train_labels)

def text_preprocessor(reviews):
    stop_words = set(stopwords.words('english'))
    index = 0
    for review_text in reviews.iloc[:0].to_numpy():
        # print(review_text.dtype)
        words_filtered = []
        # print(review_text.dtype)
        review_text_list = review_text.split()
        for w in review_text_list:
            if w not in stop_words:
                re.sub(r"[^\w\d'\s,.]",'',w)
                re.sub(r'[^\w\d\s]',' ',w)
                words_filtered.append(w.lower())
        # words_filtered = pd.Series(words_filtered)
        reviews.iloc[index] = words_filtered
        index = index + 1
    print("reviews processed")
    print(reviews.dtype)
    print(reviews.shape)
    #ret_reviews = reviews.reshape(1, reviews.shape[0])
    #print(ret_reviews.shape)
    return reviews


In [107]:
# train_inputs_obj = list()
# test_inputs_obj = list()
#print(train_inputs_obj.shape)
#print(test_inputs_obj.shape)

train_inputs_obj = text_preprocessor(train_inputs)
test_inputs_obj = text_preprocessor(test_inputs)

#train_inputs_obj = text_preprocessor(train_inputs).to_numpy(copy=True)
#test_inputs_obj = text_preprocessor(test_inputs).to_numpy(copy=True)

print(train_labels.dtype)
print(test_labels.dtype)
print(train_labels.shape)
print(test_labels.shape)
# print(train_inputs_obj[0])
print(train_inputs_obj.dtype)
print(test_inputs_obj.dtype)
print(train_inputs_obj.shape)
print(test_inputs_obj.shape)


reviews processed
object
(2249,)
reviews processed
object
(750,)
object
object
(2249,)
(750,)
object
object
(2249,)
(750,)


In [108]:
sentiment_analysis = Pipeline([('tfidf', TfidfVectorizer()), ('model', MultinomialNB())])
sentiment_analysis.fit(train_inputs_obj, train_labels)
joblib.dump(sentiment_analysis, SAVED_MODEL_FILE)

['SavedModel.sav']

### Testing Module

In [112]:
loaded_model = joblib.load(SAVED_MODEL_FILE)
y_pred = loaded_model.predict(test_inputs)
print(accuracy_score(y_pred, test_labels))
print(confusion_matrix(y_pred, test_labels))

0.8266666666666667
[[313  67]
 [ 63 307]]


In [117]:
str_pred = "This is one of the funniest movies ever made. And for those of you who don't get it, it's supposed to be funny. So often comedies try to be so intentionally funny that it misses, but here is finally a movie that succeeds in being hilarious in the most subtle of ways. Even ""spoofs"" lack the originality and natural feel of this film. It is a comedic classic that will surely be appreciated in another time when studios are fdoing this sort of thing regularly. kudos to the makers, and to a hilariously subtle cast of actors, including Isaac Wade, whose performances is top-notch. Truly, a real break-out star performance by an true underrated stage actor. It'll be great to see this guy get his due."
print(loaded_model.predict([str_pred]))

str_pred2 = "The Little Mermaid is one of my absolute favorite Disney movies. I'm sorry to say, however, that Disney completely messed up when they made this sequel. I'll admit it has some good points to it. The songs aren't bad, and the animation is clean and clear. There is some humor, I'm sure--I don't remember, because after watching it I immediately banned it from appearing before my eyes again. The worst point of this movie is the plot. In this movie, Ariel becomes her father. She forbids her daughter to go near the sea (yes, out of fear), just as she was forbidden to go near the land. I personally think that, given her past, Ariel would maintain some of her headstrong ways and not treat her daughter like she herself was treated.<br /><br />Besides this fact, Ursula was replaced by a non-scary, pathetic sort of sea witch (the underfed, forgotten sister) who is more comical than scary. She, too, has some little underling to do her bidding--but she's not scarier or worse than Ursula. Ursula spoiled us with her believability for badness. This sea witch is a joke.<br /><br />To make matters worse, Flounder is a fat, deep-voiced father (no longer the cute guppy we all know and love) and Eric's voice is not even done by the same actor (something that always annoys me in a remake/sequel). (His voice difference was very obvious to me, by the way!) I felt that the only reason this movie was made was so that Disney could catch a few fast dollars, something I hate to think about a corporation I actually really do enjoy. I felt that this plot lacked imagination. I know that this act (child following in the footsteps of a parent) happens, but Ariel was different. That was what we loved so much about her. She had a dream, she fell in love, and she made that dream come true. Until she appeared in this movie, that is. Then she became just like the other adults. This isn't the Ariel I know. And I don't like her.<br /><br />I know of some children who have enjoyed this film, and I know some adults who didn't mind it, either. But for me, and for all of you out there who have the utmost love for Ariel, please don't see this movie. The Ariel we know dies within, resurrected only for a song or two and one final scene that actually isn't bad (where she accepts the water back again)--although she takes very little part in the ending, regardless."
print(loaded_model.predict([str_pred2]))


['positive']
['negative']


In [None]:
%matplotlib notebook
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt

text = "hello hello hello hello hi hi aa done what hello please hi there"
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
nlp = English()
special_cases = BASE_EXCEPTIONS
special_cases = update_exc(special_cases, TOKENIZER_EXCEPTIONS)
special_cases
#Tokenizer(nlp.vocab, rules=special_cases, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, url_match=simple_url_re.match)

In [None]:
pip install regex

In [None]:
pip install spacy
pip install spacy-lookups-data
python -m spacy download en_core_web_sm

In [None]:
pip install --upgrade gensim --user

In [None]:
pip install --upgrade language-check --user

In [None]:
pip install pycontractions

In [None]:
import string
s = '.efqf,ewfqwe\'qewfqewfqef'
s.translate({ord(c): None for c in string.punctuation})

In [None]:
from pycontractions import Contractions
cont = Contractions(api_key="glove-twitter-25")
list(cont.expand_texts(train_inputs.values))

In [None]:
import nltk
nltk.download('stopwords')

In [119]:
pip install streamlit --user

Collecting streamlit
  Using cached https://files.pythonhosted.org/packages/5a/13/c738cf11d526ede46a326f14ede28141ce6a4c2e22cf69842d80fa6cd2a5/streamlit-0.68.0-py2.py3-none-any.whl
Collecting validators (from streamlit)
  Using cached https://files.pythonhosted.org/packages/41/4a/3360ff3cf2b4a1b9721ac1fbff5f84663f41047d9874b3aa1ac82e862c44/validators-0.18.1-py3-none-any.whl
Collecting pydeck>=0.1.dev5 (from streamlit)
  Using cached https://files.pythonhosted.org/packages/51/1e/296f4108bf357e684617a776ecaf06ee93b43e30c35996dfac1aa985aa6c/pydeck-0.5.0b1-py2.py3-none-any.whl
Collecting toml (from streamlit)
  Using cached https://files.pythonhosted.org/packages/9f/e1/1b40b80f2e1663a6b9f497123c11d7d988c0919abbf3c3f2688e448c5363/toml-0.10.1-py2.py3-none-any.whl
Collecting astor (from streamlit)
  Using cached https://files.pythonhosted.org/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl
Collecting pillow>=6.2.0 (from streamlit)
 



In [120]:
streamlit run app1.py

SyntaxError: invalid syntax (<ipython-input-120-dc7d25099f32>, line 1)