In [1]:
import pickle
import re
from pprint import pprint

import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import spacy
from flask import Blueprint,jsonify,request
from gensim.models import Word2Vec, Phrases
from gensim.utils import simple_preprocess
from keras.models import model_from_json
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score

Using TensorFlow backend.


In [2]:
def sent_to_words(sentences):
    """Lazily tokenise each sentence with gensim's ``simple_preprocess``.

    Every item is coerced with ``str()`` first and tokenised with
    ``deacc=True``. Yields one token list per input sentence, in order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists; each list is joined with single
            spaces and run through the spaCy pipeline.
        allowed_postags: coarse POS tags (``token.pos_``) to keep.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    # The spaCy model is loaded on every call, with the parser and NER disabled.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    def kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [kept_lemmas(sent) for sent in texts]
def vectorize_data(data, vocab: dict) -> list:
    """Encode each review as the positions of its words in ``vocab``'s key order.

    Args:
        data: iterable of reviews, each an iterable of word tokens.
        vocab: mapping whose key order defines the integer code of each word.

    Returns:
        A list with one list of integer positions per review. Words that are
        missing from ``vocab`` (or mapped to ``None``) are dropped.
    """
    # Build the word -> position table once so each lookup is O(1); the
    # previous list.index() call rescanned the whole vocabulary per token.
    position = {word: idx for idx, word in enumerate(vocab.keys())}
    return [
        [position[word] for word in review if vocab.get(word, None) is not None]
        for review in data
    ]

def preprocessing(input_review, w2v_model_path="Bigram_Word2VecModel2.model", input_length=300):
    """Convert raw review texts into padded sequences of vocabulary indices.

    Pipeline: collapse whitespace and drop apostrophes, tokenise, remove
    English stop words, merge bigrams learned from this batch, lemmatise,
    map words to positions in the Word2Vec vocabulary, then post-pad.

    Args:
        input_review: iterable of review strings.
        w2v_model_path: path of the saved gensim Word2Vec model whose
            vocabulary defines the word codes.
        input_length: ``maxlen`` passed to ``pad_sequences``.

    Returns:
        The array produced by ``pad_sequences`` (``padding='post'``).
    """
    # A set gives O(1) membership tests in the stop-word filter below.
    stop_words = set(stopwords.words('english'))

    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    data = [re.sub(r'\s+', ' ', sent) for sent in input_review]
    data = [sent.replace("'", "") for sent in data]
    data_words = list(sent_to_words(data))

    # The phrase model is fitted on the batch being scored, not on training data.
    bigram = gensim.models.Phrases(data_words, min_count=5)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # NOTE(review): simple_preprocess is applied to the str() of each token
    # list, i.e. the tokens are re-tokenised; presumably a no-op - confirm.
    data_words_nostops = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_words]
    data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # NOTE(review): gensim's load is pickle-based as far as I know; only load
    # model files from a trusted source.
    bigrams_model = Word2Vec.load(w2v_model_path)
    # The bigram transform is applied a second time, to the lemmatised corpus.
    X_data = bigram_mod[data_lemmatized]
    X_pad = pad_sequences(sequences=vectorize_data(X_data, vocab=bigrams_model.wv.vocab),maxlen=input_length,padding='post')
    return X_pad


In [5]:
# Load the validation set; the "reviews.text" column holds the review texts
# that are preprocessed and scored by the model further down.
vald = pd.read_csv('validation.csv')
datas = vald["reviews.text"]

In [6]:
# Rebuild the trained model: architecture from the JSON file, weights from
# Consumermodel.h5. The context manager closes the file even if read() fails.
with open('Consumermodel.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
model.load_weights("Consumermodel.h5")
print("Loaded model from disk")

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print(model)

# Score every validation review; `pred` is thresholded in the cells below.
pred = model.predict(preprocessing(datas))


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Loaded model from disk
<keras.engine.sequential.Sequential object at 0x0000014DB086F470>


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [36]:
# Binarise the model scores: anything below the threshold is class 0, the
# rest class 1. (A stray, body-less `for z in range(0,1,0.01)` header used to
# sit here; it was a syntax error and range() does not accept a float step.)
threshold = 0.1
Y_pred = [0 if x < threshold else 1 for x in pred]


In [37]:
# Ground-truth labels: ratings below 5 map to class 0, everything else to 1.
Y_true = [0 if rating < 5 else 1 for rating in vald["reviews.rating"]]

In [38]:
# Fraction of validation reviews whose thresholded prediction matches the
# rating-derived label (accuracy_score is imported in the top import cell).
accuracy_score(Y_true, Y_pred, normalize=True)

0.72305

In [51]:
# Sweep the decision threshold over np.arange(0, 1, 0.001) and record the
# accuracy obtained at each step, in threshold order.
accuracy = []
for z in np.arange(0,1,0.001):
    Y_pred = [0 if x < z else 1 for x in pred]
    accuracy.append(accuracy_score(Y_true, Y_pred, normalize=True))

In [52]:
# Show the accuracy recorded for each threshold of the sweep, in sweep order.
accuracy

[0.71325,
 0.7131,
 0.713,
 0.71285,
 0.71245,
 0.712,
 0.712,
 0.71205,
 0.71185,
 0.71175,
 0.7117,
 0.71165,
 0.71185,
 0.71185,
 0.7117,
 0.71185,
 0.7118,
 0.7115,
 0.7114,
 0.71135,
 0.71135,
 0.7114,
 0.71135,
 0.71125,
 0.7111,
 0.71105,
 0.71115,
 0.7111,
 0.71085,
 0.71075,
 0.71055,
 0.71025,
 0.71025,
 0.71005,
 0.71,
 0.7099,
 0.70985,
 0.7099,
 0.70985,
 0.7098,
 0.7097,
 0.70965,
 0.70975,
 0.7099,
 0.70985,
 0.70955,
 0.7096,
 0.7095,
 0.70955,
 0.70975,
 0.70975,
 0.70965,
 0.7097,
 0.70955,
 0.7095,
 0.7093,
 0.70915,
 0.70905,
 0.709,
 0.70895,
 0.709,
 0.70875,
 0.70875,
 0.70865,
 0.7085,
 0.70835,
 0.70805,
 0.70785,
 0.7078,
 0.7076,
 0.70755,
 0.70745,
 0.7075,
 0.7074,
 0.70735,
 0.70735,
 0.70715,
 0.70695,
 0.70695,
 0.7069,
 0.70665,
 0.7066,
 0.70655,
 0.70635,
 0.7063,
 0.7063,
 0.70605,
 0.7061,
 0.70615,
 0.706,
 0.706,
 0.70605,
 0.70615,
 0.70625,
 0.7061,
 0.7061,
 0.70605,
 0.70605,
 0.706,
 0.7059,
 0.70585,
 0.70565,
 0.70565,
 0.70585,
 0.70575,
 