### <font color='green'>Importing Weights and Vocabulary</font> 

In [None]:
# Authenticate the current Colab user (presumably required by the gsutil
# downloads in the following cells — confirm).
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
# Project id passed to the gcloud command below.
project_id = 'dataimpact-rd'
# {project_id} is the Python variable above, interpolated into the shell command.
!gcloud config set project {project_id}


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): no code visible in this notebook reads from /content/drive —
# confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the topic-model weights from the bucket to ./topics.h5 (the file name predict_reviews loads).
!gsutil cp gs://di_data_sas/Sentiment_Analysis_Related_Data/EN/weights_model_topics.h5  topics.h5

In [None]:
# Copy the SentencePiece model to ./vocab.model (the file name predict_reviews loads).
!gsutil cp gs://di_data_sas/Sentiment_Analysis_Related_Data/EN/vocab.model vocab.model

In [None]:
# Copy the score-model weights from the bucket to ./score.h5 (the file name predict_reviews loads).
!gsutil cp gs://di_data_sas/Sentiment_Analysis_Related_Data/EN/weights_model_score.h5 score.h5


In [None]:
# Install SentencePiece; it is imported as `spm` in the prediction cell.
# NOTE(review): the version is not pinned — consider pinning it for reproducible runs.
!pip install sentencepiece

### <font color='green'>Importing the model</font> 

In [None]:
#!/usr/bin/python

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

import gensim, nltk, re

from tensorflow.python.keras import regularizers
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Bidirectional, Conv1D, CuDNNLSTM, Dense, Dropout, Embedding, LSTM
from tensorflow.python.keras.layers import normalization, Input, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

def create_tokenizer(line):
    """Return a new Keras ``Tokenizer`` fitted on ``line``.

    Args:
        line: Texts handed as-is to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(line)
    return fitted

def encode_docs(tokenizer, max_length, docs):
    """Encode ``docs`` with a word-level tokenizer and post-pad the sequences.

    Args:
        tokenizer: Object exposing ``texts_to_sequences``.
        max_length: Value forwarded as ``maxlen`` to ``pad_sequences``.
        docs: Texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

def encode_docs_new_vocab(sp, max_length, docs):
    """Encode ``docs`` into id sequences with ``sp`` and post-pad them.

    Args:
        sp: Object exposing ``EncodeAsIds`` (a SentencePiece processor in this notebook).
        max_length: Value forwarded as ``maxlen`` to ``pad_sequences``.
        docs: Iterable of texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    id_sequences = []
    for text in docs:
        id_sequences.append(sp.EncodeAsIds(text))

    return pad_sequences(id_sequences, maxlen=max_length, padding='post')

def f1(y_true, y_pred):
    """F1 metric built from this file's ``precision`` and ``recall`` functions.

    Computes ``2 * (p * r) / (p + r + epsilon)`` where epsilon is the backend
    epsilon, which keeps the division defined when both terms are zero.
    """
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)

    denominator = prec + rec + K.epsilon()
    return 2 * ((prec * rec) / denominator)


def generate_data(df, mean_length, ratio, token=None, sp=None):
    """Split reviews per rating into train/validation sets and encode them.

    For each rating value 1..5, the first ``int(ratio * n)`` rows (in the
    order they appear in ``df``) go to the validation set and the remaining
    rows go to the training set. Labels are binary (1 when
    ``review_rating > 3``, else 0) and are one-hot encoded with
    ``to_categorical``.

    Args:
        df: DataFrame with ``review_rating`` and ``review_body`` columns.
        mean_length: Sequence length forwarded to the encoder (``maxlen`` of
            ``pad_sequences``).
        ratio: Fraction of each rating group assigned to validation.
        token: Word-level tokenizer, used when ``sp`` is None.
        sp: Object exposing ``EncodeAsIds``; when given it is used instead
            of ``token``.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val)``.
    """
    train_parts, val_parts = [], []
    for rating in range(1, 6):
        rated = df.loc[df['review_rating'] == rating]
        cut = int(ratio * len(rated))
        # Every rating contributes its own validation slice. The previous
        # code put the rating-3 *training* slice into the validation set.
        val_parts.append(rated[:cut])
        train_parts.append(rated[cut:])

    # pd.concat returns new frames, so adding a column below does not touch df.
    train_x = pd.concat(train_parts)
    val_x = pd.concat(val_parts)

    # Positive class (1) for ratings above 3, negative class (0) otherwise.
    train_x['score'] = train_x['review_rating'].gt(3).astype(int)
    val_x['score'] = val_x['review_rating'].gt(3).astype(int)

    y_train = to_categorical(train_x['score'].values)
    y_val = to_categorical(val_x['score'].values)

    # Word-level tokenization when no SentencePiece-style encoder is supplied.
    if sp is None:
        X_train = encode_docs(token, mean_length, train_x['review_body'])
        X_val = encode_docs(token, mean_length, val_x['review_body'])
    else:
        X_train = encode_docs_new_vocab(sp, mean_length, train_x['review_body'])
        X_val = encode_docs_new_vocab(sp, mean_length, val_x['review_body'])

    return X_train, y_train, X_val, y_val

def ml_model_score(vocab_size, input_length, dimension):
    """Build the (uncompiled) two-class sentiment model.

    Architecture: Embedding -> Bidirectional LSTM(64) -> Dropout(0.4)
    -> Dense(64, relu) -> Dropout(0.3) -> Dense(2, softmax).

    Args:
        vocab_size: Size of the embedding vocabulary.
        input_length: Length of each int32 input sequence.
        dimension: Embedding dimension.

    Returns:
        A Keras ``Model`` mapping the sequence input to the softmax output.
    """
    # Layers are instantiated in the same order as they are applied.
    embedder = Embedding(vocab_size, dimension, input_length=input_length)
    tokens_in = Input(shape=(input_length,), dtype='int32')
    hidden = embedder(tokens_in)

    stack = (
        Bidirectional(LSTM(64, return_sequences=False)),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(2, activation='softmax'),
    )
    for layer in stack:
        hidden = layer(hidden)

    return Model(tokens_in, hidden)

def ml_model_topics(vocab_size, input_length, dimension):
    """Build the (uncompiled) six-output topic model.

    Architecture: Embedding -> Bidirectional LSTM(64) -> Dropout(0.4)
    -> Dense(64, relu) -> Dense(6, sigmoid).

    Args:
        vocab_size: Size of the embedding vocabulary.
        input_length: Length of each int32 input sequence.
        dimension: Embedding dimension.

    Returns:
        A Keras ``Model`` mapping the sequence input to six sigmoid outputs.
    """
    # Layers are instantiated in the same order as they are applied.
    embedder = Embedding(vocab_size, dimension, input_length=input_length)
    tokens_in = Input(shape=(input_length,), dtype='int32')
    hidden = embedder(tokens_in)

    stack = (
        Bidirectional(LSTM(64, return_sequences=False)),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dense(6, activation='sigmoid'),
    )
    for layer in stack:
        hidden = layer(hidden)

    return Model(tokens_in, hidden)


def precision(y_true, y_pred):
    """Precision metric: rounded true positives over rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before summing; the backend
    epsilon is added to the denominator to avoid dividing by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def recall(y_true, y_pred):
    """Recall metric: rounded true positives over rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before summing; the backend
    epsilon is added to the denominator to avoid dividing by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())





### <font color='green'>Single prediction function</font> 

In [None]:

import importlib
import pickle, os
import numpy as np
import pandas as pd 
from tqdm import tqdm
import tensorflow as tf
from keras import callbacks
import sentencepiece as spm
from keras import backend as K
from keras.optimizers import Adam
# from keras.backend.tensorflow_backend import set_session
# MACHINE_LEARNING = importlib.import_module('src.3_Prediction.ML.machine_learning')

# Module-level placeholders, all initialised to None.
# NOTE(review): none of these is referenced by code visible in this notebook — confirm they are still needed.
results = PRED_Y = TRUE_y = None

def predict_reviews(review):
    """Predict the sentiment score and the topics of a single review text.

    Reads ``vocab.model``, ``score.h5``, ``topics.h5`` and ``targets.p`` from
    the working directory, runs the score model and the topic model on
    ``review``, pickles the score predictions to ``pred_all.p`` and writes the
    resulting one-row frame to ``result_ml.csv``.

    Args:
        review: Raw review text.

    Returns:
        Tuple ``(row, review)``: ``row`` is the first result row as a dict
        (it includes ``ml_score``, 1 or -1, and ``ml_topic``, a list of
        labels taken from ``targets.p``); ``review`` is the input unchanged.
    """
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    # NOTE(review): the session is created but neither installed as the Keras
    # session nor closed; kept as-is because creating it may be what applies
    # the GPU memory cap — confirm.
    tf.compat.v1.Session(config=config)

    K.set_epsilon(1e-5)

    data = {'review_body': review, 'text_clean': review, 'title_clean': ''}
    # pandas is imported as `pd`; the previous `pnd.DataFrame` raised NameError.
    df = pd.DataFrame([data])
    df = df.dropna(subset=['review_body'])

    sp = spm.SentencePieceProcessor()
    sp.Load('vocab.model')

    input_length, vocab_size = 256, 7500

    # ---- sentiment score ----
    # NOTE(review): newer Keras releases name the `lr` argument
    # `learning_rate` — confirm against the installed version.
    model = ml_model_score(vocab_size, input_length, 100)
    model.compile(optimizer=Adam(lr=1e-3), loss='categorical_crossentropy', metrics=['accuracy', f1])
    model.load_weights('score.h5')

    X = encode_docs_new_vocab(sp, input_length, df['review_body'])
    Y = model.predict(X, batch_size=5000)

    # Column 0 below 0.5 is reported as positive (1), otherwise negative (-1).
    # Presumably column 0 is the "rating <= 3" class, as in generate_data — confirm.
    pred_all = [1 if i[0] < 0.5 else -1 for i in Y]

    with open('pred_all.p', 'wb') as handle:
        pickle.dump(pred_all, handle)

    # Positional assignment: one prediction per remaining row of df.
    df['ml_score'] = pred_all

    # ---- topics ----
    df['text_clean'] = df['text_clean'].replace(np.nan, '', regex=True)
    df['title_clean'] = df['title_clean'].replace(np.nan, '', regex=True)

    df['text'] = df['text_clean'] + ' ' + df['title_clean']
    # Assign the result instead of a chained in-place fillna on the column.
    df['text'] = df['text'].fillna('')

    model = ml_model_topics(vocab_size, input_length, 100)
    model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy', f1])
    model.load_weights('topics.h5')

    X = encode_docs_new_vocab(sp, input_length, df['text'])
    Y = model.predict(X, batch_size=5000)

    # NOTE(review): no cell visible in this notebook fetches 'targets.p' — it
    # must already exist in the working directory. pickle.load can execute
    # arbitrary code, so only load this file from a trusted source.
    with open('targets.p', 'rb') as handle:
        targets = pickle.load(handle)

    # Keep the label of every output above 0.95; 0 marks "not selected".
    new_y = [[targets[index] if element > 0.95 else 0 for index, element in enumerate(elements)] for elements in Y]

    _temp = pd.DataFrame(new_y)
    df['ml_topic'] = list(_temp[[0, 1, 2, 3, 4, 5]].values)

    # Drop the 0 placeholders so only the selected labels remain.
    df['ml_topic'] = df['ml_topic'].apply(lambda x: [i for i in x if i != 0])

    df.to_csv('result_ml.csv', index=False)
    return df.iloc[0].to_dict(), review

In [None]:
# Smoke test: run both models on one example review and show the returned (row, review) tuple.
predict_reviews('fast shipped i love it')

### <font color='green'>Multiple tests</font> 

In [None]:
from termcolor import colored
import logging
# Only let TensorFlow log messages at ERROR level or above.
tf.get_logger().setLevel(logging.ERROR)

# Example reviews to run through predict_reviews.
reviews = [
  'absolutely love the taste amazing',
  "got shipped in 2 days, nice",
  'fake coffe i would give it a 0 star',
  "too expensive but still good taste",
]

for review in reviews:
  # result is (row dict, original review text).
  result = predict_reviews(review)

  if result[0]['ml_score'] == 1:
    score = 'Avis positif'
  else:
    score = "Avis Négatif"

  # Green for a positive opinion, red otherwise.
  score_color = "red"
  if 'positif' in score:
    score_color = "green"

  print(
    '\nthe topic of the review: #',
    result[1],
    '\n\n  is:  ',
    colored(result[0]["ml_topic"], "red"),
    'and the predicted opinion is: ',
    colored(score, score_color),
    '\n',
  )