### <font color='green'>Verify if the GPU device is available </font> 

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
# GPU device name as reported by TensorFlow; it is compared with '/device:GPU:0' below.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  # Stop the notebook early when the expected GPU device is not reported.
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

### <font color='green'>Inspect the GPU with nvidia-smi</font> 

In [None]:
!nvidia-smi

### <font color='green'>Installing required libraries </font> 

In [None]:
!pip install sentencepiece

In [None]:
!pip install --upgrade pip setuptools wheel
!pip install -I keras

In [None]:
!pip install -I tensorflow

In [None]:
#!pip install keras==2.2.4
#!pip install tensorflow-gpu==1.13.1
!pip uninstall -y tensorflow==2.2.0

### <font color='green'>Importing data from GCP</font> 

In [None]:
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'dataimpact-rd'
!gcloud config set project {project_id}


In [None]:
# Download the file from a given Google Cloud Storage bucket.
!gsutil cp gs://di_data_sas/EN/US/Amazon/Data/periode_11/amazon_ml_opinions_topics.csv /tmp/amazon_ml_opinions_topics.csv
  
# Print the result to make sure the transfer worked.
!head -n 5 /tmp/amazon_ml_opinions_topics.csv

In [None]:
!gsutil cp gs://di_data_sas/EN/US/Walmart/Data/2020_periode_1/walmart_ml_opinions.csv /tmp/walmart_ml_opinions_topics.csv

In [None]:
!gsutil cp  gs://di_data_sas/EN/US/Target/Data/2020_periode_1/target_ml_opinions.csv /tmp/target_ml_opinions_topics.csv

In [None]:
!gsutil cp gs://di_data_sas/EN/UK/Asda/Data/2020_periode_1/asda_ml_opinions.csv /tmp/asda_ml_opinions_topics.csv

In [None]:
!gsutil cp gs://di_data_sas/EN/UK/Morrisons/Data/2020_periode_1/morrisons_ml_opinions.csv /tmp/morrisons_ml_opinions_topics.csv

In [None]:
!gsutil cp gs://di_data_sas/EN/UK/Ocado/Data/2020_periode_1/ocado_ml_opinions.csv  /tmp/ocado_ml_opinions_topics.csv

In [None]:
# quick look at the data
# NOTE(review): in the visible notebook `pnd` (pandas) is only imported in a later cell,
# so this cell fails on a fresh kernel unless that import cell is run first.
gl = pnd.read_csv('/tmp/asda_ml_opinions_topics.csv', nrows=2)
print(gl.text.iloc[:1])
print(gl.text_clean.iloc[:1])
# NOTE(review): `a` is not read again anywhere in the visible notebook.
a = gl.review.iloc[:1]
gl.head()

In [None]:
# function to check memory usage in megabytes
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as a megabyte string.

    A DataFrame is summed over all of its columns; anything else is treated
    as a Series, whose ``memory_usage`` is already a single number.
    """
    footprint = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pnd.DataFrame):
        # per-column byte counts -> one total
        footprint = footprint.sum()
    megabytes = footprint / 1024 ** 2  # bytes -> megabytes
    return f"{megabytes:03.2f} MB"
# NOTE(review): `dow` and `dow_cat` are not defined anywhere in the visible notebook;
# these calls raise NameError unless the names come from elsewhere — confirm or remove.
print(mem_usage(dow))
print(mem_usage(dow_cat))

In [None]:
# importing data from google drive 
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# checking the data usage of the sample data
gl_int = gl.select_dtypes(include=['int'])
converted_int = gl_int.apply(pnd.to_numeric,downcast='unsigned')
print(mem_usage(gl_int))
print(mem_usage(converted_int))


### <font color='green'>Setting the data types to optimize memory usage</font> 

In [None]:
edited_types = {
'asin':         'object',
'average'  :        'float16',
'review_body'     :  'object',
'review_date'      : 'object',
'review_likes'     : 'object',
'review_rating'    :'float16',
'review_title'     : 'object',
'five_star'        :'float16',
'four_star'        :'float16',
'one_star'        :'float16',
'pp_date'          : 'object',
'three_star'       :'float16',
'two_star'         :'float16',
'refpe'            : 'object',
'text_clean'        :'object',
'title_clean'       :'object',
'ml_score'         :'float16',
'text'             : 'object',
'ml_topic'         : 'category',
'opinion'          : 'object',
}


### <font color='green'>Function with garbage collector to free memory after deleting elements</font> 

In [None]:
import gc


# df1 is not needed anymore
def free_memo(element, namespace=None):
  """Release an object and run the garbage collector.

  Called as ``free_memo(obj)`` this only drops the function-local reference:
  the caller's variable keeps the object alive, so nothing is actually freed
  until the caller deletes its own name as well.

  To really remove a notebook variable, pass its *name* together with the
  namespace that holds it, e.g. ``free_memo('res', globals())``.

  Args:
    element: the object to release, or (when ``namespace`` is given) the
      name of the variable to delete, as a string.
    namespace: optional mutable mapping (typically ``globals()``) from which
      the name ``element`` is removed.

  Returns:
    None. A status message is printed.
  """
  if namespace is not None and isinstance(element, str):
    if element not in namespace:
      print(f'element not in memory')
      return
    # remove the binding itself so the object becomes collectable
    del namespace[element]
  else:
    # legacy call style: only the local reference goes away
    del element
  gc.collect()
  print(f'element:  deleted')

In [None]:
gc.get_objects()

In [None]:
  try:
    del(y_train)
    gc.collect()
    print(f'element:  deleted')
  except NameError:
    print(f'element not in memory')

In [None]:
free_memo(res)

In [None]:
%whos DataFrame

In [None]:
!free -h

### <font color='green'>loading data into pandas dataframe</font> 

In [None]:
import pandas as pnd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
list_retailers = ['Amazon' , 'Asda', 'Morrisons', 'Ocado', 'Target', 'Walmart']
# retailer name -> DataFrame read from that retailer's CSV (at most 20000 rows each)
retailers = {}
for retailer in list_retailers:

    # file name is the lower-cased retailer name plus a fixed suffix, under /tmp
    retailers[retailer] = pnd.read_csv('/tmp/' + retailer.lower() + 
                                      '_ml_opinions_topics.csv', dtype=edited_types, nrows=20000)
    #retailers[retailer] = retailers[retailer].dropna(subset=['review_body'])
    
#to_concat = [retailers[retailer][['text_clean', 'title_clean']] for retailer in list_retailers]
# Only the raw review text is kept here; all retailers are stacked into one Series.
# NOTE(review): with the dropna above commented out, `data` may contain missing values — verify.
to_concat = [retailers[retailer]['review_body'] for retailer in list_retailers]
data = pnd.concat(to_concat, ignore_index = True)

### <font color='green'>Generating the vocabulary using SentencePiece</font> 

In [None]:
import sentencepiece as spm

from tqdm import tqdm
# Dump the corpus to a plain-text file, one review per line, for SentencePiece to read.
# Missing reviews (NaN) are dropped and every value is cast to str first:
# `x + '\n'` raises TypeError when x is a float NaN.
with open('EN_all_text.txt', 'w', encoding='utf-8') as f:
    for x in tqdm(data.dropna().astype(str).values):
        f.write(x + '\n')
#spm.SentencePieceTrainer.Train(' --input_sentence_size=10000 --input=EN_all_text.txt --model_prefix=EN_vocab --vocab_size=7500  --split_by_whitespace=false')
# Train the sub-word model; outputs are written with the prefix EN_vocab.
spm.SentencePieceTrainer.Train(' --input_sentence_size=20000 --input=EN_all_text.txt --model_prefix=EN_vocab --vocab_size=2500')

### <font color='green'> initiating libraries and env for our model</font> 

In [None]:
import os
# Select the Keras backend through the environment.
os.environ['KERAS_BACKEND'] = 'tensorflow'
# NOTE(review): this exposes only the CUDA device with index 1. TensorFlow is already imported
# in the first code cell of the visible notebook, so this may be set too late to take effect,
# and on a single-GPU machine index 1 presumably hides the only GPU — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import sys
sys.path.insert(0,'../../../Git/phylogenetics/Sentiment_Analysis/Tools/')

import os
import pandas as pd 
import sentencepiece as spm
import tensorflow as tf


import pickle, multiprocessing, re

from keras import backend as K
from keras import callbacks
#from tf.keras.backend.tensorflow_backend import set_session
from keras.optimizers import Adam
#from machine_learning import *
from multiprocessing import cpu_count, Pool
from tqdm import tqdm

# TF1-style session configuration with the per-process GPU memory fraction set to 0.8.
config =  tf.compat.v1.ConfigProto() 
config.gpu_options.per_process_gpu_memory_fraction = 0.8
#set_session( tf.compat.v1.Session(config=config))
# NOTE(review): the session is created but neither stored nor registered (the set_session call
# above is commented out), so it is unclear whether this configuration is ever applied — confirm.
tf.compat.v1.Session(config=config)

### <font color='green'> Compute topic membership for each review</font> 

In [None]:
free_memo(res)

In [None]:
# Same loading loop as the vocabulary cell earlier in the notebook, repeated here so that
# the cleaned text and title columns (rather than the raw review body) are kept.
list_retailers = ['Amazon' , 'Asda', 'Morrisons', 'Ocado', 'Target', 'Walmart']
# retailer name -> DataFrame read from that retailer's CSV (at most 20000 rows each)
retailers = {}
for retailer in list_retailers:

    retailers[retailer] = pnd.read_csv('/tmp/' + retailer.lower() + 
                                      '_ml_opinions_topics.csv', dtype=edited_types, nrows=20000)
    #retailers[retailer] = retailers[retailer].dropna(subset=['review_body'])
    
# Rebinds `data`: now a two-column DataFrame stacked over all retailers, in list order.
to_concat = [retailers[retailer][['text_clean', 'title_clean']] for retailer in list_retailers]
data = pnd.concat(to_concat, ignore_index = True)

In [None]:
data.head()

In [None]:
data['text'] = data['text_clean'] + ' ' + data['title_clean']

In [None]:
data['text'].fillna('', inplace=True)

In [None]:
import pickle
# Load the topic -> keyword mapping and the target names; the `with` blocks make sure
# both file handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('topics.p', 'rb') as topics_file:
    topics = pickle.load(topics_file, encoding='latin1')
with open('targets.p', 'rb') as targets_file:
    targets = pickle.load(targets_file, encoding='latin1')

In [None]:
all_reviews = data.text.unique()

In [None]:
all_reviews[:3]

In [None]:
K.set_epsilon(1e-5)

def transform_value(value):
    """Binarize a score: 1 when ``value`` is at least 0.02, otherwise 0."""
    return 1 if value >= 0.02 else 0
    
    
def get_topics(commentBody):
    """Flag which topics a review mentions.

    For every topic in the module-level ``topics`` mapping, the keyword
    patterns are counted in ``commentBody`` with ``re.findall``, the total is
    divided by the number of whitespace-separated words, and the ratio is
    binarized with ``transform_value``.

    An empty or whitespace-only review has no words, so the ratio is
    undefined; every topic is flagged 0 instead of dividing by zero.

    Args:
        commentBody: review text.

    Returns:
        Tuple ``(commentBody, {topic: 0 or 1})``.
    """
    n_words = len(commentBody.split())
    result = {}
    for topic, words in topics.items():
        if n_words == 0:
            # nothing to count against: avoid the 0/0 division
            result[topic] = 0
            continue
        total = sum(len(re.findall(word, commentBody)) for word in words)
        result[topic] = transform_value(total / n_words)
    return (commentBody, result)

In [None]:
sp = spm.SentencePieceProcessor()
#sp.Load('/content/drive/MyDrive/EN_vocab.model')
sp.Load('EN_vocab.model')
input_length, vocab_size  = 256, 2500

In [None]:
# One worker per CPU core; the context manager shuts the pool down even when a task raises.
# All results are materialized inside the `with` block, before the pool is torn down.
cores = cpu_count()
with Pool(cores) as p:
    res = list(tqdm(p.imap(get_topics, all_reviews), total=len(all_reviews)))

In [None]:
res[:10]

In [None]:
review_to_topics = {k:v for k,v in res}

In [None]:
i = 0
for k, v in review_to_topics.items():
  print(k,v)
  i += 1
  if i >3:
    break

In [None]:
"""review_to_topic to df"""
# The dict of dicts becomes a frame with one column per review; transposing gives one row
# per review, and reset_index moves the review text into a column.
new_df = pd.DataFrame(review_to_topics).T.reset_index()
new_df = new_df.rename(index=str, columns=({'index': 'review'}))
# Pack the six topic flags into a single array per review, in this fixed column order.
new_df['rep'] = list(new_df[['allergens','competition','delivery','packaging','price','taste']].values)
# Rebinds review_to_topics from {review: {topic: flag}} to {review: array of flags}.
# NOTE(review): because of this rebinding the cell is presumably not safe to run twice — confirm.
review_to_topics = dict(new_df[['review', 'rep']].values)

In [None]:
data['topic'] = data['text'].apply(lambda x: review_to_topics[x])

In [None]:
data.head()

### <font color='green'> Import machine learning module</font> 

In [None]:
#!/usr/bin/python

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

import gensim, nltk, re

from tensorflow.python.keras import regularizers
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Bidirectional, Conv1D, CuDNNLSTM, Dense, Dropout, Embedding
from tensorflow.python.keras.layers import normalization, Input, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

def create_tokenizer(line):
    """Create a Keras ``Tokenizer`` and fit it on the texts in ``line``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(line)
    return fitted

def encode_docs(tokenizer, max_length, docs):
    """Encode ``docs`` as integer sequences with ``tokenizer`` and pad them.

    Padding uses ``padding='post'`` and ``maxlen=max_length``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

def encode_docs_new_vocab(sp, max_length, docs):
    """Encode ``docs`` as id sequences with ``sp.EncodeAsIds`` and pad them.

    Padding uses ``padding='post'`` and ``maxlen=max_length``.
    """
    id_sequences = list(map(sp.EncodeAsIds, docs))
    return pad_sequences(id_sequences, maxlen=max_length, padding='post')

def f1(y_true, y_pred):
    """F1 metric built from the ``precision`` and ``recall`` metrics of this module.

    ``K.epsilon()`` is added to the denominator so the ratio stays finite
    when both precision and recall are zero.
    """
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    ratio = (prec * rec) / (prec + rec + K.epsilon())
    return 2 * ratio


def generate_data(df, mean_length, ratio, token=None, sp=None):
    """Build train/validation arrays for binary sentiment, split per rating.

    The frame is split rating by rating (1..5): the first ``ratio`` share of
    each rating's rows goes to validation, the remainder to training. Every
    rating therefore contributes to both sets, and no row appears in both.

    Args:
        df: DataFrame with ``review_rating`` and ``review_body`` columns.
        mean_length: length the encoded sequences are padded to.
        ratio: share of each rating's rows used for validation.
        token: fitted word tokenizer, used when ``sp`` is None.
        sp: SentencePiece processor; when given it is used instead of ``token``.

    Returns:
        ``(X_train, y_train, X_val, y_val)``. The labels are one-hot encoded
        with two classes: 1 for ratings above 3, otherwise 0.
    """
    train_parts, val_parts = [], []
    for rating in (1, 2, 3, 4, 5):
        rated = df.loc[df['review_rating'] == rating]
        n_val = int(ratio * len(rated))
        # first n_val rows -> validation, the rest -> training (disjoint by construction)
        val_parts.append(rated[:n_val])
        train_parts.append(rated[n_val:])

    train_x = pd.concat(train_parts)
    val_x = pd.concat(val_parts)

    # positive (1) for ratings above 3, negative (0) otherwise
    train_x['score'] = train_x['review_rating'].apply(lambda x: 1 if x > 3 else 0)
    val_x['score'] = val_x['review_rating'].apply(lambda x: 1 if x > 3 else 0)

    train_y = train_x['score'].values
    val_y = val_x['score'].values

    # fix the class count so both arrays have two columns even if one class is absent
    y_train = to_categorical(train_y, num_classes=2)
    y_val = to_categorical(val_y, num_classes=2)

    # word-level tokenizer when no SentencePiece processor is supplied
    if sp is None:
        X_train = encode_docs(token, mean_length, train_x['review_body'])
        X_val = encode_docs(token, mean_length, val_x['review_body'])
    else:
        X_train = encode_docs_new_vocab(sp, mean_length, train_x['review_body'])
        X_val = encode_docs_new_vocab(sp, mean_length, val_x['review_body'])

    return X_train, y_train, X_val, y_val

def ml_model_score(vocab_size, input_length, dimension):
    """Build the two-class score model.

    Layers, in order: embedding, bidirectional CuDNNLSTM(64), dropout 0.4,
    dense 64 with relu, dropout 0.3, and a 2-unit softmax output.

    Args:
        vocab_size: size of the embedding vocabulary.
        input_length: length of each input id sequence.
        dimension: embedding dimension.

    Returns:
        An uncompiled Keras ``Model``.
    """
    tokens_in = Input(shape=(input_length,), dtype='int32')
    hidden = Embedding(vocab_size, dimension, input_length=input_length)(tokens_in)
    hidden = Bidirectional(CuDNNLSTM(64, return_sequences=False))(hidden)
    hidden = Dropout(0.4)(hidden)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = Dropout(0.3)(hidden)
    class_probs = Dense(2, activation='softmax')(hidden)
    return Model(tokens_in, class_probs)

def ml_model_topics(vocab_size, input_length, dimension):
    """Build the topic model with six independent sigmoid outputs.

    Layers, in order: embedding, bidirectional CuDNNLSTM(64), dropout 0.4,
    dense 64 with relu, and a 6-unit sigmoid output.

    Args:
        vocab_size: size of the embedding vocabulary.
        input_length: length of each input id sequence.
        dimension: embedding dimension.

    Returns:
        An uncompiled Keras ``Model``.
    """
    tokens_in = Input(shape=(input_length,), dtype='int32')
    hidden = Embedding(vocab_size, dimension, input_length=input_length)(tokens_in)
    hidden = Bidirectional(CuDNNLSTM(64, return_sequences=False))(hidden)
    hidden = Dropout(0.4)(hidden)
    hidden = Dense(64, activation='relu')(hidden)
    topic_probs = Dense(6, activation='sigmoid')(hidden)
    return Model(tokens_in, topic_probs)

def plot_confusion_matrix(y_true, y_pred, classes, cmap=plt.cm.Blues, normalize=True):
    """
    Plot the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heat map.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        classes: tick labels, used on both axes.
        cmap: matplotlib colormap for the heat map.
        normalize: when True (the default, the previous fixed behaviour) each
            row is divided by its sum, so cells show the share of each true
            class; when False the raw counts are shown.

    Returns:
        The matplotlib ``Axes`` holding the plot.
    """

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        # row-wise normalization: each true class sums to 1
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

def precision(y_true, y_pred):
    """Precision metric: rounded true positives over rounded predicted positives.

    ``K.epsilon()`` is added to the denominator to avoid dividing by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())

def recall(y_true, y_pred):
    """Recall metric: rounded true positives over rounded actual positives.

    ``K.epsilon()`` is added to the denominator to avoid dividing by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual + K.epsilon())





In [None]:
""" split data for training"""
# First 20% of the rows -> validation, remaining 80% -> training, in the existing row order
# (no shuffling happens in this cell).
# NOTE(review): `data` was concatenated retailer by retailer, so the validation slice
# presumably covers only the first retailers in the list — confirm this is intended.
val_x = data[:int(0.2*len(data))]
train_x = data[int(0.2*len(data)):]

y_train = train_x['topic'].values
y_val = val_x['topic'].values

# Each 'topic' cell holds a sequence of flags; turn the column into one 2-D array.
y_train = np.asarray([list(v) for v in y_train])
y_val = np.asarray([list(v) for v in y_val])

# Encode the text with the SentencePiece model and pad to input_length.
X_train = encode_docs_new_vocab(sp, input_length, train_x['text'])
X_val = encode_docs_new_vocab(sp, input_length, val_x['text'])

### <font color='green'> Training on the whole dataset loaded in memory</font> 

In [None]:
model = ml_model_topics(vocab_size, input_length, 100)


# Save the weights only when the monitored validation metric improves.
# With metrics=['accuracy', ...] under TensorFlow 2.x the metric is logged as 'val_accuracy'
# (the name the generator-based training cell already monitors); 'val_acc' is not logged,
# so with that name the best-only checkpoint would never be written.
checkpoint = callbacks.ModelCheckpoint('EN_weights_model_topics.h5',
                                       monitor='val_accuracy', save_best_only=True,
                                       save_weights_only=True, verbose=0)

callbacks_list = [checkpoint]

# Multi-label setup: one sigmoid per topic, so binary cross-entropy is used.
model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy', f1])

model.fit(X_train, y_train, batch_size=128, epochs=10, validation_data=(X_val, y_val), callbacks=callbacks_list)

In [None]:
""" saving the model weights"""
# NOTE(review): model.save is called with the same filename that the ModelCheckpoint callback
# in the training cell writes to, so it presumably overwrites the best-weights file — confirm.
model.save('EN_weights_model_topics.h5')


### <font color='green'> Method 2: training with generators, loading one batch of data at a time</font> 

In [None]:

# saving training and validation sets as numpy array
np.save('X_train_file.npy', X_train)
np.save('X_val_file.npy', X_val)
np.save('y_val_file.npy', y_val)
np.save('y_train_file.npy', y_train)


In [None]:
""" loading the training and validation sets after restarting env (free memory)"""
# NOTE(review): these paths point to /content/drive/MyDrive/, while the save cell above writes
# the .npy files with relative names; the files presumably have to be copied to Drive
# first — confirm.
entrain_x = np.load('/content/drive/MyDrive/X_train_file.npy')
val_x  = np.load('/content/drive/MyDrive/X_val_file.npy')
entrain_y = np.load('/content/drive/MyDrive/y_train_file.npy')
val_y = np.load('/content/drive/MyDrive/y_val_file.npy')


In [None]:
import random
class Dataset_Gen:
    """Dataset Generator

    Loads the training and validation arrays from ``.npy`` files and serves
    them as endless streams of consecutive, full-size batches.

    ``train_samples`` / ``valid_samples`` are the numbers of complete batches
    and are meant to be passed as ``steps_per_epoch`` / ``validation_steps``.
    Rows beyond the last complete batch are not served (unless the data holds
    fewer rows than one batch, in which case that short batch is repeated).

    The batch position is stored on the instance (``data_sample_n``,
    ``v_sample_n``), so a new generator continues where the previous one stopped.
    """

    def __init__(self, train_x, valid_x, train_y, valid_y, batch):
        """Load the four arrays from disk and compute the batch counts.

        Args:
            train_x, valid_x, train_y, valid_y: paths of the ``.npy`` files.
            batch: number of rows per batch.
        """
        # Read necessary files from disk
        self.entrain_x = np.load(train_x)
        self.entrain_y = np.load(train_y)

        self.val_x = np.load(valid_x)
        self.val_y = np.load(valid_y)

        self.batch = batch

        # index of the next batch to serve, per stream
        self.data_sample_n = 0
        self.v_sample_n = 0

        # number of complete batches in each set
        self.train_samples = int(len(self.entrain_x)/batch)
        self.valid_samples = int(len(self.val_x)/batch)

    def get_train(self):
        """Yield ``(X, y)`` training batches forever, cycling over all complete batches."""
        while True:
            # wrap only after the last complete batch has been served
            # (the previous `== train_samples-1` test skipped that batch)
            if self.data_sample_n >= self.train_samples:
                self.data_sample_n = 0

            start = self.data_sample_n * self.batch
            X = self.entrain_x[start:start + self.batch]
            y = self.entrain_y[start:start + self.batch]

            self.data_sample_n += 1

            yield (X, y)

    def get_valid(self):
        """Yield ``(X, y)`` validation batches forever, cycling over all complete batches."""
        while True:
            # same wrap-around rule as get_train
            if self.v_sample_n >= self.valid_samples:
                self.v_sample_n = 0

            start = self.v_sample_n * self.batch
            X = self.val_x[start:start + self.batch]
            y = self.val_y[start:start + self.batch]
            self.v_sample_n += 1

            yield (X, y)
        

In [None]:
#dg = Dataset_Gen('/content/drive/MyDrive/X_train_file.npy', '/content/drive/MyDrive/X_val_file.npy', '/content/drive/MyDrive/y_train_file.npy', '/content/drive/MyDrive/y_val_file.npy', 128)
dg = Dataset_Gen('X_train_file.npy', 'X_val_file.npy', 'y_train_file.npy', 'y_val_file.npy', 128)

In [None]:
model = ml_model_topics(vocab_size, input_length, 100)


# Save weights only, and only when the monitored 'val_accuracy' improves.
checkpoint = callbacks.ModelCheckpoint('EN_weights_model_topics.h5',
                                       monitor='val_accuracy', save_best_only=True, 
                                           save_weights_only=True, verbose=0)
# lr_decay = callbacks.LearningRateScheduler(schedule=lambda epoch: 0.001 * (0.95 ** epoch))
callbacks_list = [checkpoint]

model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy', f1])

# Each epoch draws dg.train_samples batches from the training generator and
# dg.valid_samples batches from the validation generator.
model.fit_generator(dg.get_train(),dg.train_samples, epochs=10, initial_epoch=0, validation_data= dg.get_valid(), validation_steps = dg.valid_samples, callbacks=callbacks_list)

### <font color='green'>Model plotting</font> 

In [None]:
dot_img_file = 'model_1.png'
tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)

In [None]:
"""keras model to dot will Convert a Keras model to dot format."""
from keras.utils.vis_utils import plot_model,model_to_dot
# IPython.display provides `SVG`, the name used on the next line; importing `SVGA` fails.
from IPython.display import SVG
# Render the model graph as inline SVG through graphviz' `dot` program.
SVG(model_to_dot(model ,show_shapes=True, show_layer_names=True, dpi=65).create(prog='dot', format='svg'))

In [None]:
""" saving the model weights"""
# NOTE(review): model.save is called with the same filename that the ModelCheckpoint callback
# in the training cell writes to, so it presumably overwrites the best-weights file — confirm.
model.save('EN_weights_model_topics.h5')


In [None]:
"""another method for the generator"""
import random
def generator(features, labels, batch_size):
  """Yield ``(batch_features, batch_labels)`` forever, walking through the data.

  Batches are consecutive windows of ``batch_size`` rows. When a window runs
  past the end of the data it wraps around to the beginning, so every batch
  has exactly ``batch_size`` rows. (The previous version returned the first
  ``batch_size`` rows on every iteration.)

  Args:
    features: indexable sequence of feature rows.
    labels: indexable sequence of label rows, aligned with ``features``.
    batch_size: number of rows per batch.

  Yields:
    Two freshly allocated float arrays; their trailing shape follows the
    rows of ``features`` / ``labels`` instead of a hard-coded 256 / 6.

  Raises:
    ValueError: if ``features`` is empty.
  """
  n_samples = len(features)
  if n_samples == 0:
    raise ValueError('features is empty')
  start = 0
  while True:
    # row indices of the current window, wrapping at the end of the data
    picks = [(start + offset) % n_samples for offset in range(batch_size)]
    batch_features = np.array([features[i] for i in picks], dtype=float)
    batch_labels = np.array([labels[i] for i in picks], dtype=float)
    start = (start + batch_size) % n_samples
    yield batch_features, batch_labels

### <font color='green'>Evaluating the model</font> 

In [None]:
""" loading the training and validation sets after restarting env (free memory)"""

val_x  = np.load('X_val_file.npy')
val_y = np.load('y_val_file.npy')

In [None]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(val_x , val_y, batch_size=128)
# results follows the compile order: [loss, accuracy, f1]. Accuracy is a fraction in [0, 1],
# so it is scaled by 100 before being printed with a percent sign.
print("model accuracy on test data: ", f'{round(results[1] * 100, 3)}%')

In [None]:
print("Generate predictions for 3 samples")
predictions = model.predict(val_x[:20])
print("predictions shape:", predictions.shape)

In [None]:
predictions[:3]

In [None]:
# NOTE(review): `y_pred` and `one_hot_to_numbers` are only defined in later cells of the
# visible notebook, so this cell fails on a fresh top-to-bottom run — confirm the cell order.
print(dg.valid_samples)
print(len(dg.val_x)/128)
print(len(y_pred))
print(y_pred[:5])
print(type(np.asarray(y_pred[:10])))
type(one_hot_to_numbers(dg.val_y[:10]))

In [None]:
def numpy_to_list(labels):
  """Convert every array in ``labels`` to a plain Python list via ``tolist()``."""
  return [tab.tolist() for tab in labels]

def one_hot_to_numbers(labels):
  """Map each label row to the position of its first entry equal to 1.

  Rows without any entry equal to 1 map to 0, as before. Rows may be numpy
  arrays or plain sequences: previously a list row silently mapped to 0,
  because the bare ``except`` swallowed the resulting AttributeError.

  Args:
    labels: iterable of label rows.

  Returns:
    numpy array with one class index per row.
  """
  r_lables = []
  for tab in labels:
    # normalize arrays and plain sequences alike to a flat Python list
    row = np.asarray(tab).ravel().tolist()
    try:
      r_lables.append(row.index(1))
    except ValueError:
      # no entry equal to 1 in this row
      r_lables.append(0)
  return np.array(r_lables)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Predict dg.valid_samples batches drawn from the validation generator.
# NOTE(review): the generator continues from dg.v_sample_n, so the predictions line up with
# dg.val_y[:len(y_pred)] only if the stream starts at the first batch — confirm.
Y_pred = model.predict_generator(dg.get_valid(), dg.valid_samples)
# Reduce each row of topic outputs to the index of its largest value; the true labels are
# reduced to the index of their first entry equal to 1 by one_hot_to_numbers.
y_pred = np.argmax(Y_pred, axis=1)
print('Confusion Matrix')
print(confusion_matrix(one_hot_to_numbers(dg.val_y[:len(y_pred)]), y_pred))
print('Classification Report')
print(classification_report(one_hot_to_numbers(dg.val_y[:len(y_pred)]), y_pred, target_names=targets))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sn

def plot_eval(y_true, y_pred):
  """Print a classification report and show the confusion matrix as a heat map.

  Rows and columns of the heat map are labelled with the module-level ``targets``.
  """
  print( classification_report(y_true, y_pred) )
  matrix_frame = pnd.DataFrame(
      confusion_matrix(y_true, y_pred), index=targets, columns=targets)
  # plot appearance
  sn.set(font_scale=1.2)
  sn.heatmap(matrix_frame, annot=True, annot_kws={"size": 18},
             cmap='coolwarm', linewidth=0.5, fmt="")
  plt.title('confusion matrix')
  plt.show()




In [None]:
plot_eval(one_hot_to_numbers(dg.val_y[:len(y_pred)]), y_pred)

In [None]:
# NOTE(review): `predictions` comes from model.predict, and ml_model_topics ends in a sigmoid
# layer, so the values are presumably already in [0, 1]; it is unclear why exp is applied to
# column 1 — confirm the intent.
probs = np.exp(predictions[:,1])
probs