<a href="https://colab.research.google.com/github/matiasvallejosdev/ar-covid-interactive-visualizer/blob/WorldVisualizer/Colab_5_Construir_RNR_Valoraciones_de_Google_Play.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment classifier for Google Play reviews

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.layers as tfl

import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus that preprocess_text reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## 1. Load data


In [None]:
# Mount Google Drive at /content/drive so the dataset CSV can be read from it.
# This import only resolves inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the Google Play user-reviews CSV on the mounted Drive.
REVIEWS_CSV_PATH = '/content/drive/My Drive/Machine Learning/Datasets/googleplaystore_reviews/googleplaystore_user_reviews.csv'

dataframe = pd.read_csv(REVIEWS_CSV_PATH)

In [None]:
# Discard every row that contains at least one missing value, then preview the result.
dataframe = dataframe.dropna(axis=0, how='any')
dataframe.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3


In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
dataframe.describe()

Unnamed: 0,Sentiment_Polarity,Sentiment_Subjectivity
count,37427.0,37427.0
mean,0.182171,0.49277
std,0.351318,0.259904
min,-1.0,0.0
25%,0.0,0.357143
50%,0.15,0.514286
75%,0.4,0.65
max,1.0,1.0


## 2. Preprocessing data

In [None]:
# Keep only the review text and its sentiment label.
# The explicit .copy() makes this an independent frame: the later column
# assignment and in-place drop then act on a real DataFrame instead of a
# selection of the original one, which is what raises SettingWithCopyWarning.
dataframe = dataframe[['Translated_Review', 'Sentiment']].copy()
dataframe.head()

Unnamed: 0,Translated_Review,Sentiment
0,I like eat delicious food. That's I'm cooking ...,Positive
1,This help eating healthy exercise regular basis,Positive
3,Works great especially going grocery store,Positive
4,Best idea us,Positive
5,Best way,Positive


In [None]:
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # stopwords.words() re-reads the corpus and returns a list on every call,
        # so keep a single frozenset for O(1) membership tests.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(sen, stop_words=None):
    """Normalise a review: letters only, lowercase, stopwords removed.

    Args:
        sen: Raw review text (str).
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stopword list is used (loaded once and
            reused across calls).

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # Replace punctuation and digits (anything that is not a letter) with spaces.
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Remove isolated single letters that are surrounded by whitespace.
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Collapse runs of whitespace into one space.
    sentence = re.sub(r'\s+', ' ', sentence)

    # Lowercase so the stopword comparison is case-insensitive.
    sentence = sentence.lower()

    # Drop the stopwords.
    return ' '.join(word for word in sentence.split() if word not in stop_words)

In [None]:
# Replace each raw review with its cleaned version, then display the frame.
dataframe['Translated_Review'] = dataframe['Translated_Review'].apply(preprocess_text)
dataframe

Unnamed: 0,Translated_Review,Sentiment
0,like eat delicious food cooking food case best...,Positive
1,help eating healthy exercise regular basis,Positive
3,works great especially going grocery store,Positive
4,best idea us,Positive
5,best way,Positive
...,...,...
64222,ads older many agents much owner posted detail...,Positive
64223,photos posted portal load fit purpose sure sto...,Positive
64226,dumb app wanted post property rent give option...,Negative
64227,property business got link sms happy performan...,Positive


In [None]:
def determine_class(label):
  """Map a sentiment label to its integer class id.

  'Positive' -> 0, 'Neutral' -> 1, 'Negative' -> 2.

  Raises:
    ValueError: if label is not one of the three known sentiments. Failing
      here avoids silently returning None and corrupting the label array.
  """
  class_ids = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
  try:
    return class_ids[label]
  except (KeyError, TypeError):
    raise ValueError('Unknown sentiment label: {!r}'.format(label))

# Labelling mode:
#   REMOVE_NEUTRAL          -> drop neutral reviews, binary labels (Positive=1, else 0)
#   MERGE_NEGATIVE_NEUTRAL  -> keep all reviews, binary labels (Positive=1, else 0)
#   neither                 -> three classes via determine_class
REMOVE_NEUTRAL = False
MERGE_NEGATIVE_NEUTRAL = False

if REMOVE_NEUTRAL:
  neutral_mask = dataframe['Sentiment'] == 'Neutral'
  dataframe.drop(index=dataframe.loc[neutral_mask].index, inplace=True)

sentiment = dataframe['Sentiment']
if REMOVE_NEUTRAL or MERGE_NEGATIVE_NEUTRAL:
  y = (sentiment == 'Positive').astype(int).to_numpy()
else:
  y = sentiment.apply(determine_class).to_numpy()

X = dataframe['Translated_Review']
y = y.astype(np.uint8)

In [None]:
# Sanity check: features and labels must have the same number of rows.
print('The shape of X is:', X.shape)
print('The shape of y is:', y.shape)

The shape of X is: (37427,)
The shape of y is: (37427,)


In [None]:
# Preview the first cleaned reviews.
X.head()

0    like eat delicious food cooking food case best...
1           help eating healthy exercise regular basis
3           works great especially going grocery store
4                                         best idea us
5                                             best way
Name: Translated_Review, dtype: object

In [None]:
# Inspect the encoded labels.
y

array([0, 0, 0, ..., 2, 0, 2], dtype=uint8)

Split the data into training and test sets

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Tokenize the reviews (keeping only the most frequent words) and pad the sequences to a fixed length

In [None]:
NUMBER_OF_WORDS = 20000  # vocabulary cap passed to the tokenizer as num_words
MAX_LEN = 100            # fixed sequence length used when padding

# Fit the vocabulary on the training reviews only, then encode both splits with it.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=NUMBER_OF_WORDS)
tokenizer.fit_on_texts(X_train)

X_train, X_test = [tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)]

In [None]:
# Bring every encoded review to exactly MAX_LEN tokens, padding at the end.
pad_options = dict(padding='post', maxlen=MAX_LEN)
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, **pad_options)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, **pad_options)

## 3. Create the model

In [None]:
# Input dimension of the embedding layer; equals the tokenizer's num_words cap.
VOCABULARY_SIZE = NUMBER_OF_WORDS
# Output dimension of the embedding layer (length of each word vector).
EMBEDDING_SIZE = 128

In [None]:
# Embedding -> LSTM -> dense head that outputs one probability per sentiment class.
# NOTE(review): the inputs are post-padded with zeros and the embedding does not mask
# them, so the LSTM also steps over the padding. Consider mask_zero=True after
# confirming it works with empty (all-padding) reviews on the target runtime.
model = tf.keras.Sequential([
    tfl.Embedding(VOCABULARY_SIZE, EMBEDDING_SIZE, input_shape=(X_train.shape[1],)),
    tfl.LSTM(units=128, activation='tanh'),
    tfl.Dense(64, activation='relu'),
    tfl.Dropout(0.2),
    # softmax, not sigmoid: the model is trained with sparse_categorical_crossentropy,
    # which expects a probability distribution over mutually exclusive classes. With
    # independent sigmoids the loss can be minimised by driving every output towards 1,
    # so the network never has to discriminate between classes.
    tfl.Dense(units=np.unique(y_train).shape[0], activation='softmax'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
dropout_11 (Dropout)         (None, 100, 128)          0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 387       
Total params: 2,691,971
Trainable params: 2,691,971
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Integer class labels -> sparse categorical loss and the matching accuracy metric.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Training hyperparameters passed to model.fit.
EPOCHS = 10        # full passes over the training set
BATCH_SIZE = 128   # samples per gradient update

In [None]:
# Train on the training split; the returned History object is shown as the cell output.
model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6bef0e0b90>

In [None]:
# Evaluate on the held-out test split; evaluate() returns the loss followed by the compiled metric.
# NOTE(review): `test_acurracy` is a misspelling of "accuracy"; the name is kept in case later cells use it.
test_loss, test_acurracy = model.evaluate(X_test, y_test)
print("Test accuracy: {}".format(test_acurracy))

Test accuracy: 0.6391931772232056
