## Importação das bibliotecas

In [None]:
from keras_preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Conv1D, MaxPool1D, Flatten
from keras.utils import to_categorical #transforma o rating pra categorizar entre 1 e 5
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

In [None]:
# Fetch the NLTK resources requested below; the names match what text_process
# relies on (the WordNet lemmatizer data and the stopword corpus).
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

## Importação do dataset

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): this import sits outside the notebook's import cell and only
# works when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the reviews into a DataFrame.
# NOTE(review): the path is relative to the working directory, not to the
# /content/drive mount point — confirm where reviews.csv is expected to live.
df = pd.read_csv("reviews.csv")

In [None]:
# Inspect column dtypes and non-null counts of the freshly loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6209 entries, 0 to 6208
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Time_submitted  6209 non-null   object 
 1   Review          6209 non-null   object 
 2   Rating          6208 non-null   float64
 3   Total_thumbsup  6208 non-null   float64
 4   Reply           13 non-null     object 
dtypes: float64(2), object(3)
memory usage: 242.7+ KB


In [None]:
# Drop, in place, every row whose "Rating" is missing; other columns may still
# contain nulls because only "Rating" is listed in `subset`.
df.dropna(axis=0, inplace=True, subset=["Rating"])

In [None]:
# Re-inspect the frame to confirm that no row with a missing "Rating" remains.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6208 entries, 0 to 6207
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Time_submitted  6208 non-null   object 
 1   Review          6208 non-null   object 
 2   Rating          6208 non-null   float64
 3   Total_thumbsup  6208 non-null   float64
 4   Reply           13 non-null     object 
dtypes: float64(2), object(3)
memory usage: 291.0+ KB


## Funções

In [None]:
def _english_stopwords():
    '''
    Return the NLTK English stopwords as a frozenset, loading them only once.

    The corpus is read on the first call and the resulting set is cached on
    the function object, so later calls are a plain attribute lookup.
    '''
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def text_process(text):
    '''
    Clean a string of text and return it as a list of lemmatized words.

    Steps, in order:
    1. Remove every punctuation character (string.punctuation) and every digit
    2. Lowercase the text and split it on whitespace
    3. Remove English stopwords
    4. Lemmatize each remaining word as a verb, then as a noun

    Parameters:
        text: the string to clean.

    Returns:
        A list of lowercase, lemmatized words; empty when nothing survives.
    '''
    lemmatizer = WordNetLemmatizer()
    stop_set = _english_stopwords()

    # Single pass over the characters: punctuation and digits are both dropped.
    cleaned = ''.join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )

    # Lowercase BEFORE the stopword test: the stopword list is lowercase, so
    # testing the original casing let words such as "I", "The" and "This" through.
    # NOTE(review): apostrophes are stripped above, so contracted stopwords
    # (e.g. "didn't" -> "didnt") are still not matched — confirm whether intended.
    tokens = [word for word in cleaned.lower().split() if word not in stop_set]

    # Two lemmatization passes: verb forms first, then the default (noun) pass.
    tokens = [lemmatizer.lemmatize(word, pos="v") for word in tokens]
    return [lemmatizer.lemmatize(word) for word in tokens]

def remove_emoji(string):
  """
  Return ``string`` with every character from the emoji ranges below removed.

  Only the four code-point ranges listed here are stripped; any other
  character, including symbols outside these ranges, is left untouched.
  """
  faixas = (
      u"\U0001F600-\U0001F64F",  # emoticons
      u"\U0001F300-\U0001F5FF",  # symbols & pictographs
      u"\U0001F680-\U0001F6FF",  # transport & map symbols
      u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
  )
  # One character class covering all ranges; "+" swallows whole runs at once.
  padrao = "[" + "".join(faixas) + "]+"
  return re.sub(padrao, r'', string, flags=re.UNICODE)

def frase_para_id(frase):
  """
  Translate a sequence of words into the list of their ids.

  Each word is looked up by label in the global ``dicionario`` table and the
  value of its ``id`` column is collected, preserving the order of ``frase``.
  A word that is not a label of ``dicionario`` makes the lookup fail.
  """
  return [dicionario.loc[palavra].id for palavra in frase]

## Preparação dos dados

In [None]:
# Add a column holding each review as an array of cleaned words (see text_process).
# NOTE: emoji stripping through remove_emoji is not applied at this step.
df['Review_treated'] = df['Review'].map(lambda review: np.array(text_process(review)))

In [None]:
# Binarise the rating: 0 when the rating is below 4, 1 otherwise.
Rating_binary = [0 if rating < 4 else 1 for rating in df.Rating]

df['Rating_binary'] = Rating_binary

In [None]:
# Build the word dictionary: one row per distinct word with its total frequency.

# Flatten every cleaned review into a single list of words.
lista_unica = [palavra for frase in df.Review_treated.values for palavra in frase]

# np.unique returns (distinct words, counts); transposing turns that pair into
# two columns, which are then given readable names.
dicionario = (
    pd.DataFrame(np.unique(np.array(lista_unica), return_counts=True))
    .T
    .rename(columns={0:"word", 1:"frequency"})
)

In [None]:
# Vocabulary size: the `num_words` most frequent words keep an id of their own;
# every rarer word shares the single id `num_words + 1` (here, 5001).
num_words = 5000

# Rank the words from most to least frequent and renumber the rows 0..N-1.
dicionario = dicionario.sort_values(by="frequency", ascending=False).reset_index(drop=True)

# Ids start at 1 so that 0 stays free for the padding value that pad_sequences
# inserts; otherwise padding is indistinguishable from the most frequent word.
# A single vectorised assignment replaces the chained `.iloc[...]["id"] = ...`,
# which writes to a temporary copy (SettingWithCopyWarning) and is not
# guaranteed to modify `dicionario`.
# Downstream, the Embedding layer needs input_dim >= num_words + 2.
dicionario['id'] = np.where(
    dicionario.index < num_words,
    dicionario.index + 1,
    num_words + 1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dicionario.iloc[num_words:]["id"] = num_words+1


In [None]:
# Index the dictionary by word so that frase_para_id can look words up by label.
dicionario.index = dicionario["word"]
# Display only: the result of drop() is not assigned, so `dicionario` itself
# still carries its "word" column after this cell.
dicionario.drop(columns="word")

Unnamed: 0_level_0,frequency,id
word,Unnamed: 1_level_1,Unnamed: 2_level_1
i,60044,0
app,35434,1
song,32828,2
music,25582,3
play,20783,4
...,...,...
onplease,1,5001
onpoint,1,5001
onpun,1,5001
onr,1,5001


In [None]:
# Turn each cleaned review into an array of word ids taken from the dictionary.
df['Review_sequence'] = df['Review_treated'].map(lambda palavras: np.array(frase_para_id(palavras)))

In [None]:
# Display the frame with the derived Review_treated, Review_sequence and
# Rating_binary columns next to the original ones.
df

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply,Review_treated,Review_sequence,Rating_binary
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,,"[great, music, service, audio, high, quality, ...","[17, 3, 75, 215, 490, 111, 1, 79, 7, 45, 894, ...",1
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,,"[please, ignore, previous, negative, rat, this...","[36, 1145, 535, 1034, 330, 41, 1, 315, 17, 0, ...",1
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,,"[this, popup, get, best, spotify, experience, ...","[41, 1233, 12, 37, 5, 80, 165, 58, 36, 86, 12,...",1
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,,"[really, buggy, terrible, use, recently]","[33, 228, 271, 7, 177]",0
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,,"[dear, spotify, i, get, song, i, didnt, put, p...","[1412, 5, 0, 12, 2, 0, 172, 157, 11, 85, 70, 4]",0
...,...,...,...,...,...,...,...,...
61589,2022-01-01 03:01:29,Even though it was communicated that lyrics fe...,1,6,,"[even, though, communicate, lyric, feature, av...","[21, 167, 2441, 116, 74, 187, 103, 5001, 101, ...",0
61590,2022-01-01 02:13:40,"Use to be sooo good back when I had it, and wh...",1,0,,"[use, sooo, good, back, i, i, download, free, ...","[7, 1279, 16, 52, 0, 0, 35, 56, 106, 406, 223,...",0
61591,2022-01-01 01:02:29,This app would be good if not for it taking ov...,2,10,,"[this, app, would, good, take, device, i, star...","[41, 1, 44, 16, 114, 145, 0, 66, 5001, 93, 5, ...",0
61592,2022-01-01 00:49:23,The app is good hard to navigate and won't jus...,2,1,,"[the, app, good, hard, navigate, wont, let, pl...","[18, 1, 16, 309, 539, 69, 86, 4, 2, 202, 4, 11...",0


## Train test split and padding

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed makes the split repeatable.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    df["Review_sequence"],
    df["Rating_binary"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Bring every id sequence to a length of exactly 500 with the library's default
# padding/truncating side (the 'post' variants are deliberately not passed).
x_treino_pad, x_teste_pad = (
    sequence.pad_sequences(particao, maxlen=500)
    for particao in (x_treino, x_teste)
)

In [None]:
# Convert both label collections to plain NumPy arrays before training.
y_treino, y_teste = (np.array(alvo) for alvo in (y_treino, y_teste))

## Modelo

In [None]:
# Binary classifier: embedding -> single LSTM layer -> one sigmoid output unit.
modelo = Sequential([
    Embedding(10000, 50, input_length=500),  # 50-dim vectors, sequences of 500 ids
    LSTM(50),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts of the model.
modelo.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 500, 50)           250050    
                                                                 
 lstm_4 (LSTM)               (None, 50)                20200     
                                                                 
 dense_4 (Dense)             (None, 1)                 51        
                                                                 
Total params: 270,301
Trainable params: 270,301
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile for binary classification, tracking accuracy.
# NOTE(review): no optimizer is passed, so the Keras default applies
# (presumably rmsprop — confirm for the installed version).
modelo.compile(loss='binary_crossentropy', metrics=['accuracy'])
# Train for 10 epochs; the held-out test split doubles as validation data, so
# the per-epoch validation metrics are computed on the test set itself.
modelo.fit(x_treino_pad, y_treino, epochs=10, validation_data=(x_teste_pad, y_teste))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff2845ddb50>

## Conclusão

O objetivo do trabalho era classificar comentários (reviews) do Spotify como positivos ou negativos.
Para isso, consideramos negativos os reviews com nota menor ou igual a 3 e positivos os reviews com nota 4 ou 5.

Usando uma rede neural recorrente, criamos um modelo capaz de realizar essa tarefa com acurácia próxima de 90% no conjunto de teste, valor bastante próximo da acurácia obtida no conjunto de treino.