In [2]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
reviews = pd.read_csv('./spotify_app_reviews_dataset.csv', encoding='utf8')
print(reviews.head(20))
print(reviews.info())

                                               Review     label
0   Great music service, the audio is high quality...  POSITIVE
1   Please ignore previous negative rating. This a...  POSITIVE
2   This pop-up "Get the best Spotify experience o...  NEGATIVE
3     Really buggy and terrible to use as of recently  NEGATIVE
4   Dear Spotify why do I get songs that I didn't ...  NEGATIVE
5   The player controls sometimes disappear for no...  NEGATIVE
6   I love the selection and the lyrics are provid...  POSITIVE
7   Still extremely slow when changing storage to ...  NEGATIVE
8   It's a great app and the best mp3 music app I ...  POSITIVE
9   I'm deleting this app, for the following reaso...  NEGATIVE
10                    Can't play Spotify when on WiFi  NEGATIVE
11  I had amazon premium music family package and ...  NEGATIVE
12  Worst app always says I'm offline and never sh...  NEGATIVE
13        i hav any music that i like it is superðŸ™Œ  POSITIVE
14  Improve the IA to recommend songs an

In [4]:
reviews['Review'] = reviews['Review'].astype(str).apply(lambda r: r.lower())
# remove punctuation
reviews['Review'] = reviews['Review'].apply(lambda r: r.translate(str.maketrans('', '', string.punctuation)))
# remove noise like ðŸ™Œ
reviews['Review'] = reviews['Review'].astype(str).apply(lambda r: re.sub('[^A-Za-z0-9 ]+', '', r))


In [5]:
reviews['Review'].head(20)

0     great music service the audio is high quality ...
1     please ignore previous negative rating this ap...
2     this popup get the best spotify experience on ...
3       really buggy and terrible to use as of recently
4     dear spotify why do i get songs that i didnt p...
5     the player controls sometimes disappear for no...
6     i love the selection and the lyrics are provid...
7     still extremely slow when changing storage to ...
8     its a great app and the best mp3 music app i h...
9     im deleting this app for the following reasons...
10                       cant play spotify when on wifi
11    i had amazon premium music family package and ...
12    worst app always says im offline and never sho...
13              i hav any music that i like it is super
14    improve the ia to recommend songs and to find ...
15    android user  there are loads of glitches with...
16    i cant listen to my downloaded playlist while ...
17    it always crashing down unable to play and

In [6]:
from collections import Counter

print(Counter(reviews['label']))

label_map = {
    'NEGATIVE': 0,
    'POSITIVE': 1
}

reviews['label'] = reviews['label'].astype(str).replace(label_map).astype(int)
print(Counter(reviews['label']))


Counter({'NEGATIVE': 29423, 'POSITIVE': 23279})
Counter({0: 29423, 1: 23279})


  reviews['label'] = reviews['label'].astype(str).replace(label_map).astype(int)


In [7]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

X = reviews['Review']
y = reviews['label']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, padding='post')

X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=17)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=17)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(33728, 273) (33728,)
(8433, 273) (8433,)
(10541, 273) (10541,)


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

vocab_size = len(tokenizer.word_index) + 1  
embedding_dim = 100

simple_model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, trainable=False),
    layers.LSTM(129, return_sequences=True),
    layers.Dropout(0.5),
    layers.LSTM(64),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

simple_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
history = simple_model.fit(X_train, y_train, epochs=5, validation_data=(X_val, y_val))

Epoch 1/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 239ms/step - accuracy: 0.5457 - loss: 0.6892 - val_accuracy: 0.5559 - val_loss: 0.6870
Epoch 2/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 237ms/step - accuracy: 0.5610 - loss: 0.6864 - val_accuracy: 0.5559 - val_loss: 0.6869
Epoch 3/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 235ms/step - accuracy: 0.5554 - loss: 0.6870 - val_accuracy: 0.5559 - val_loss: 0.6872
Epoch 4/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 236ms/step - accuracy: 0.5599 - loss: 0.6859 - val_accuracy: 0.5559 - val_loss: 0.6869
Epoch 5/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 236ms/step - accuracy: 0.5647 - loss: 0.6852 - val_accuracy: 0.5559 - val_loss: 0.6869


In [9]:
loss, accuracy = simple_model.evaluate(X_test, y_test)
print('Test accuracy :', accuracy)

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 89ms/step - accuracy: 0.5666 - loss: 0.6844
Test accuracy : 0.559339702129364


In [10]:
embedding_dim = 50
glove_file = './glove.6B.50d.txt'  
embeddings_index = {}

with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


model_pretrained = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.5),
    layers.LSTM(64),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

model_pretrained.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history_pretrained = model_pretrained.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 146ms/step - accuracy: 0.5587 - loss: 0.6877 - val_accuracy: 0.5559 - val_loss: 0.6869
Epoch 2/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 144ms/step - accuracy: 0.5558 - loss: 0.6872 - val_accuracy: 0.5559 - val_loss: 0.6869
Epoch 3/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 143ms/step - accuracy: 0.5588 - loss: 0.6868 - val_accuracy: 0.5559 - val_loss: 0.6869
Epoch 4/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 143ms/step - accuracy: 0.5594 - loss: 0.6867 - val_accuracy: 0.5559 - val_loss: 0.6869
Epoch 5/5
[1m1054/1054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 144ms/step - accuracy: 0.5595 - loss: 0.6861 - val_accuracy: 0.5559 - val_loss: 0.6869


In [11]:
loss, accuracy = model_pretrained.evaluate(X_test, y_test)
print('Test accuracy :', accuracy)

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - accuracy: 0.5666 - loss: 0.6845
Test accuracy : 0.559339702129364
