In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import libraries

In [None]:
import pandas as pd
import torch
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Passing None removes the display limit, so every column and every row of a
# DataFrame is rendered when it is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings

Import file

In [None]:
import pandas as pd

# Location of the Spotify app reviews CSV inside the Kaggle input directory.
datafile = '/kaggle/input/spotify-app-reviews-2022/reviews.csv'
# Read the reviews into a DataFrame, decoding the file as UTF-8.
df = pd.read_csv(datafile, encoding="utf-8")


# Preview the first rows of the loaded data.
df.head()

Understand Data

In [None]:
# Print the column names, dtypes and non-null counts of the reviews frame.
df.info()

Remove URLS

In [None]:
import re


def remove_url(text):
    """Delete every URL-like token from ``text``.

    A token is removed when it starts with ``http`` and it extends up to the
    next whitespace character; the surrounding whitespace is left in place.
    """
    return re.sub(r"http\S+", "", text)


# Example string with stylised (non-ASCII) letters, a URL and an emoji; only the URL is removed here.
sample = "ℍ𝕚 𝔼𝕧𝕖𝕣𝕪𝕠𝕟𝕖 \n https://www.kaggle.com/ 😊"
print(f"Text before removing url:- \n {sample}")

# `sample` is overwritten with the URL-free text.
sample = remove_url(sample)
print(f"Text after removing url:- \n {sample}")

Normalise unicode data

In [None]:
import unicodedata as uni

print(f"Text before Unicode Normalization:- \n {sample}")

# NFKD is the compatibility decomposition form; `sample` is overwritten with the normalised text.
sample = uni.normalize('NFKD', sample)
print(f"Text after Unicode Normalization:- \n {sample}")

Install Demoji library

In [None]:
!pip install demoji

Clean emojis

In [None]:
import demoji


def handle_emoji(string):
    """Replace every emoji in ``string`` with a space followed by its description.

    ``demoji.findall`` is used as a mapping from each emoji found in the text
    to its description; only the part of the description before the first
    ``:`` is kept.

    Args:
        string: Text that may contain emojis.

    Returns:
        The text with each emoji replaced by ``" <description>"``.
    """
    emojis = demoji.findall(string)

    # Replace the longest emoji sequences first. A multi-codepoint emoji (for
    # example one with a skin-tone modifier) contains a shorter emoji as a
    # prefix; replacing the shorter one first would break the longer sequence
    # and leave stray modifier characters in the text.
    for emoji in sorted(emojis, key=len, reverse=True):
        string = string.replace(emoji, " " + emojis[emoji].split(":")[0])

    return string


# Show the sample text before and after emojis are replaced by their descriptions.
print(f"Before Handling emoji:- \n {sample}")
print(f"After Handling emoji:- \n {handle_emoji(sample)}")

Tokenise words

In [None]:
def word_tokenizer(text):
    """Lowercase ``text`` and split it on runs of whitespace.

    Returns:
        A list of lowercase tokens (an empty list for empty or blank text).
    """
    return text.lower().split()


# New sample sentence used by the tokenizer and stopword demos.
sample = "Hi Everyone I really love the playlists on this app."
print(sample)
print(word_tokenizer(sample))


Here I create a custom list of stopwords that includes sentiment words, the name of the app, and other unimportant words

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Extra stopwords: sentiment words, the app name and other tokens that carry no topic information.
custom_stopwords = [
    "spotify", "i", "I", "app", "happy", "sad", "work", "hi", "fun", "idk",
    "lol", "ive", "trash", "god", "guys", "dont", "worst", "days", "right", "thanks",
    "lots", "week", "nice", "kind", "day", "ca", "garbage", "today", "sucks", "awesome",
    "everytime", "bit", "thank", "easy", "things", "keeps", "people", "lot", "thing", "way",
    "times", "im", "angry", "ve", "music", "great", "don", "want", "good", "really",
    "love", "hate", "songs", "song", "like", "just",
]
en_stopwords = set(stopwords.words('english'))
# Union of the custom list, scikit-learn's English stopwords and NLTK's English stopwords.
all_stopwords = set(custom_stopwords) | set(ENGLISH_STOP_WORDS) | en_stopwords
print(f"Stop Words in English : \n{ all_stopwords}")

Remove Stopwords function

In [None]:

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``all_stopwords``.

    Args:
        text: Iterable of tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept = []
    for word in text:
        if word in all_stopwords:
            continue
        kept.append(word)
    return kept


# Compare the tokenised sample before and after stopword removal.
print(f"Before removing stopwords : {word_tokenizer(sample)}")
print(f"After removing stopwords : {remove_stopwords(word_tokenizer(sample))}")

Stemming step below (Porter stemmer); lemmatisation with spaCy follows further down

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# Shared Porter stemmer instance used by stemming() below.
stemmer = PorterStemmer()


def stemming(text):
    """Stem every token of ``text`` with the module-level ``stemmer``.

    Args:
        text: Iterable of tokens.

    Returns:
        A list with one stemmed token per input token, in the same order.
    """
    stems = []
    for word in text:
        stems.append(stemmer.stem(word))
    return stems


# New sample sentence used by the stemming and lemmatisation demos.
sample = "I am creating a Notebook"
print(f"Before Stemming : {(sample)}")
print(f"After Stemming : {stemming(word_tokenizer(sample))}")

Import Spacy

In [None]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline once; lemmatization() below reuses it.
sp = spacy.load("en_core_web_sm")

Alternative lemmatisation using Spacy library

In [None]:
def lemmatization(text):
    """Lemmatise a list of tokens with the spaCy pipeline ``sp``.

    The tokens are joined with single spaces and parsed as one document, so
    the result follows spaCy's own tokenisation of the joined text.

    Args:
        text: Iterable of string tokens.

    Returns:
        A list with the lemma of every token spaCy produced.
    """
    doc = sp(" ".join(text))
    return [tok.lemma_ for tok in doc]


# Compare the tokenised sample before and after lemmatisation.
print(f"Before Lemmatization : {word_tokenizer(sample)}")
print(f"After Lemmatization : {lemmatization(word_tokenizer(sample))}")

Define pre-processing function

In [None]:
from gensim.models import Phrases

def preprocessing(text):
    """Clean one raw review and return it as a single space-joined string.

    Steps, in order: remove URLs, NFKD-normalise unicode, replace emojis with
    their descriptions, lowercase, drop punctuation, tokenise on whitespace,
    lemmatise and remove stopwords.

    Args:
        text: Raw review text. Non-string values (for example ``None`` or the
            float ``NaN`` that pandas uses for missing cells) are treated as
            an empty review.

    Returns:
        The cleaned review as a string; ``""`` for non-string input.
    """
    # The regex and unicode helpers below only accept ``str`` and would raise
    # TypeError on a missing value, aborting the whole ``progress_apply``.
    if not isinstance(text, str):
        return ""

    text = remove_url(text)
    text = uni.normalize('NFKD', text)
    text = handle_emoji(text)
    text = text.lower()
    # Keep only word characters and whitespace (drops punctuation and symbols).
    text = re.sub(r'[^\w\s]', '', text)
    text = word_tokenizer(text)
    text = lemmatization(text)
    text = remove_stopwords(text)

    return " ".join(text)

Apply pre-processing

In [None]:
from tqdm import tqdm

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

# Clean every raw review; the result is stored next to the original text.
df['clean_review'] = df['Review'].progress_apply(preprocessing)


See what cleaned review looks like

In [None]:
# Preview the frame, which now includes the `clean_review` column.
df.head()

Tokenise clean review

In [None]:
from tqdm import tqdm
tqdm.pandas()

# Split each cleaned review back into a list of lowercase tokens.
df['clean_review2'] = df['clean_review'].progress_map(word_tokenizer)
# One token list per review, as plain Python lists.
data_words = df['clean_review2'].tolist()

df.head(20)



Create list of all words

In [None]:
# Token lists for every cleaned review, plus the untouched raw review strings.
data_words = df['clean_review2'].values.tolist()
all_words = df['Review'].values.tolist()
# Keep the count as the cell's last expression: a bare expression in the
# middle of a cell is evaluated but never displayed.
len(data_words)

Create a dictionary and corpus

In [None]:
import gensim.corpora as corpora

# Create Dictionary: built from the token lists of all reviews
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency: one bag-of-words per review
corpus = [id2word.doc2bow(text) for text in texts]
# `texts` is `data_words`, so a second doc2bow pass would produce exactly the
# same rows as `corpus`; reuse them instead of recomputing every document.
# NOTE(review): despite the name, no bigram/phrase detection is applied here.
corpus_bigram = list(corpus)
# View the first 30 entries of the first document's bag-of-words
print(corpus[:1][0][:30])



Example preprocessing 

In [None]:

# Map each bag-of-words document back to its tokens (the counts are dropped).
tokenized_corpus = [[id2word[token_id] for token_id, _count in bow] for bow in corpus]

# Run the full cleaning pipeline on a hand-written review.
processed_text = preprocessing("I love spotify so much it's the best thing every and I'm impressed by the audio quality")
print(processed_text)

LDA model building for topic modelling

In [None]:
from gensim.models import LdaMulticore
from gensim.models import LdaModel
from pprint import pprint

# number of topics
num_topics = 10
# Build LDA model. A fixed random_state seeds the model's random initialisation,
# so the topics (and the labels hand-picked from them) are not reshuffled on
# every run.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs - confirm if exact reproducibility is required.
lda_model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics, iterations=400, random_state=0)

# Print the keywords of each topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


Although topics aren't really well defined, I've manually chosen these labels from the topics for the further analysis:

advert
subscription_or_cost
song_selection
user_experience
comparison
audio_quality
support
podcasts
connectivity
podcast
recommendation
playlists

Below I will visualise the clusters from the LDA analysis

In [None]:
import numpy as np
from sklearn.manifold import TSNE
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import Label
from bokeh.palettes import Category10
output_notebook()

# Get Document-Topic Distributions as a dense (documents x topics) matrix.
# Topics that the model does not return for a document keep the initial 0.
doc_topic_dists = np.zeros((len(corpus), num_topics))
for row, topic_probs in enumerate(lda_model[corpus_bigram]):
    for topic_id, weight in topic_probs:
        doc_topic_dists[row, topic_id] = weight
print(doc_topic_dists.shape)


# Use t-SNE to reduce dimensionality: each document's topic distribution is
# projected to 2 components; random_state=0 keeps the layout repeatable.
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(doc_topic_dists)

# Visualising using Bokeh
n_topics = lda_model.num_topics
colormap = np.array(Category10[10])
# Dominant topic (index of the highest probability) for every document.
_lda_keys = np.array(doc_topic_dists).argmax(axis=1).tolist()

# Finding the centroid of each topic.
# NOTE: a centroid is appended only for topics that are dominant in at least
# one document, so the position in this list equals the topic id only when no
# topic is empty.
_doc_colors = colormap[_lda_keys]
_mean_topic_vectors = []
for t in range(n_topics):
    _is_member = _doc_colors == colormap[t]
    if _is_member.any():
        _mean_topic_vectors.append(tsne_lda[_is_member].mean(axis=0))
        
# For every topic, join its three highest-weighted words into one label string.
top_3_words_lda = [
    ", ".join(word for word, _score in lda_model.show_topic(topic_id, 3))
    for topic_id in range(n_topics)
]
    



The mean sigma is not great but this is fine for our type of analysis

In [None]:
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), width=700, height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=colormap[_lda_keys])

# Label each topic at the centroid of the documents it dominates. The centroid
# is computed per topic id here rather than read from `_mean_topic_vectors`:
# that list has no entry for topics without documents, so indexing it by
# topic id can attach a label to the wrong cluster or raise IndexError.
_topic_of_doc = np.asarray(_lda_keys)
for t in range(n_topics):
    members = tsne_lda[_topic_of_doc == t]
    if len(members) == 0:
        # No document has this topic as its dominant one: nothing to label.
        continue
    centroid = members.mean(axis=0)
    label = Label(x=float(centroid[0]), y=float(centroid[1]),
                  text=top_3_words_lda[t], text_color=colormap[t])
    plot.add_layout(label)

show(plot, notebook_handle=True)

Clusters are not well defined but this is fine as we were able to pre-define topics for our analysis below

Next to train the FastText model

In [None]:
# load in open-ai labelled data
input_path = '/kaggle/input/final-dataset-msc-3/assistant_label_responses.txt'
f = open(input_path,'r')

!head /kaggle/input/final-dataset-msc-3/assistant_label_responses.txt
!wc /kaggle/input/final-dataset-msc-3/assistant_label_responses.txt

Preprocess training data

In [None]:
import re

def preprocess_text(text):
    """Normalise one line of text for fastText.

    The punctuation characters ``. ! ? , " / ( )`` are surrounded by spaces so
    they become separate tokens, the text is lowercased, and every run of
    whitespace is collapsed to a single space with the ends trimmed.

    Returns:
        The normalised line (without a trailing newline).
    """
    spaced = re.sub(r'([.\!?,"/()])', r' \1 ', text)
    return re.sub(r'\s+', ' ', spaced.lower()).strip()

# Normalise every line of the labelled file and write the result, one line per
# input line, to a new text file in the working directory.
with open("/kaggle/input/final-dataset-msc-3/assistant_label_responses.txt", "r", encoding="utf-8") as infile, \
        open("spotify_final_msc-3.preprocessed.txt", "w", encoding="utf-8") as outfile:
    outfile.writelines(preprocess_text(line) + "\n" for line in infile)

Split training and validation data

In [None]:
import fasttext
!tail -n 1735 /kaggle/input/final-dataset-msc-3/assistant_label_responses.txt > spotify.train
!head -n 434 /kaggle/input/final-dataset-msc-3/assistant_label_responses.txt > spotify.valid
model = fasttext.train_supervised(input="spotify.train", lr=0.73, epoch=67, wordNgrams=1, bucket=200000, dim=50, loss='ova')
model.save_model("model_spotify_final.bin") 

Test in a sample review not from the training set

In [None]:
# Ask for up to 12 labels (k=12) for a hand-written review.
model.predict("I hate the adds despite paying so much, and audio just sounds tinny", k=12)

The model correctly assigns a high probability to the aspects "adverts" and "subscription_or_cost" for the above example, and a borderline acceptable weight to "user_experience". However, even though audio quality is the 4th most likely aspect (along with support), its probability is low

Calculate precision and recall across thresholds and pick the threshold with the best F1-score

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sweep 100 probability thresholds from 1 down to 0 and record the validation
# precision, recall and F1-score at each one.
thresholds = np.linspace(1, 0, 100)
precisions = []
recalls = []
f1_scores = []
num_labels = 5

for threshold in thresholds:
    _, precision, recall = model.test("spotify.valid", k=num_labels, threshold=threshold)

    precisions.append(precision)
    recalls.append(recall)
    # F1 is undefined when precision + recall is zero or NaN (for example when
    # a very high threshold yields no predictions). Record NaN instead of
    # dividing by zero.
    denominator = precision + recall
    if np.isnan(denominator) or denominator == 0:
        f1_scores.append(np.nan)
    else:
        f1_scores.append(2 * (precision * recall) / denominator)

# Plotting
plt.figure(figsize=(10,7))
plt.plot(recalls, precisions, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid()

# Highlight the maximum F1-score point.
# np.argmax returns the position of the first NaN if one is present, so
# undefined scores are ranked below every real score before taking the maximum.
_f1_array = np.asarray(f1_scores, dtype=float)
optimal_idx = int(np.argmax(np.where(np.isnan(_f1_array), -1.0, _f1_array)))
optimal_threshold = thresholds[optimal_idx]
plt.scatter(recalls[optimal_idx], precisions[optimal_idx], color='red')
plt.annotate(f'Threshold:{optimal_threshold:.2f}', (recalls[optimal_idx], precisions[optimal_idx]))

plt.show()


Calculate precision and recall using optimal threshold. 

In [None]:
# From here on `threshold` holds the F1-optimal value; get_aspects() and
# get_absa() read this global when predicting.
threshold = optimal_threshold
# Evaluate on the validation file at that threshold.
model.test("spotify.valid", k=num_labels, threshold=threshold)

It's not a bad precision or recall - we can adjust the threshold higher for increased precision, but there will be a trade-off in recall
Define an aspect extraction function

In [None]:
def get_aspects(text):
    """Predict aspect labels for one review with the fastText ``model``.

    Args:
        text: Review text. Line breaks are replaced by spaces before
            prediction because fastText's ``predict`` processes one line at a
            time and raises ``ValueError`` when the text contains ``"\\n"``.

    Returns:
        The ``(labels, probabilities)`` result of ``model.predict`` limited to
        ``num_labels`` labels above ``threshold``, or ``0`` when ``text`` is
        not a string or the prediction fails.
    """
    # Missing reviews (NaN/None) cannot be predicted; keep the original
    # sentinel value so existing consumers of the column are unaffected.
    if not isinstance(text, str):
        return 0

    single_line = text.replace("\n", " ")
    try:
        return model.predict(single_line, k=num_labels, threshold=threshold)
    except Exception:
        # Unlike a bare ``except``, this does not swallow KeyboardInterrupt,
        # so a long progress_apply can still be interrupted.
        return 0

In [None]:
# Predict aspect labels for every raw review, with a tqdm progress bar.
tqdm.pandas(desc="Calculating Similarities")
df['label'] = df['Review'].progress_apply(get_aspects)

In [None]:
# Limit displayed cell text to 100 characters, then show the last 5 rows.
pd.set_option('display.max_colwidth', 100)
df.tail(5)

In [None]:
import fasttext
from nltk.tokenize import sent_tokenize
import re
from transformers import pipeline

# Build the sentiment-analysis pipeline once; get_sentiment() below reuses it.
# NOTE(review): no model name is passed, so the library chooses the checkpoint;
# consider naming one explicitly so results stay comparable between runs.
sentiment_pipeline = pipeline('sentiment-analysis')

def split_on_conjunctions(text):
    """Split ``text`` into clauses at conjunctions and commas.

    The text is cut at the whole words ``but``, ``or``, ``because``, ``so``,
    ``although``, ``though`` and ``while``, and at commas (with any whitespace
    around them). The separators themselves are not part of the result.

    Args:
        text: A sentence or any other string.

    Returns:
        A list of non-empty clauses with surrounding whitespace removed.
    """
    # The group is non-capturing on purpose: with a capturing group re.split
    # also returns the matched conjunctions ("but", "so", ...) as if they were
    # clauses, and each of them would then be scored for sentiment and aspects.
    pieces = re.split(r'\b(?:but|or|because|so|although|though|while)\b|\s*,\s*', text)
    # Strip first and filter afterwards so whitespace-only pieces are dropped
    # instead of being returned as empty strings.
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]

def get_sentiment(text):
    """Return the label of the first result ``sentiment_pipeline`` gives for ``text``."""
    return sentiment_pipeline(text)[0]['label']

def get_absa(review):
    """Run aspect-based sentiment analysis on one review.

    The review is split into sentences and then into clauses. Each clause gets
    a sentiment from ``get_sentiment`` and aspect labels from the fastText
    ``model``. When several clauses predict the same aspect, the sentiment of
    the clause with the highest aspect probability is kept.

    Args:
        review: Raw review text. Non-string values (for example ``NaN`` for a
            missing cell) give an empty result.

    Returns:
        A dict mapping each aspect label (without the ``__label__`` prefix) to
        ``"Positive"``, ``"Negative"`` or ``"Neutral"``.
    """
    if not isinstance(review, str):
        return {}

    # Pipeline labels mapped to the output spelling; any other label is "Neutral".
    sentiment_names = {'POSITIVE': "Positive", 'NEGATIVE': "Negative"}
    # aspect -> (sentiment, probability) of the best clause seen so far
    label_sentiments = {}

    for sentence in sent_tokenize(review):
        for segment in split_on_conjunctions(sentence):
            # Collapse all whitespace: fastText's predict() raises ValueError
            # on text that contains a newline.
            segment = " ".join(segment.split())
            if not segment:
                # Nothing to score in an empty clause.
                continue

            sentiment_label = sentiment_names.get(get_sentiment(segment), "Neutral")

            labels, probabilities = model.predict(segment, threshold=threshold, k=num_labels)

            for label, probability in zip(labels, probabilities):
                stripped_label = label.replace("__label__", "")
                best = label_sentiments.get(stripped_label)
                if best is None or probability > best[1]:
                    label_sentiments[stripped_label] = (sentiment_label, probability)

    # Removing the probabilities and keeping only the sentiments for final output
    return {label: sentiment for label, (sentiment, _prob) in label_sentiments.items()}
    

# Hand-written review that touches several aspects; adjacent string literals
# are concatenated into one string.
review = ("Easiest and most convenient way for a student to stream music," 
" but the app is plagued with issues, like pausing randomly on my Samsung" 
" S8 *edit* Issues still persist on Samsung S21. I'm paying far too much for the "
"premium I wish it cost less but I kind of enjoy the listening to the podcasts "
"available. Please help  "
"the customer support team is lacking as they don't communicate nor reply and I absolutely hate how slow it is to connect my device on wifi and all the buzzing sounds are annoying")

# Show the aspect -> sentiment mapping for the sample review.
get_absa(review)

In [None]:
# Shuffled reviews used to spot-check the aspect/sentiment pipeline.
datafile = '/kaggle/input/random-test/randomly shuffled_reviews_only.csv'
test_df = pd.read_csv(datafile, encoding="utf-8")
# Take an explicit copy of the first 100 rows: adding a column to the frame
# returned by head() triggers pandas' SettingWithCopyWarning.
test_df_100 = test_df.head(100).copy()
tqdm.pandas(desc="Calculating Similarities")
test_df_100['absa'] = test_df_100['Review'].progress_apply(get_absa)

In [None]:
# Display the 100 sampled reviews together with their `absa` results.
test_df_100

A great job on my sample review!

In [None]:
# Save the sampled reviews and their `absa` column as UTF-8 CSV without the index.
# NOTE(review): despite the file name, this frame holds the aspect/sentiment
# results rather than LDA output.
test_df_100.to_csv("lda_model_output.csv", encoding='utf-8', index=False)