In [None]:
from google.colab import drive
drive.mount('/content/drive')
file_path='drive/MyDrive/'

Mounted at /content/drive


In [2]:
# !pip install pyspark
# !pip install pymongo
# !pip install spotipy

Drawing inspiration from social media music sharing, we've observed that modern individuals frequently use music as a medium for personal expression, enthusiastically sharing and showcasing their preferences. Intriguingly, this implies a reverse correlation as well - the sentiments and language used in song-sharing social media posts can provide significant insights into the character of the songs themselves. 

Motivated by this perspective, we are eager to delve deeper into this phenomenon and integrate it into our music recommendation engine. Our goal is to develop a content-based recommendation system, trained on social media content (particularly tweets), capable of generating tailored playlists from any input text. This approach will infuse our music recommendation system with an element of novelty and unexpected delight.


# [Table of contents]()
### [0. Pre-processing](#pre)
##### [Import packages](#import)
##### [Connect with MongoDB and initialize a SparkSession](#connect)
##### [Toolkits: Pre-defined functions](#toolkits)

### [1. Data Preparation](#data)
##### [Initial the journey 🥳](#initial)
##### [Data Cleaning](#clean)
##### [Data Spliting](#split)
##### [Data Recombination](#recomb)

### [2. Model Initializing: TF-IDF](#tfidf)
##### [Setup: Utilizing hashed TF-IDF](#setup)
##### [Evaluation: Top 15 accuracy achieved 88%](#evaluation)
##### [Prediction: Recommend songs based on text input](#prediction)

### [3. Gaming time 😎](#merge)

# Pre-processing <a name="pre"></a>

## Import packages <a name="import"></a>

In [None]:
# Python standard libraries
import os
import re
import time
import random
import string
import itertools
import warnings
from datetime import datetime

# External general libraries
import numpy as np
import requests
import pymongo
import pyspark
from bs4 import BeautifulSoup
from scipy.stats import ttest_1samp, ttest_ind
from google.cloud import storage
from numpy.linalg import norm

# NLP libraries
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))

# Spark libraries
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer, MinMaxScaler, MaxAbsScaler, Normalizer, HashingTF, IDF, IDFModel, Tokenizer
from pyspark.ml.linalg import Vectors, DenseVector, VectorUDT

# Music-related libraries
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials

# Suppress warnings 
warnings.filterwarnings(action = 'ignore') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Connect with MongoDB and initialize a SparkSession <a name="connect"></a>

### Connect to MongoDB Atlas

In [None]:
mongo_username = 'your_mongo_username'
mongo_password = 'your_mongo_password'
mongo_ip_address = 'your_database_name.lasvt.mongodb.net/?retryWrites=true&w=majority'
database_name = 'your_database_name'
collection_name = 'your_collection_name'
client = pymongo.MongoClient(f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_ip_address}")
db = client[database_name]
collection = db[collection_name]

### Create a SparkSession object and a SparkContext object

In [None]:
blob_name='whole.csv'
bucket_name='spotify-twitter'

spark = SparkSession.builder.config("spark.network.timeout", "360000000s")\
                            .config("spark.executor.heartbeatInterval", "3600s")\
                            .getOrCreate()
sc = spark.sparkContext

## Toolkits: Pre-defined functions <a name="toolkits"></a>

In [None]:
def remove_emojis(text: str) -> str:
    """
    Remove any emojis.
    """
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', str(text))

In [None]:
# Create instances of stemmers and lemmatizers
ps = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer('english')

def stemming(word: str) -> str:
    """
    Lemmatize the input word to its base or dictionary form.
    Stem the lemmatized word to its root form.
    """
    lemmed_word = wordnet_lemmatizer.lemmatize(word)
    stemmed_word = snowball_stemmer.stem(lemmed_word)
    return stemmed_word

In [None]:
def tokenize(text: str) -> list:
    """
    Normalize each word to lowercase, strip punctuation,
    remove stop words, drop words of length <= 2, strip digits.
    Stem text and return a list of tokenized words.
    """
    text = text.lower()
    text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', str(text))
    tokens = text.split(' ')
    tokens = [w for w in tokens if len(w) > 2]  # ignore a, an, to, at, be, ...
    tokens = [w for w in tokens if w not in stop_words]
    # tokens = [stemming(token) for token in tokens]
    return tokens

In [None]:
def simple_tokenize(text: str) -> str:
    """
    Normalize each word to lowercase, strip punctuation, 
    return a list of words.
    """
    text = text.lower()
    text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', str(text))
    tokens = text.split(' ')
    tokens = [w for w in tokens if len(w) > 0]  # ignore a, an, to, at, be, ...
    return tokens

In [None]:
def converting_input(input_text: str):
    """
    Split input_text into words.
    Duplicate input until it has at least 15 
    words for better model identification.
    """
    input = tokenize(input_text)
    while len(input_text) < 15:
        input *= 5
    df_input = spark.createDataFrame([[input]],["raw"])
    return df_input

In [None]:
cols_list = ["popularity", "danceability", "energy", "key", "loudness", "mode", "speechiness", 
             "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

def get_songs_docs(collection):
    """
    Collect corpus from all tweets csv files stored in GCS.
    Filter tweets published by ads account (with 'http', '@').
    Parse music features and corresponding tweets for each song.
    Delete songs without tweets.
    """
    corpus, corpus2, corpus3 = [], [], []
    name_list, id_list = [], []
    drop_name, drop_id = [], []
    drop_num = 0
    total_tweets = collection.aggregate([{"$match": {"tweets": {"$exists": True}}},
                                         {"$group": {"_id": 0, "count": {"$sum": {"$size": "$tweets"}}}}])
    origin_dict = list(collection.find({'tweets':{'$ne':[]}},
                                       {'_id':1,'tweets':1,'name':1,'artist':1}))
    
    # Count songs without tweets
    for line in origin_dict:
        try: 
            tweets = line['tweets']
        except: 
            drop_num += 1
            drop_name.append(line['name'])
            drop_id.append(line['_id'])
            continue

        # Parse song name, artist, tweets and other features
        name = line['name']
        id = line['_id']
        artist = line['artist']
        filter_words = tokenize(name) + tokenize(artist)
        name_list.append(name)
        id_list.append(id)
        docs = sc.parallelize(tweets)\
                    .map(lambda x:x['content'])\
                    .filter(lambda x: isinstance(x, str))\
                    .map(remove_emojis)\
                    .map(lambda x: re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', str(x)))
        for fword in filter_words:
            docs = docs.map(lambda x: x.replace(fword, ''))

        docs_lines = docs.collect()
        corpus3 += zip([id]*len(docs_lines), docs_lines)

    del origin_dict
    
    # Assemble dataframe with song name and tokenized tweets
    lineData = spark.createDataFrame(corpus3, ["id", "tweet_line"])
    df_fea = spark.createDataFrame(collection.find({'tweets':{'$ne':[]}}))\
              .drop('tweets').cache()
    for col in cols_list:
        df_fea = df_fea.withColumn(col, df_fea[col].cast("float"))

    return lineData, df_fea

In [None]:
def scrape_song_lyrics(df_features):
    """
    Scrape lyrics from spotify websites with given dataframe which contains
    the ids, names and other metrics of the songs.
    Return a dictionary stores ids as keys and lyric lists as values.
    Each row of the lyrics is an individual element of the list.
    """
    # Collect the ids of required songs
    list_trackid = df_features.select('_id').rdd.map(lambda x:list(x)[0]).collect()
    d={}
    lines=[]
    words=[]
   
    # Prepare the Spotify auth credentials to access the websites
    username='Your username'
    pw="Your password"
    auth = (username, pw)
    for i,trackid in enumerate(list_trackid):

      # To avoid being blocked by spotify
      if i%5 == 0:
        time.sleep(4) 

      html = 'https://open.spotify.com/track/' + trackid
      r = requests.get(html, auth = auth)
      soup = BeautifulSoup(r.content,features="html.parser")

      # Locate the lyrics in the html files
      if len(soup.find_all('h2')) > 0:
        lyric_tag=soup.find_all('h2')[0]
        if lyric_tag.string == 'Lyrics':
            for line in lyric_tag.find_parent("div").findAll("p"):
                if isinstance(line.string, str) and line.string.count(' ') > 2:
                    lines.append({'id':str(trackid), 'lyric_line': str(line.string)})
    line_lyric=spark.createDataFrame(lines, ["id", "lyric_line"])

    return line_lyric

# Data Preparation <a name="data"></a>

To ensure that we obtain songs with sufficient tweet content and are widely discussed by people, we have selected playlists that are composed of popular songs or have a tendency to go viral. These playlists include both officially created playlists and playlists that contain top songs from different genres such as jazz, pop, and others. Additionally, we have filtered the songs by popularity, a metric obtained from the Spotify API, by keeping only those **songs with a popularity score of 60 or higher**. This ensures that we are working with songs that have been well-received and have a considerable amount of engagement among listeners.

Our project leverages the power of the `Spotify API` and `Twitter API` to collect and analyze data on popular songs and associated tweets. We've implemented a daily data auto-collection pipeline on Airflow, which allows us to efficiently collect and store large volumes of data in **Google Cloud Storage (GCP)** and manage them with **MongoDB Atlas**, a NoSQL database that offers scalable and flexible data storage. 

Over a span of three months, from February to April, we successfully gathered a substantial dataset consisting of **2k songs, 240k tweets, and 84k lyric lines**. To bolster the precision of our recommendation engine, a meticulous filtration process was employed. We pruned songs with less than 50 related tweets, potential marketing tweets, and their corresponding lyrics. As a result, our refined dataset comprised of 63k tweets and 54k lyric lines. Through meticulous data curation and the establishment of a robust data collection framework, we've ensured our music recommendation engine has the most pertinent and insightful input at its disposal.



## Initial the journey 🥳  <a name="initial"></a>

We start by retrieving all the stored data from MongoDB and then scrape the lyrics for each song using the Spotify API. Next, we merge the data into PySpark DataFrames, where we record the song ID and content in each row.

In [None]:
# fetch tweets and features from mongodb
whole_tweet_df_ori, df_features = get_songs_docs(collection)
df_features.count(), whole_tweet_df_ori.count()

(1838, 236973)

In [None]:
# scrape lyrics from spotify API
whole_lyric_df_ori = scrape_song_lyrics(df_features)
whole_lyric_df_ori.count()

83765

In [None]:
# save the dataframes
whole_tweet_df_ori.write.save(file_path+'whole_tweet_df_ori')
whole_lyric_df_ori.write.save(file_path+'whole_lyric_df_ori')
df_features.write.save(file_path+'df_features')

# load the dataframes
# whole_tweet_df_ori = spark.read.load(file_path+'whole_tweet_df_ori')
# whole_lyric_df_ori = spark.read.load(file_path+'whole_lyric_df_ori')
# df_features = spark.read.load(file_path+'df_features')


## Data Cleaning <a name="clean"></a>

In this step, we will identify and drop tweets from robots, tweets with insufficient information, duplicated tweets by PySpark.


In [None]:
# tweets cleaning: drop duplicate tweets; tweets containing stopwords; delete the songs that don't have enough tweets

s1 = whole_tweet_df_ori.count()
whole_tweet_df = whole_tweet_df_ori.dropDuplicates()
udf_wordcount=udf(lambda x: int(len(simple_tokenize(x))), IntegerType())
whole_tweet_df = whole_tweet_df.withColumn('word_count',udf_wordcount(whole_tweet_df.tweet_line))
whole_tweet_df = whole_tweet_df.filter(whole_tweet_df.word_count>10)
whole_tweet_df = whole_tweet_df.filter((~whole_tweet_df.tweet_line.contains('playing'))\
                                       &(~whole_tweet_df.tweet_line.contains('amp'))\
                                       &(~whole_tweet_df.tweet_line.contains('hps:'))\
                                       &(~whole_tweet_df.tweet_line.contains('http'))\
                                       &(~whole_tweet_df.tweet_line.contains('@')))
filter_songs_id = whole_tweet_df.groupby('id').count().filter(col('count')>10).select('id').collect()
filter_songs_id_list = [r.asDict()['id'] for r in filter_songs_id]
whole_tweet_df = whole_tweet_df.filter(col('id').isin(filter_songs_id_list))
s2 = whole_tweet_df.count()
print(f"The length of whole_tweet_df from {s1} to {s2}, narrowed down {(s1-s2)/s1*100:.1f}%.")

The length of whole_tweet_df from 236973 to 63367, narrowed down 73.3%.


In [None]:
# lyric cleaning: drop the lines contains only some duplicate words

t1 = whole_lyric_df_ori.count()
unique_num_udf = udf(lambda x: len(list(set(simple_tokenize(x)))), IntegerType())
whole_lyrics_df = whole_lyric_df_ori.dropDuplicates()
whole_lyrics_df = whole_lyrics_df.withColumn('lyric_word_num',unique_num_udf(whole_lyrics_df.lyric_line))
whole_lyrics_df = whole_lyrics_df.filter(whole_lyrics_df.lyric_word_num>1).drop('lyric_word_num')
t2 = whole_lyrics_df.count()
print(f"The length of whole_lyrics_df from {t1} to {t2}, narrowed down {(t1-t2)/t1*100:.1f}%.")

The length of whole_lyrics_df from 83765 to 54455, narrowed down 35.0%.


## Data Spliting <a name="split"></a>

To ensure that each song's corpus appears in both the training and test sets, we applied a stratified split on the "id" column for each dataframe.


In [None]:
# stratified sampling on tweet_lyric_df
def stratified_split_train_test(df, frac, label, seed=42):
    """ stratfied split of a dataframe in train and test set."""
    df=df.withColumn("index", monotonically_increasing_id())
    fractions = df.select(label).distinct().withColumn("fraction", lit(frac)).rdd.collectAsMap()
    df_frac = df.stat.sampleBy(label, fractions, seed)
    df_remaining = df.join(df_frac, on='index', how="left_anti")
    return df_frac, df_remaining

test_lyric_df, train_lyric_df = stratified_split_train_test(whole_lyrics_df, 0.1, 'id')
test_tweet_df, train_tweet_df = stratified_split_train_test(whole_tweet_df, 0.1, 'id')

print(f"Lyrics: \t The length of train data: {train_lyric_df.count()} \t The length of test data: {test_lyric_df.count()}")
print(f"Tweets: \t The length of train data: {train_tweet_df.count()} \t The length of test data: {test_tweet_df.count()}")

Lyrics: 	 The length of train data: 48950 	 The length of test data: 5505
Tweets: 	 The length of train data: 57008 	 The length of test data: 6359


## Data Recombination <a name="recomb"></a>

In an initial step, we consolidated tweet lines and lyric lines related to specific songs from the training data into two data frames, `train_tweet_df` and `train_lyric_df`. Each line was grouped by song ID and merged into a singular text string. 

Following this, we performed an 'inner' join on these frames to unify lyrics and tweets for each song. Leveraging a user-defined function, the collective content was then tokenized and stemmed. Subsequently, we once again combined lyrics and tweets, generating a fresh `tweet_lyric_cor` column within a data frame destined for model training. 

In [None]:
# concat lines of tweets / lyrics from the same song into a single line separately
train_tweet_corpus_df = train_tweet_df.groupby("id").agg(concat_ws(" ", collect_list(train_tweet_df.tweet_line)).alias('tweet_cor')).withColumnRenamed('id','ID_')
train_lyric_corpus_df = train_lyric_df.groupby("id").agg(concat_ws(" ", collect_list(train_lyric_df.lyric_line)).alias('lyric_cor'))

# join, concat lyric and tweets and split into words
lyirc_tweet_corpus_df = train_tweet_corpus_df.join(train_lyric_corpus_df, train_lyric_corpus_df.id==train_tweet_corpus_df.ID_, 'inner')
udf_tokenize = udf(lambda x: [stemming(i) for i in tokenize(x)], ArrayType(StringType()))
lyirc_tweet_corpus_concat_df = lyirc_tweet_corpus_df.select(concat_ws(' ', lyirc_tweet_corpus_df.tweet_cor, lyirc_tweet_corpus_df.lyric_cor).alias('tweet_lyric_cor'),'id')
lyirc_tweet_corpus_split_df = lyirc_tweet_corpus_concat_df.select('id', udf_tokenize(lyirc_tweet_corpus_concat_df.tweet_lyric_cor).alias('raw'))

print(f"The length of merged data: {lyirc_tweet_corpus_split_df.count()}")

The length of merged data: 889


In [None]:
# save the dataframes
lyirc_tweet_corpus_split_df.write.save(file_path+'lyirc_tweet_corpus_split_df')

# load the dataframes
# lyirc_tweet_corpus_split_df = spark.read.load(file_path+'lyirc_tweet_corpus_split_df')

# Model Initializing: TF-IDF <a name="tfidf"></a>

Harnessing the strength of the TFIDF (Term Frequency-Inverse Document Frequency) methodology, which effectively quantifies the importance of a word in a document within a large corpus, we devised an approach to encapsulate the essence of each song's content.

In our approach towards calculating the TFIDF for each song, we initially merged tweets and lyrics with the same song ID from the training dataset. This harmonized data was then combined, forming a comprehensive corpus for every song. Following this, we hashed the tokenized and stemmed content of each song, consequently generating their respective TFIDF vectors. In a bid to ensure that each hashing value is unique to a single word, we elected to employ a substantial number of features, set at 100,000.

When predicting based on a text input, we tokenize and stem the input, converting it into a hashed vector. This vector is then compared with each song's TFIDF vector through a dot product operation. The values obtained from this process, indicative of the correlation between the input and each song, are arranged in descending order. We then curate a recommendation playlist comprising the songs possessing the highest correlational values, i.e., the ones most in tune with the input text.

We subjected our methodology to rigorous testing on a subset of 50 songs from our test set. For each song, we fed the corresponding tweet corpus into our system and checked whether the target song featured among the top 15 recommended songs. This process resulted in an impressive 'top 15 accuracy' of 88%.

## Setup: Utilizing hashed TF-IDF <a name="setup"></a>

In [None]:
def tfidf_model(df, numFeatures=100000):
    """
    Train a TF model and a IDF model.
    Transform the input using tf_md and idf_md.
    """
    # tf
    input_name=df.columns[1]
    tf_md = HashingTF(inputCol=input_name, outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = tf_md.transform(df)

    # idf
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idf_md = idf.fit(featurizedData)
    rescaledData = idf_md.transform(featurizedData)
    return tf_md, idf_md, rescaledData


In [None]:
def target_rank(input_text,target_id):
    """
    Tokenize and stem the sentence then transfrom the input to hashing vector. 
    Then dot product the hashing vector with each song's tfidf vector, rank the 
    product value in descending order. Select top ten as the final recommendation.
    
    Return the rank of target song in the recommendation playlist.
    If the target song is not in the playlist, return 100000.
    """
    input_vec = tf_md.transform(converting_input(input_text)).head().rawFeatures
    udf_sv_product = udf(lambda x: float(input_vec.dot(x)), FloatType())
    rescaledData_pd = rescaledData.select('id',udf_sv_product(rescaledData.features).alias('dot_pd'))
    res_id = [x.asDict()['id'] for x in rescaledData_pd.sort(rescaledData_pd.dot_pd.desc()).select('id').head(10)]
    if target_id in res_id:
        target_index = res_id.index(target_id)
        return int(target_index)
    else:
        return 100000

In [None]:
def top15_acc(test_tweet_df, target_rank, test_size):
    """
    Evaluate the recommendation performance by calculating the "top 15 accuracy", 
    which is the proportion of instances where the target song is among the top 
    15 recommended songs based on a given tweet from the test data.
    """
    udf_target_rank = udf(target_rank, IntegerType())
    inputList = test_tweet_df.rdd.map(lambda x: x.asDict()['tweet_line']).collect()
    targetList = test_tweet_df.rdd.map(lambda x: x.asDict()['id']).collect()
    ranklist = []
    for i in range(test_size):
        if inputList[i] and len(''.join(inputList[i].split(' ')))>1:
            x = target_rank(inputList[i], targetList[i])
            ranklist.append(x)
    acc = np.sum(np.array(ranklist) <= 15) / len(ranklist)
    return ranklist, acc

## Evaluation: Top 15 accuracy achieved 88% <a name="evaluation"></a>

To thoroughly gauge the effectiveness of our recommendations, we've adopted "top 15 accuracy" as the evaluation metric for our recommendation system. We feed the system with tweet corpus corresponding to each song in the test set. If the top 15 recommended songs include the target song, we deem the recommendation as successful. This metric enables us to quantify the proportion of successful recommendations among all the tested songs. 

Upon testing with 50 songs, the system demonstrated a robust **"top 15 accuracy" of 88.0%**. This score signifies that, for 88.0% of the test instances, the target song featured within the top 15 recommendations, demonstrating the system's promising precision.

In [None]:
# Define the parameter and calculate the tfidf scores for each song. Stored in dataframe.
tf_md, idf_md, rescaledData = tfidf_model(lyirc_tweet_corpus_split_df)

In [None]:
# Final Accuracy (super slow)
test_size = 50
ranklist, accuracy = top15_acc(test_tweet_df, target_rank, test_size)
print(f'The top 15 accuracy for the recommendation system is {accuracy*100:.1f}%')

The top 15 accuracy for the recommendation system is 88.0%


## Prediction: Recommend songs based on text input <a name="prediction"></a>

In our prediction process, the input text is first subjected to tokenization and stemming, thus transforming it into a corresponding hashed vector. This vector is then systematically cross-compared, via a dot product operation, with the TFIDF vector of each song in the existing database. 

The resulting values, representing the degree of correlation between the input and each song, are subsequently ranked in descending order. From this ordered list, we select the songs with the highest correlational values to craft the final recommendation playlist.




In [None]:
def tfidf_recommendation_3(input_text, tf_md, idf_md, rescaledData, recommend_num, df_features):
    """
    Process an input text and returns a certain number of recommended songs based 
    on the TF-IDF values, by calculating the dot product between the input vector 
    and each song's TF-IDF vector, and then selecting the top recommend_num songs 
    with the highest dot product values.
    """
    start_time = time.time()
    input_dataframe= converting_input(input_text)
    input_vec = tf_md.transform(input_dataframe).head().rawFeatures
    udf_sv_product = udf(lambda x: float(input_vec.dot(x)), FloatType())
    rescaledData_pd = rescaledData.select('id', udf_sv_product(rescaledData.features).alias('dot_pd')).toPandas()
    res_id=list(rescaledData_pd.nlargest(recommend_num, columns='dot_pd')['id'].to_numpy())

    # res_id = [x.asDict()['id'] for x in rescaledData_pd.sort(rescaledData_pd.dot_pd.desc()).select('id').head(recommend_num)]
    print(f'This recommendation spent {time.time() - start_time:.1f} seconds.')
    return df_features.filter(col('_id').isin(res_id)).select('name')

In [None]:
recommend_num = 10
rescaledData = rescaledData.repartition(recommend_num)
input_text = 'beautiful summer, wanna make a ride into mountains and find fountains'

tfidf_recommendation_3(input_text, tf_md, idf_md, rescaledData, recommend_num, df_features).show(recommend_num, False)

This recommendation spent 74.9 seconds.
+------------------------------------------------+
|name                                            |
+------------------------------------------------+
|I Don’t Wanna Live Forever (Fifty Shades Darker)|
|Deep Down (feat. Never Dull)                    |
|1-800-273-8255                                  |
|You                                             |
|just wanna rock (Lil Uzi Vert) - Sped Up Version|
|Our House                                       |
|Yeah! (feat. Lil Jon & Ludacris)                |
|Ride                                            |
|Summer                                          |
|Cruel Summer                                    |
+------------------------------------------------+



# Gaming time 😎 <a name="merge"></a>

Having successfully encapsulated our model and hashed vectors, we're all set to reload them and bring our music recommendation system to life! So, let's dive in and explore the unique playlists our system can curate with different text inputs. Ready for some personalized musical discovery? Let's hit the play button!

In [None]:
def recommend_system(input, file_df_path, recommend_num):
    """
    Load stored dataframes and fit the TFIDF model,
    then make prediction based on the input sentence.
    """
    # Load_files
    df = spark.read.load(file_df_path[0])
    df_features = spark.read.load(file_df_path[1])

    # Fit tfidf models
    tf_md, idf_md, rescaledData = tfidf_model(df)
    res = tfidf_recommendation_3(input, tf_md, idf_md, rescaledData, recommend_num, df_features)

    return res

In [None]:
file_df_path = [file_path+'lyirc_tweet_corpus_split_df', file_path+'df_features']
recommend_num = 15

Let's generate playlist for any text input!

In [None]:
input = 'beautiful summer, wanna make a ride into mountains and find fountains'
recommend_system(input, file_df_path, recommend_num).show(recommend_num)

This recommendation spent 10.6 seconds.
+--------------------+
|                name|
+--------------------+
|I Wanna Dance wit...|
|       Summer Of '69|
|                 You|
|just wanna rock (...|
|Imagine - Remaste...|
|           Our House|
|Yeah! (feat. Lil ...|
|                Ride|
|              Summer|
|        Cruel Summer|
|Players - DJ Smal...|
|I Don’t Wanna Liv...|
|    I Wanna Be Yours|
|Deep Down (feat. ...|
|      1-800-273-8255|
+--------------------+



In [None]:
input2 = 'tough days doing internship and homework, need help!'
recommend_system(input2, file_df_path, recommend_num).show(recommend_num)

This recommendation spent 3.7 seconds.
+--------------------+
|                name|
+--------------------+
|                 Her|
|              Anyone|
|                 You|
|Hello (feat. A Bo...|
|Imagine - Remaste...|
|           Our House|
|Yeah! (feat. Lil ...|
|Players - DJ Smal...|
|Deep Down (feat. ...|
|            Remember|
|            Medicine|
|Kill Bill - Sped ...|
|                 Top|
|Don't You (Forget...|
|      1-800-273-8255|
+--------------------+



Hold on to your headphones, because things just got a lot faster and more personal! We've cached our system to turn any text input into a customized playlist in **just 5-10 seconds** - that's a stunning leap from our initial 75 seconds. So go on, pick your favourite quote, lyric, or even a line from a book, and let's pump up the volume on your unique, tailor-made playlist!