## 1. Generate offline dataset (csv) and tokenized file

### 1.1 Requirements

- conda install rank_bm25
- conda install spacy
- python -m spacy download en_core_web_sm

### 1.2 Create csv offline file from all folders

In [37]:
import os
import pandas as pd
pd.set_option('display.max_colwidth', None)
url = r"C:\Users\guntheb\OneDrive - University of Illinois - Urbana\Python\TIS\Music Project"
url_db = os.path.join(url, "database_categorized")
df = pd.DataFrame(columns = ["url", "type", "title", "artist", "text"])
for root, dirs, files in os.walk(url_db):
    for filepath in files:
        classification = root.split('\\')[-1].split('_')[0]
        file_name = os.path.join(root, filepath)
        music = []
        temp_out = ''
        with open(file_name, 'r', encoding='utf-8-sig') as file:
            music.append(file_name)
            music.append(classification)
            end = False
            for line in file:
                if line[:3] == "___":
                    end = True
                if end == False:
                    temp_out += line
                else:
                    line = line.strip().split()
                    if line == []: 
                        break
                    if line[0] in ["Name", "Artist"]:
                        music.append(" ".join(line[1:]))
            music.append(temp_out)
        df.loc[len(df)] = music

### 1.3 Tokenizing using SpaCy

In [48]:
import spacy
import pickle
from rank_bm25 import BM25Okapi
nlp = spacy.load("en_core_web_sm")
bm25 = {}
for i in range(1,6):
    df_tmp = df[df.type == str(i)]
    tok_text=[] # for our tokenised corpus
    for doc in nlp.pipe(df_tmp.text.str.lower().values, disable=["tagger", "ner", "lemmatizer"]):
        tok = [t.text for t in doc if t.is_alpha]
        tok_text.append(tok)
    bm25[i] = BM25Okapi(tok_text)

### 1.4 Generating offline dataset and tokenized file

In [49]:
df.to_csv(os.path.join(url, "music.csv"), encoding='utf-8-sig')

with open(os.path.join(url, "bm25.pkl"), "wb") as tf:
    pickle.dump(bm25,tf)

## 2. Web App code start here

### 2.1 Reading offline dataset (csv) and tokenized file

In [50]:
df_read = pd.read_csv(os.path.join(url, "music.csv"))

with open(os.path.join(url, "bm25.pkl"), "rb") as tf:
    bm25_read = pickle.load(tf)

### 2.2 Creating the query and print title results

In [51]:
query = "true love"
mood = 1
n_results = 10

df_mood = df_read[df_read.type == str(mood)]
tokenized_query = query.lower().split(" ")
import time

t0 = time.time()
results = bm25_read[mood].get_top_n(tokenized_query, df_mood.title.values, n=n_results)
t1 = time.time()
print(f'Searched {len(df_mood)} records in {round(t1-t0,3) } seconds \n')
for i in results:
  print(i)

Searched 1183 records in 0.002 seconds 

Sad But True
My Heart Is a Liar
Battleground
Real & True
Edge Of Black (Remix)
Edge of Black
The Lotus Eater
Carry Me
Sea of Sorrow
The Eco-Terrorist In Me


### 2.3 Printing title and text of results

In [58]:
for i in range(len(results)):
    df_result = df_mood[df_mood.title == results[i]]
    print("___________________________________")
    print ("")
    print (f"Result #{i+1}: {df_result.title.values[0]}")
    print (df_result.text.values[0])

___________________________________

Result #1: Sad But True
Hey (hey)
I’m your life
I’m the one who takes you there
Hey (hey)
I’m your life
I’m the one who cares
They (they)
They betray
I’m your only true friend now
They (they)
They’ll betray
I’m forever there

I’m your dream, make you real
I’m your eyes when you must steal
I’m your pain when you can’t feel
Sad but true

I’m your dream, mind astray
I’m your eyes while you’re away
I’m your pain while you repay
You know it’s sad but true

Sad but true

You (you)
You’re my mask
You’re my cover, my shelter
You (you)
You’re my mask
You’re the one who’s blamed
Do (do)
Do my work
Do my dirty work, scapegoat
Do (do)
Do my deeds
For you’re the one who’s shamed

I’m your dream, make you real
I’m your eyes when you must steal
I’m your pain when you can’t feel
Sad but true

I’m your dream, mind astray
I’m your eyes while you’re away
I’m your pain while you repay
You know it’s sad but true

Sad but true

I’m your dream
I’m your eyes
I’m your pain


In [20]:
df_mood.columns

Index(['Unnamed: 0', 'url', 'type', 'title', 'artist', 'text'], dtype='object')