# All algorithm will be here

# essential library

In [2]:
import pandas as pd # do some data
import string
import timeit # just import for timer
import numpy as np
import json
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf built in function
from scipy import sparse

import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
data = pd.read_csv('lyrics-data.csv')
data = data.drop_duplicates()
data = data[data.Idiom == "ENGLISH"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["Lyric"].astype('U'))

In [4]:
def get_and_clean_lyric():
    description = data[data.Idiom == "ENGLISH"]["Lyric"]
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

In [22]:
class BM25(object):
    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False,ngram_range=(1,2))
        self.b = b
        self.k1 = k1

    def fit(self, X):
        self.vectorizer.fit(X)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        b, k1, avdl = self.b, self.k1, self.avdl

        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)                                                          
        return (numer / denom).sum(1).A1

In [23]:
bm25 = BM25()
bm25.fit(data["Lyric"].astype('U'))

## get all artists

In [7]:
def convert_to_list(genres) :
    if type(genres) == float :
        return []
    return genres.split('; ')  

class Artists :
    def __init__(self,artists,song,popularity,link,genre,genres) :
        self.artists = artists
        self.song = song
        self.popularity = popularity
        self.link = link
        self.genre = genre
        self.genres = genres
    
    def get_artist(self) :
        return {
            "artists" : self.artists,
            "song" : self.song, 
            "popularity" : self.popularity,
            "link" : self.link,
            "genre" :self.genre,
            "genres" : convert_to_list(self.genres)
        }
    

        

In [8]:
def get_all_artists():
    data = pd.read_csv('artists-data.csv')
    artists = []
    for i in range(len(data)):
        artist = Artists(
            data.iloc[i].to_dict()["Artist"],
            data.iloc[i].to_dict()["Songs"],
            data.iloc[i].to_dict()["Popularity"],
            data.iloc[i].to_dict()["Link"],
            data.iloc[i].to_dict()["Genre"],
            data.iloc[i].to_dict()["Genres"]
        )
        artists.append(json.dumps(artist.get_artist())) 
    print(artists[0])

In [8]:
get_all_artists()

{"artists": "10000 Maniacs", "song": 110, "popularity": 0.3, "link": "/10000-maniacs/", "genre": "Rock", "genres": ["Rock", "Pop", "Electronica", "Dance", "J-Pop/J-Rock", "Gospel/Religioso", "Infantil", "Emocore"]}


In [9]:
def get_artists_by_name(name):
    data = pd.read_csv('artists-data.csv')
    query = name.lower()
    artists = []
    for i in range(len(data)) :
        musician = data.iloc[i].to_dict()["Artist"].lower()
        if query in musician:
            artist = Artists(
                data.iloc[i].to_dict()["Artist"],
                data.iloc[i].to_dict()["Songs"],
                data.iloc[i].to_dict()["Popularity"],
                data.iloc[i].to_dict()["Link"],
                data.iloc[i].to_dict()["Genre"],
                data.iloc[i].to_dict()["Genres"]
            )
            artists.append(json.dumps(artist.get_artist()))
    
    if len(artists) == 0:
        return "not found"
    return artists

In [10]:
print(get_artists_by_name("taylor"))

['{"artists": "Taylor Swift", "song": 261, "popularity": 28.1, "link": "/taylor-swift/", "genre": "Pop", "genres": ["Pop", "Rom\\u00e2ntico", "Pop/Rock", "Country", "Electro Swing", "Dance", "House", "Funk Carioca", "Black Music", "Funk", "Ax\\u00e9", "Electronica", "Indie", "R&B", "Sertanejo", "Hip Hop", "Pop/Punk", "Trilha Sonora", "Rap", "Rock Alternativo", "Emocore", "Rock", "Blues", "Chillout", "Piano Rock", "Disco", "Hard Rock", "Reggae", "Velha Guarda", "J-Pop/J-Rock", "Jazz", "Pagode", "Tecnopop", "Folk", "Cl\\u00e1ssico", "Forr\\u00f3", "Gospel/Religioso"]}']


## song part

In [10]:
class Songs :
    def __init__(self,alink,sname,lyric) :
        self.alink = alink
        self.sname = sname
        self.lyric = lyric
        
    def get_song(self) :
        return {
            "name" : self.sname, 
            "lyric" : self.lyric,
            "artist" : self.alink,
        }


In [11]:
class QuerySongs :
    def __init__(self,rank,alink,sname,query_before,query_after) :
        self.rank = rank
        self.alink = alink
        self.sname = sname
        self.query_before = query_before
        self.query_after = query_after
        
    def get_song(self) :
        return {
            "rank" : self.rank,
            "artist" : self.alink,
            "song" : self.sname, 
            "queryBefore" : self.query_before,
            "queryAfter" : self.query_after,
        }


In [12]:
def get_all_song():
    data = pd.read_csv('lyrics-data.csv')
    songs = []
    for i in range(50):
        song = Songs(
            data.iloc[i].to_dict()["ALink"],
            data.iloc[i].to_dict()["SName"],
            data.iloc[i].to_dict()["Lyric"],
        )
        songs.append(json.dumps(song.get_song())) 
    print(songs[1])

In [13]:
def get_song_by_name(name):
    songs = data[data.SName == name]
    for i in range(len(songs)) :
        print("Song name:",songs.iloc[i]["SName"])
        print("By:",songs.iloc[i]["ALink"])
        print("Lyric:",songs.iloc[i]["Lyric"])
        print()

In [111]:
get_all_song()

In [14]:
def clean_lyric(lyric):
    ps = PorterStemmer()
    s = word_tokenize(lyric)
    stopwords_set = set(stopwords.words())
    stop_dict = {s: 1 for s in stopwords_set}
    s = [w for w in s if w not in stop_dict]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s

# Search Feature

# Search By Tf

In [15]:
def search_by_tf(query):
    lyric = get_and_clean_lyric()
    vectorizer = CountVectorizer(preprocessor=clean_lyric,ngram_range=(1,2))
    vectorizer.fit_transform(lyric)
    result = vectorizer.transform([query])
    print(result)

# Search By Tf-idf

In [16]:
def serach_by_tf_idf(query):
    query_vec = vectorizer.transform([query])
    results = cosine_similarity(X,query_vec).reshape((-1,))
    return results.argsort()[-10:][::-1]
#         print(data.iloc[i,0],"--",data.iloc[i,1])

# Search By BM25

In [17]:
def search_by_bm25(query):
    result = bm25.transform(query,data[data.Idiom == "ENGLISH"]["Lyric"].astype('U'))
    return result.argsort()[-10:][::-1]

# Search Song algorithm

In [25]:
def get_song_by_lyric(query,score) :
    lyric = query.lower()
    result = []
    rank = 1
    if score == 'tf' :
        search_by_tf(query)
        return
    
    if score == 'tf-idf':
        songs = serach_by_tf_idf(lyric)
        for i in songs: 
            song = QuerySongs(
                rank,
                data.iloc[i].to_dict()["ALink"],
                data.iloc[i].to_dict()["SName"],
                query,
                clean_lyric(lyric)
            )
            rank+=1
            result.append(json.dumps(song.get_song())) 
        print(result)
        return
    if score == 'bm25' :
        songs = search_by_bm25(query)
        for i in songs: 
            song = QuerySongs(
                rank,
                data.iloc[i].to_dict()["ALink"],
                data.iloc[i].to_dict()["SName"],
                query,
                clean_lyric(lyric)
            )
            rank+=1
            result.append(json.dumps(song.get_song())) 
        print(result)
        return 
    print('in correct method for seaching')

In [None]:
get_song_by_lyric("Hey mama","tf")

In [28]:
get_song_by_lyric("Hello","tf-idf")

['{"rank": 1, "artist": "/cody-simpson/", "song": "Hello", "queryBefore": "Hello", "queryAfter": "hello"}', '{"rank": 2, "artist": "/beyonce/", "song": "Hello", "queryBefore": "Hello", "queryAfter": "hello"}', '{"rank": 3, "artist": "/sambo/", "song": "Smells Like Teen Spirit", "queryBefore": "Hello", "queryAfter": "hello"}', '{"rank": 4, "artist": "/amy-lee/", "song": "Hello, Goodbye", "queryBefore": "Hello", "queryAfter": "hello"}', '{"rank": 5, "artist": "/velozes-e-furiosos/", "song": "Pick Up The Phone", "queryBefore": "Hello", "queryAfter": "hello"}', '{"rank": 6, "artist": "/the-maine/", "song": "Hello World", "queryBefore": "Hello", "queryAfter": "hello"}', '{"rank": 7, "artist": "/nirvana/", "song": "Smells Like Teen Spirit (Electro Remix)", "queryBefore": "Hello", "queryAfter": "hello"}', '{"rank": 8, "artist": "/within-temptation/", "song": "Smells Like Teen Spirit (Nirvana cover)", "queryBefore": "Hello", "queryAfter": "hello"}', '{"rank": 9, "artist": "/karmin/", "song": "

In [24]:
get_song_by_lyric("I am","bm25")

['{"rank": 1, "artist": "/michael-jackson/", "song": "I Am a Loser", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 2, "artist": "/sia/", "song": "Here I Am (With Dolly Parton)", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 3, "artist": "/jojo/", "song": "I Am.", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 4, "artist": "/radiohead/", "song": "Street Spirit (Fade Out) - Cifrada", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 5, "artist": "/will-i-am/", "song": "I Am", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 6, "artist": "/placebo/", "song": "One Of A Kind", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 7, "artist": "/r-e-m/", "song": "Superman", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 8, "artist": "/glee/", "song": "Superman", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 9, "artist": "/calvin-harris/", "song": "Giant (With Rag\'n\'Bone Man)", "queryBefore": "I am", "queryAfter": ""}', '{"rank": 10, "artist": "/leona-

In [374]:
get_song_by_name("Black Country Woman")

Song name: Black Country Woman
By: /led-zeppelin/
Lyric: Hey, hey, mama, what's the matter here (X2). You didn't have to tell me that you love me so. You didn't have to love me, mama, let me go. Hey, hey, mama, what's the matter here. You didn't have to make me a total disgrace. You didn't have to leave me with that beer in my face. Hey, hey, mama, what's the matter here. That's alright, it's awful dog-gone clear.. Hey, hey, baby, why you treat me mean (X2). You didn't have to crucify me like you did. You didn't have to tell me I was just your kid. Hey, hey, mama, why'd you treat me mean. You didn't have say you'd always be by my side. Y' didn't have to tell me you'd be my blushin' bride. Hey, hey, mama, why you treat me mean. But that's alright, I know your sisters, too. You didn't have to tell me that you love me so. You didn't have to leave me, mama, let me go. Hey, hey, mama, what is wrong with you. You didn't have to leave me like a total disgrace. You didn't have to leave me with

In [320]:
data[data.ALink == "/10000-maniacs/"]
# data["ALink"]

Unnamed: 0,ALink,SName,SLink,Lyric,Idiom
0,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH
1,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH
2,/10000-maniacs/,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH
3,/10000-maniacs/,A Campfire Song,/10000-maniacs/a-campfire-song.html,"A lie to say, ""O my mountain has coal veins an...",ENGLISH
4,/10000-maniacs/,Everyday Is Like Sunday,/10000-maniacs/everyday-is-like-sunday.html,Trudging slowly over wet sand. Back to the ben...,ENGLISH
...,...,...,...,...,...
128,/10000-maniacs/,When We Walked On Clouds,/10000-maniacs/when-we-walked-on-clouds.html,Everybody's wondering what. And where they all...,ENGLISH
129,/10000-maniacs/,Who Knows Where The Time Goes?,/10000-maniacs/who-knows-where-the-time-goes.html,"Across the evening sky, all the birds are leav...",ENGLISH
130,/10000-maniacs/,Wildwood Flower,/10000-maniacs/wildwood-flower.html,i will twine with your mingles of raven black ...,ENGLISH
131,/10000-maniacs/,You Happy Puppet,/10000-maniacs/you-happy-puppet.html,[ music: Robert Buck/words: Natalie Merchant ]...,ENGLISH


# Artist Feature

In [273]:
art = pd.read_csv('artists-data.csv')

In [336]:
artist_list = sorted([i.lower() for i in art["Artist"]])
# print(artist_list)

In [341]:
songs_list = sorted([i.lower() for i in data["SName"]])

In [377]:
def search_song(song):
    if song in artist_list :
        song = song.replace(" ","-")
        song = "/" + song + "/"
        song_list = sorted([i for i in data[data.ALink==song]["SName"]])
        print(song_list)
        
    if song.lower() in songs_list :
        get_song_by_name(song)

In [378]:
search_song("Black Country Woman")

Song name: Black Country Woman
By: /led-zeppelin/
Lyric: Hey, hey, mama, what's the matter here (X2). You didn't have to tell me that you love me so. You didn't have to love me, mama, let me go. Hey, hey, mama, what's the matter here. You didn't have to make me a total disgrace. You didn't have to leave me with that beer in my face. Hey, hey, mama, what's the matter here. That's alright, it's awful dog-gone clear.. Hey, hey, baby, why you treat me mean (X2). You didn't have to crucify me like you did. You didn't have to tell me I was just your kid. Hey, hey, mama, why'd you treat me mean. You didn't have say you'd always be by my side. Y' didn't have to tell me you'd be my blushin' bride. Hey, hey, mama, why you treat me mean. But that's alright, I know your sisters, too. You didn't have to tell me that you love me so. You didn't have to leave me, mama, let me go. Hey, hey, mama, what is wrong with you. You didn't have to leave me like a total disgrace. You didn't have to leave me with

In [361]:
data[data.SName == "Black Country Woman"]

Unnamed: 0,ALink,SName,SLink,Lyric,Idiom
31935,/led-zeppelin/,Black Country Woman,/led-zeppelin/black-country-woman.html,"Hey, hey, mama, what's the matter here (X2). Y...",ENGLISH


# spell checking feature

In [34]:
from spellchecker import SpellChecker
import os
from pathlib import Path

In [36]:
path = "E:\CMU\953\IR481\Module4\IULA\EN"
os.chdir(path)
root_folder = os.listdir()

context = ''
for file in root_folder:
    folder = f"E:\CMU\953\IR481\Module4\IULA\EN\{file}"
    os.chdir(folder)
    plan_text = os.listdir()[1]
    file_path = f"{folder}\{plan_text}"
    temp = Path(file_path).read_text('utf-8')
    temp = temp.replace('\n', '')
    context += temp
    
context = re.sub('[^A-Za-z]'," ",context)
context = " ".join(context.split())
context = context.lower()

list_of_word = context.split(" ")

In [43]:
spell = SpellChecker()

spell.word_frequency.load_words(list_of_word)

word = "infomatiion I forgor everthing at the end of time"

misspelled = spell.unknown(word.split(" "))

if len(misspelled) != 0 :
    for word in misspelled:
        print(spell.correction(word))
        print(spell.candidates(word))
print("no mispelling")

forgot
{'forgot', 'forger', 'forgo'}
infomation
{'infomation'}
no mispelling
