##### Following cleaning is one time process.

Few cleanings involved:

1) Removal of non-english songs which takes longer time to execute since data size is big

2) Removal of duplicate songs like cover songs based on song name usually songs along with [] mentioned after original song name

3) Setting up files for human tagging

4) indicoio api is used to get sentiment and emotion scores and for model building we have already saved scores. Limit for api got exhausted so api call may not work

5) Preparing data for emotion band grouping and emotion scores retrieved through api

Files Tagged_songs_combined_final.csv and Emotional_scores_500 are used for model

### We won't recommend to run this script since it takes longer time to execute as well as limited number of api calls.


In [1]:
import os

os.chdir('..')
print("Current Working Directory " , os.getcwd())
#
os.chdir('Datasets')

Current Working Directory  E:\MITB\Text Analytics\Project\Datasets\Submission_folder


In [2]:
# Data cleaning and preparation for modeling

from langdetect import detect
import pandas as pd
import re
import indicoio
indicoio.config.api_key = 'f44acadd7a4d32ef0b944cfc0a0347fc'


In [3]:
data=pd.read_csv("Lyrics1.csv")


In [27]:
# Function to detect language if song is english include

def language_detect(lyrics):
    try:
        t = detect(lyrics[:100])
    except:
        t = None
    
    return t

In [28]:
# Function call to detect language
data['Language'] = data.apply(lambda x: language_detect(x[1]),axis = 1)

In [31]:
# Pattern to check if there are any remix songs exists, assuming songs with square brackets [] are remix songs

pattern = r'\[.*?\]'

data['Remix'] = data['Song'].apply(lambda x: "Matched" if re.compile(pattern).search(str(x)) else "Not Matched")

new_df = data[data.Remix == "Not Matched"].reset_index(drop = True)


In [32]:
# Create short lyrics to eliminate similar songs

# Short lyrics is 50 charecters from lyrics since we assume most of the songs might have same start and can be 

new_df['Short_Lyrics'] = new_df['Lyrics'].apply(lambda x: re.sub(r"[^a-zA-Z]+", ' ', x[:50]).lower().replace("\\r", "").replace("\\n", " ").replace(" ", ""))


In [35]:
# Preparing if there are any duplicate songs

duplicate_song_list = list(group_song_freq[group_song_freq.No_of_songs > 1].Songs)

dup_songs = new_df[new_df.Song.isin(duplicate_song_list)].reset_index(drop = True)


In [36]:
# Based on duplicate song list and short lyrics find similar song and find cover song


new_df['count'] = 1
grouped_df  = new_df[new_df.Song.isin(duplicate_song_list)].groupby(['Song','Short_Lyrics']).agg({'count':sum})

g = grouped_df['count'].groupby(level=0, group_keys=False)
finalList_withShortLyrics = g.nlargest(1)
finalList = finalList_withShortLyrics.rename_axis(['Song','Short_Lyrics']).reset_index(name='No_of_songs')


In [37]:
# Process to remove duplicate songs

short_lyrics_toremove = set(finalList['Short_Lyrics'])
list_dup_songs = set(finalList['Song'])
dup_songs['Duplicated'] = dup_songs.apply(lambda x: 0 if (x[2] in list_dup_songs) & (x[5] in short_lyrics_toremove) else 1 , axis = 1)
dup_songs = dup_songs[dup_songs['Duplicated'] == 0].reset_index(drop = True)
dup_songs = dup_songs.drop_duplicates(subset=['Song','Short_Lyrics'], keep="last").reset_index(drop = True)
dup_songs = dup_songs.drop(['Language','Remix','Duplicated'], axis = 1)
Only_song_list = new_df[ ~ new_df.Song.isin(duplicate_song_list)].reset_index(drop = True)
df_final = pd.concat([Only_song_list, dup_songs],axis = 0).reset_index(drop = True)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [38]:
df = df_final.drop(['Language','Remix','count'], axis = 1)


In [40]:
df_groupbyband = df_final.Band.value_counts().rename_axis('Band').reset_index(name='No_of_songs')
df_groupbybandtop500 = list(df_groupbyband.iloc[0:500,0])
sampleForTagging = df[df.Band.isin(df_groupbybandtop500)].groupby('Band').apply(lambda x: x.sample(n = 10, replace = True)).reset_index(drop = True)
sampleForTagging = sampleForTagging[['Band','Lyrics','Song']]
sampleForTagging.Lyrics = sampleForTagging.Lyrics.replace(regex='\\r',value='')
sampleForTagging['Polarity'] = ""
sampleForTagging['Emotion'] = ""

In [7]:
# Duplicate songs removal of a song function
def removeDuplicatesLines(k):
    lines_seen = list()
    for line in k:
        if line not in lines_seen:
            lines_seen.append(line)
    clean_lyric = '\n'.join(lines_seen)
    
    return clean_lyric

In [5]:
# Tagged songs for sentiment analysis load csv files

with open('Tagged_songs_combined.csv', encoding='utf-8', errors = 'ignore') as file:
        songs_df = pd.read_csv(file)


In [8]:
# Remove duplicate lines
songs_df['Clean_lyrics'] = songs_df['Lyrics'].apply(lambda x: removeDuplicatesLines(x.splitlines()))

In [11]:
songs_df['sentiment_score_indicoio']=indicoio.sentiment(list(songs_df['Clean_lyrics']))

In [12]:
songs_df.to_csv('Tagged_songs_combined_final.csv',index = False)

In [19]:
# EMotion band grouping and emotion scores retrieved though api

# Output of api is in form of dictionary
df = pd.read_csv('Songs_clean_en.csv')
df['Clean_lyrics'] = df['Lyrics'].apply(lambda x: removeDuplicatesLines(x.splitlines()))

Bands = df['Band'].value_counts().rename_axis('Band').reset_index(name='No_of_songs')
Band_list_gt10 = Bands[Bands.No_of_songs >= 10].Band

top_bands = Band_list_gt10[0:500]
df_classify = df[df.Band.isin(top_bands)].groupby('Band').apply(lambda x: x.sample(n = 10, replace = True)).reset_index(drop = True)


In [20]:
df_classify['Emotional_score']=indicoio.emotion(list(df_classify['Clean_lyrics']))

In [21]:
# Result dictionary of api has been seperated out 

df_classify['sadness'] = df_classify['Emotional_score'].apply(lambda x: x['sadness'])
df_classify['joy'] = df_classify['Emotional_score'].apply(lambda x: x['joy'])
df_classify['anger'] = df_classify['Emotional_score'].apply(lambda x: x['anger'])
df_classify['fear'] = df_classify['Emotional_score'].apply(lambda x: x['fear'])
df_classify['surprise'] = df_classify['Emotional_score'].apply(lambda x: x['surprise'])

In [24]:
df_classify.to_csv('Emotional_scores_500.csv', index = False)