# Preprocessing

In [1]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
import random
import nltk
from scipy import sparse
from scipy.sparse import csr_matrix, vstack
from textblob import TextBlob
from langdetect import detect_langs
import pickle
from datetime import datetime

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# import data
song_df = pd.read_csv('./songlyrics/songdata.csv')
song_df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
text_in_round_brackets = sum(list(song_df['text'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(text_in_round_brackets)))

Number of round brackets: 63285


In [4]:
random.seed(0)
random.choices(text_in_round_brackets, k=20)

['beautiful',
 'oh aah',
 'no no',
 'For sure',
 'Dolly Parton',
 'oo',
 'Still in love',
 'go Low',
 'provoke',
 'No batteries required',
 'from the DP',
 'hey',
 '2x',
 'Got to be there',
 'my boys',
 'Cough up the bucks',
 'I might as well talk to a brick wall',
 'On my own',
 'Just to keep things right',
 'I love cheap thrills']

In [5]:
text_in_square_brackets = sum(list(song_df['text'].map(lambda s: re.findall(r'\[(.*?)\]',s))), [])
print('Number of square brackets: {}'.format(len(text_in_square_brackets)))

Number of square brackets: 29009


In [6]:
random.seed(0)
random.choices(text_in_square_brackets, k=20)

['Chorus x2',
 'Verse 2',
 '3x',
 'Incomprehensible',
 'Pre-Chorus',
 'Think about your mother',
 'Pink Floyd cover',
 'Chorus',
 'Repeat: x6',
 'Chorus',
 'Chorus',
 'Chorus',
 'Verse 2:',
 'Chorus',
 'Chorus',
 'chorus 2x',
 ' sax ',
 'you hapen to go thru right now',
 'Chorus',
 'Hook']

Looking at text within round bracket and square bracket. The text within square bracket are more likely not relevant to the lyrics. So the square bracket including text are removed

In [7]:
# remove round brackets but not text within
song_df['text'] = song_df['text'].map(lambda s: re.sub(r'\(|\)', '', s))

# remove square brackest and text within
song_df['text'] = song_df['text'].map(lambda s: re.sub(r'\[(.*?)\] ', '', s))

Remove line breaks and count the number of line breaks

In [8]:
# count number of lines
song_df['lines'] = song_df['text'].map(lambda t: len(re.findall(r'\n', t)))
# remove line breaks
song_df['text'] = song_df['text'].map(lambda s: re.sub(r' \n|\n', '', s))
song_df.head()

Unnamed: 0,artist,song,link,text,lines
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face And it...",20
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please Touch me gently l...",53
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go Why I had to p...,39
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,40
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,40


In [9]:
song_df['text'][0]

"Look at her face, it's a wonderful face And it means something special to me Look at the way that she smiles when she sees me How lucky can one fellow be?  She's just my kind of girl, she makes me feel fine Who could ever believe that she could be mine? She's just my kind of girl, without her I'm blue And if she ever leaves me what could I do, what could I do?  And when we go for a walk in the park And she holds me and squeezes my hand We'll go on walking for hours and talking About all the things that we plan  She's just my kind of girl, she makes me feel fine Who could ever believe that she could be mine? She's just my kind of girl, without her I'm blue And if she ever leaves me what could I do, what could I do?"

Remove non-english songs using langdetect by calculating the probability if the text is english

In [10]:
def detectLang(text):
    textLang = detect_langs(text)
    for detection in textLang:
        if detection.lang == 'en':
            return detection.prob
        
    return 0

song_df['EngProb'] = song_df['text'].map(detectLang)
print('English songs : {}'.format(sum(song_df['EngProb']>=0.5)))
print('Non-English songs : {}'.format(sum(song_df['EngProb']<0.5)))

English songs : 57184
Non-English songs : 466


In [11]:
song_df = song_df.loc[song_df['EngProb'] >= 0.5]

In [12]:
len(song_df)

57184

In [13]:
## Tokenization
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
song_df['tokens'] = song_df['text'].map(tokenizer.tokenize)

print('Text:')
print(song_df['text'].iloc[0])

print('Tokens:')
print(song_df['tokens'].iloc[0])

Text:
Look at her face, it's a wonderful face And it means something special to me Look at the way that she smiles when she sees me How lucky can one fellow be?  She's just my kind of girl, she makes me feel fine Who could ever believe that she could be mine? She's just my kind of girl, without her I'm blue And if she ever leaves me what could I do, what could I do?  And when we go for a walk in the park And she holds me and squeezes my hand We'll go on walking for hours and talking About all the things that we plan  She's just my kind of girl, she makes me feel fine Who could ever believe that she could be mine? She's just my kind of girl, without her I'm blue And if she ever leaves me what could I do, what could I do?
Tokens:
['Look', 'at', 'her', 'face', 'it', 's', 'a', 'wonderful', 'face', 'And', 'it', 'means', 'something', 'special', 'to', 'me', 'Look', 'at', 'the', 'way', 'that', 'she', 'smiles', 'when', 'she', 'sees', 'me', 'How', 'lucky', 'can', 'one', 'fellow', 'be', 'She', 's

In [14]:
song_df.drop(columns=['link'],inplace=True)
song_df.head()

Unnamed: 0,artist,song,text,lines,EngProb,tokens
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face And it...",20,0.999997,"[Look, at, her, face, it, s, a, wonderful, fac..."
1,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently l...",53,0.999997,"[Take, it, easy, with, me, please, Touch, me, ..."
2,ABBA,As Good As New,I'll never know why I had to go Why I had to p...,39,0.999997,"[I, ll, never, know, why, I, had, to, go, Why,..."
3,ABBA,Bang,Making somebody happy is a question of give an...,40,0.999997,"[Making, somebody, happy, is, a, question, of,..."
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,40,0.999998,"[Making, somebody, happy, is, a, question, of,..."


In [15]:
## Stemming
stemmer = nltk.stem.porter.PorterStemmer()

tokenToStem = {}
tokenCount = 0

for lists in song_df['tokens']:
    for token in lists:
        tokenCount +=1
        
        if token not in tokenToStem:
            tokenToStem[token] = stemmer.stem(token)
            
song_df['stems'] = song_df['tokens'].map(lambda lists: [tokenToStem[token] for token in lists])

print('Number of tokens: {}'.format(tokenCount))
print('Number of unique tokens: {}'.format(len(tokenToStem.keys())))
print('Number of unique stems: {}'.format(len(set(tokenToStem.values()))))
song_df.head()

Number of tokens: 13255935
Number of unique tokens: 103312
Number of unique stems: 57414


Unnamed: 0,artist,song,text,lines,EngProb,tokens,stems
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face And it...",20,0.999997,"[Look, at, her, face, it, s, a, wonderful, fac...","[look, at, her, face, it, s, a, wonder, face, ..."
1,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently l...",53,0.999997,"[Take, it, easy, with, me, please, Touch, me, ...","[take, it, easi, with, me, pleas, touch, me, g..."
2,ABBA,As Good As New,I'll never know why I had to go Why I had to p...,39,0.999997,"[I, ll, never, know, why, I, had, to, go, Why,...","[I, ll, never, know, whi, I, had, to, go, whi,..."
3,ABBA,Bang,Making somebody happy is a question of give an...,40,0.999997,"[Making, somebody, happy, is, a, question, of,...","[make, somebodi, happi, is, a, question, of, g..."
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,40,0.999998,"[Making, somebody, happy, is, a, question, of,...","[make, somebodi, happi, is, a, question, of, g..."


# EDA

In [16]:
print('Number of songs: ', str(len(song_df)))
print('number of artists: ', str(len(song_df['artist'].unique())))

Number of songs:  57184
number of artists:  638


In [17]:
song_count_df = song_df.groupby('artist')[['song']].count()
song_count_df

Unnamed: 0_level_0,song
artist,Unnamed: 1_level_1
'n Sync,93
ABBA,112
Ace Of Base,73
Adam Sandler,70
Adele,54
...,...
Zoegirl,38
Zornik,12
Zox,21
Zucchero,23


In [19]:
fig = px.histogram(song_count_df, x='song', title='Songs per artist', labels={'song': 'Songs'})
fig.show()

In [21]:
# Words per song
song_df['n_stems'] = song_df['stems'].map(len)

fig = px.histogram(song_df, x='n_stems', title='Words per song')
fig.show()

In [22]:
# create dataframe with lists of artists
song_df['stems_str'] = song_df['stems'].map(lambda lst: ' '.join(lst))

# map text to artists
stems_to_artist = {}
for tp in song_df[['artist', 'stems_str']].itertuples(index=False):
    artist = tp[0]
    stems = tp[1]
    if stems in stems_to_artist:
        stems_to_artist[stems].append(artist)
    else:
        stems_to_artist[stems] = [artist]

In [None]:
# insert list of artists to dataframe
song_df['artists'] = song_df['stems_str'].map(stems_to_artist)
song_df['duplicates'] = song_df['artists'].map(len) - 1
song_df.head()