In [43]:
import requests
import html
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup as bs

import time
import os
import seaborn as sns
import pandas as pd
import scipy as sc
import numpy as np
import re

import statsmodels.formula.api as sm

import matplotlib.pyplot as plt 
plt.style.use('ggplot')
%matplotlib inline  
plt.rcParams['figure.figsize'] = (10, 6) 

# Data Acquisition & Data Cleaning & Data Vectorization

# Part 1 - Load and clean the data
The following cells perform 2 things:
* load the csv file which contain the dataframe of spotify songs
* replace all of the unnecessary punctuation in each title and artist for the further work with scraping

In [44]:
df_spotify=pd.read_csv("./data/spotify.csv")
df_spotify=df_spotify.drop('Unnamed: 0', 1)    

## Define a function to clean the titles and artists string for the search in metrolyrics

In [45]:
def cleanDf(df):
    for i in range(0,len(df)):
        df['Title'][i]=df['Title'][i].replace('MotorSport','motor sport').replace('PILLOWTALK','pillow talk').replace('Back To You','Back 2 you')
        
        list=df['Title'][i].split(' (feat. ')
        df['Title'][i]=list[0]
        list=df['Title'][i].split('- From ')
        df['Title'][i]=list[0]
        list=df['Title'][i].split(' feat. ')
        df['Title'][i]=list[0]
        list=df['Title'][i].split(' (with ')
        df['Title'][i]=list[0]
        list=df['Title'][i].split(' (Original ')
        df['Title'][i]=list[0]
        list=df['Title'][i].split(' (From ')
        df['Title'][i]=list[0]
        list=df['Title'][i].split(' (Fifty ')
        df['Title'][i]=list[0]
        
        df['Title'][i]=df['Title'][i].replace("Wanna","want to").replace('\n','').replace(';','').replace("'","").replace(',','').replace('/ ','').replace('- ','').replace('-','').replace('é','e').replace('?','').replace('"','').replace("!","").replace("in'","ing").replace(" and "," ").replace("’","")
        df['Title'][i]=df['Title'][i].replace('(','').replace(')','').replace('.','')
        
        df['Artist'][i]=df['Artist'][i].replace('P!nk','Pink').replace('NERD','nerd the neptunes').replace('ZAYN','zayn malik').replace('Axwell /\ Ingrosso','Axwell Ingrosso').replace('Ayo & Teo','Ayo Teo')
        df['Artist'][i]=df['Artist'][i].replace('é','e').replace('í','i').replace('.','').replace('\n','').replace(' + ','-').replace('.','').replace('ó','o').replace('$','s').replace("'","").replace("!","")

## Clean our dataframe

In [46]:
cleanDf(df_spotify)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy 

## In the next cell we create a function to add a column to our dataframe

In [47]:
def append_col(df,lst,colName):
    df[colName] = lst

## Checking the amount of rows in dataframe

In [48]:
len(df_spotify)

587

# Part 2 - Load and store lyrics data


## Define a function to create the lyrics_urls list to store each song's lyrics url on metrolyrics
The following function perform 5 things:
* create list of urls from metrolyrics for each song 
* replace all of the unnecessary strings from the title of each song
* handle with cases of several artists of a song. split them and take the first one to build with it a url
* replace all of the spaces to '-' for the url
* clean the text from unnecessary punctuation

In [49]:

def create_lyrics_urls(df):
    #creating the list of urls for each song's lyrics
    lyrics_urls=[]
    for i in range (0,len(df)):
        #the case of & in songs:
        name=df['Title'][i].replace("& ","").replace("'","")
        name=name.replace(" ","-")

        list=df['Artist'][i].split(',')
        #we will always take the first artist
        artist_str=list[0]
        list=artist_str.split(' & ')
        artist_str=list[0]
        list=artist_str.split(' and ')
        artist_str=list[0]
        list=artist_str.split('featuring')
        artist_str=list[0]

        url='https://www.metrolyrics.com/'+name+'-lyrics-'+artist_str.replace('"','').replace(" ","-")+'.html'
        #the case of "Maroon-5-.html"
        url=url.replace("-.",".") 

        lyrics_urls.append(url)      
    return lyrics_urls    

In [50]:
urls=create_lyrics_urls(df_spotify)
len(urls) 

587

### Loading the nltk package for the further work with text

In [51]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, movie_reviews

## Define function to delete the stopwords from the lyrics

In [52]:
stop_words = set(stopwords.words('english'))

def remove_mystopwords(string):
    tokens = string.split(" ")
    tokens_filtered = [word for word in tokens if not word in stop_words]
    return (" ").join(tokens_filtered)

## Define function to count the stopwords in each song

In [53]:
def count_stopwords(string):
    count=0
    tokens = string.split(" ")
    for word in tokens:
        if word in stop_words:
            count+=1
    return count
    

## The following cell pefrom 5 things:

* scrap the metrolyrics
* load the lyrics of each song
* check the amount of words in each song 
* check the amount of stopwords in each song 
* clean the text from digits, stopwords, and unnecessary punctuation


In [54]:
#the part of creation of the lyrics_texts dictionary which contain the lyrics of each song in top 2016,2017,2018

all_words_count=[]
stopwords_count=[]

def find_lyrics_txt(df):
    lyrics_texts=[]
    time.sleep(7)
    for i in range (0,len(df)):
        url = urls[i]
        response = requests.get(url)
        data = response.text
        soup = bs(data,'html.parser')
        couplets = soup.findAll("p",{"class":"verse"})
        temp = ""
        for p in couplets:
            temp = str(temp) + str(p.text)
            temp=temp+"\n"
        #cleaning the text from unnecessary punctuation
        temp=re.sub(r'[^(a-zA-Z)\s]','', temp)
        temp=temp.replace('[','').replace(']','').replace('(','').replace(')','').replace('"','').replace("'","")
        #remove all of the digits 
        temp=''.join(j for j in temp if not j.isdigit())
        #count the number of words in each song
        all_words_count.append(len(temp))
        #count the number of stopwords in each song
        stopwords_c=count_stopwords(temp)
        stopwords_count.append(stopwords_c)
        #remove all of the stopwords
        temp = remove_mystopwords(temp)
        #for the further work with text we would like to lowercase all of the words in text   
        lyrics_texts.append(temp.lower())
        lyrics_texts[i]=lyrics_texts[i].replace('\n ','\n')  
    return lyrics_texts    

In [None]:
lyrics=find_lyrics_txt(df_spotify)

## Add 2 columns to our dataframe

In [None]:
append_col(df_spotify,all_words_count,'words_count')
append_col(df_spotify,stopwords_count,'stopwords_count')
print(stopwords_count)

In [None]:
print(all_words_count)

In [None]:
lyrics_without_empty_strings = []
for string in lyrics:
    if (string != ""):
        lyrics_without_empty_strings.append(string)

## In the next cell we want to check the amount of songs with zero words (the algorithm of finding the url for metrolyrics didn't work for them) so we can delete them in the dataframe in the future

In [None]:
res_list = [] 
count=0
for i in range(0, len(all_words_count)) : 
    if all_words_count[i] == 0 : 
        res_list.append(i)    
print(res_list)        

## Cleaning: In the next cell we drop every row which value of amout of words in song is 0

In [None]:

for i in range (0,len(df_spotify)):
    if (df_spotify['words_count'][i]) == 0:
        df_spotify['words_count'][i]=pd.np.NaN
        
df_spotify=df_spotify.dropna()

## Check the amount of rows in a new dataframe without nans

In [None]:
len(df_spotify)

## Defining a text as positive, negative or neutral using textblob package

In [None]:
# !pip install textblob

## Getting the polarity value for each song and defining the range for each pnn value: 

* pnn between -0.3 and 0.3 is neutral and its value will marked as 0
* pnn above 0.3 is positive and its value will marked as 1
* pnn below -0.3 is negative and its value will marked as -1

In [None]:
from textblob import TextBlob
# The sentiment function of textblob returns two properties, polarity, and subjectivity.
# Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement

def pos_neg_neutral(lyrics_texts):
    blobs=[]
    for i in range (0,len(lyrics_texts)):
        blob=TextBlob(lyrics_texts[i])
        pnn=blob.sentiment.polarity
        definition=0
        if -0.3 < pnn < 0.3:
            definition=0
        if pnn  <= -0.3:
            definition=-1
        if pnn >= 0.3:
            definition=1
        blobs.append(definition)    
    return blobs

In [None]:
blobs_polarity=pos_neg_neutral(lyrics_without_empty_strings)

In [None]:
len(blobs_polarity)

In [None]:
print(blobs_polarity)

## In the next cell we will check hom many negative,neutral,positive songs do we have

In [None]:
blobs_res={"pos": 0, "neg": 0, "neutral": 0}
for i in range (0,len(blobs_polarity)):
    if blobs_polarity[i]==1:
        blobs_res["pos"]+=1
    if blobs_polarity[i]==-1:
        blobs_res["neg"]+=1
    if blobs_polarity[i]==0:
        blobs_res["neutral"]+=1
print(blobs_res)        

In [None]:
append_col(df_spotify,blobs_polarity,'blobs_polarity')

## In the next cell we will tokenize (convert the whole lyrics text to words) the lyrics of each song

In [None]:
tokenized_lyrics=[]

for i in range (0,len(df_spotify)):
    tokens=word_tokenize(lyrics_without_empty_strings[i])
    tokenized_lyrics.append(tokens)

In [None]:
print(str(tokenized_lyrics[0]))

## The next 3 cells perfom 3 things:

* create a list with positive words by scraping a web page that includes the necessary information for this and count the total amount of these words in each song
* create a list with negative words by scraping a web page that includes the necessary information for this and count the total amount of these words in each song
* create a list with curse words by scraping a web page that includes the necessary information for this and count the total amount of these words in each song

In [None]:
count_pos=[]

keywords_pos=[]

url='https://www.enchantedlearning.com/wordlist/positivewords.shtml'
response = requests.get(url)
data = response.text
soup = bs(data,'html.parser')
words = soup.findAll("div",{"class":"wordlist-section"})

for div in words:
    divs=soup.findAll("div",{"class":"wordlist-item"})
    for div in divs:
        keywords_pos.append(div.text)

for i in range (0,len(df_spotify)):
    temp=[]
    for word in keywords_pos:
        #here we create a temp list to store all the words from keywords_neg which appear in lyrics 
        temp+=re.findall(word, str(lyrics_without_empty_strings[i]))
    #now we'd want to know how many times these words appeared in text
    count_pos.append(len(temp))
print(count_pos)         

In [None]:
count_neg=[]

keywords_neg=[]

url='https://www.enchantedlearning.com/wordlist/negativewords.shtml'
response = requests.get(url)
data = response.text
soup = bs(data,'html.parser')
words = soup.findAll("div",{"class":"wordlist-section"})

for div in words:
    divs=soup.findAll("div",{"class":"wordlist-item"})
    for div in divs:
        keywords_neg.append(div.text)
        
for i in range (0,len(df_spotify)):
    temp=[]
    for word in keywords_neg:
        #here we create a temp list to store all the words from keywords_neg which appear in lyrics 
        temp+=re.findall(word, str(lyrics_without_empty_strings[i]))
    #now we'd want to know how many times these words appeared in text
    count_neg.append(len(temp))
print(count_neg)  

In [None]:
count_swear=[]

keywords_swear=[]

url='https://en.wiktionary.org/wiki/Category:English_swear_words'
response = requests.get(url)
data = response.text
soup = bs(data,'html.parser')
div_class = soup.findAll("div",{"class":"mw-category-group"})

for div in div_class:
    uls=div.find_all('ul')
    for ul in uls:
        lis=ul.find_all('li')
        for ls in lis:
            links=ls.find_all('a')
            for link in links:
                keywords_swear.append(link.text)
                
keywords_swear.pop(0)         

for k in range(0,len(keywords_swear)):
    keywords_swear[k] =keywords_swear[k].lower() 

# print(keywords_swear)

for i in range (0,len(df_spotify)):
    temp=[]
    for word in keywords_swear:
        #here we create a temp list to store all the words from keywords_neg which appear in lyrics 
        temp+=re.findall(word, str(lyrics_without_empty_strings[i]))
    #now we'd want to know how many times these words appeared in text
    count_swear.append(len(temp))
print(count_swear)  

In [None]:
append_col(df_spotify,count_neg,'words_neg_count')
append_col(df_spotify,count_pos,'words_pos_count')
append_col(df_spotify,count_swear,'words_curse_count')

## In the next cell we'd like to find the total count of repeated words and the most common words in each song using Counter

In [None]:
from collections import Counter

reg = re.compile('\S{4,}')

repeated_words=[]
most_common_words=[]

for i in range (0,len(df_spotify)):
    c = Counter(ma.group() for ma in reg.finditer(lyrics_without_empty_strings[i]))
    repeated_words.append(sum(c.values()))
    most_common_words.append([k for k,v in c.most_common()])

In [None]:
type(most_common_words[0])

In [None]:
avg_most_common_words_size=[]

for i in range(0,len(most_common_words)):
    total=0
    for word in most_common_words[i]:
        total += len(word)
    ave_size = round(float(total) / float(len(most_common_words[i])) , 2) 
    avg_most_common_words_size.append(ave_size)
        
print(avg_most_common_words_size)        

In [None]:
append_col(df_spotify,avg_most_common_words_size,'avg_most_common_words_size')
append_col(df_spotify,repeated_words,'words_repeat_count')

In [None]:
df_spotify.to_csv('./data/spotify_after.csv',index=False)

## From this point we saw that we schould add an additional data to our dataframe 

In [None]:
df_spotify_add_pct = pd.read_csv('./data/spotify_after.csv')

In [None]:
df_spotify_add_pct.columns

## In the next cell we add new columns to the dataframe to see the percentage of:

* amount of negative words in the song
* amount of positive words in the song
* amount of curse words in the song
* amount of repeated words in the song
* amount of stopwords in the song

## to the total number of words in a song

In [None]:

df_spotify_add_pct['words_negative_pct'] = round((df_spotify_add_pct['words_neg_count'] / df_spotify_add_pct['words_count'])*100,2)
df_spotify_add_pct['words_positive_pct'] = round((df_spotify_add_pct['words_pos_count'] / df_spotify_add_pct['words_count'])*100,2)
df_spotify_add_pct['words_curse_pct'] = round((df_spotify_add_pct['words_curse_count'] / df_spotify_add_pct['words_count'])*100,2)
df_spotify_add_pct['words_repeat_pct'] = round((df_spotify_add_pct['words_repeat_count'] / df_spotify_add_pct['words_count'])*100,2)
df_spotify_add_pct['stopwords_pct'] = round((df_spotify_add_pct['stopwords_count'] / df_spotify_add_pct['words_count'])*100,2)


In [None]:
df_spotify_add_pct[:10]

In [None]:
df_spotify_add_pct.to_csv('./data/spotify_after.csv',index=False)