# FINAL PROJECT - DATA CLEANING

#### IMPORT LIBRARIES

In [20]:
import pandas as pd
import numpy as np
from langdetect import detect, detect_langs
import nltk
from nltk.tokenize import word_tokenize
from nltk.text import Text
from nltk.corpus import stopwords
from nltk.probability import FreqDist
stop_words = stopwords.words('english')

from textblob import TextBlob

import matplotlib.pyplot as plt
import seaborn as sns

import ast
import re
import itertools
from autocorrect import Speller

In [None]:
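# One-time setup: uncomment to download the NLTK data this notebook relies on
# (the stopwords corpus and the tokenizer models used by word_tokenize).
# nltk.download()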

#### Import the dataset with all the song lyrics to clean

In [None]:
# Load the raw songs table; every later cell derives from this DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/songs_ed_sheeran_elton_john.csv")
print("Total songs:", df.shape[0])

#### Remove all songs that haven't been processed (lyrics still marked as 'Pending')

In [None]:
# Keep only the rows whose lyrics are no longer the 'Pending' placeholder.
# The copy lets the following cells modify df_clean without touching df.
df_clean = df.loc[df['lyrics'].ne('Pending')].copy()
print("Processed songs:", df_clean.shape[0])

#### Drop the artist_2 column, as it does not add much information

In [None]:
# Disabled alternative: derive a 'main_artist' column from the first name in the 'artists' list.
#df_clean['main_artist'] = df_clean['artists'].apply(lambda x: [n.strip() for n in ast.literal_eval(x)][0])
# Remove the artist_2 column from df_clean in place.
df_clean.drop('artist_2', axis=1, inplace=True)

#### Drop rows with missing values (these should only occur in the lyrics column, for songs whose lyrics we could not find)

In [None]:
# Discard, in place, every row that has a missing value in ANY column
# (how='any' is the pandas default, spelled out here for clarity).
df_clean.dropna(axis=0, how='any', inplace=True)
print("Songs with lyrics:", df_clean.shape[0])

#### Clean Lyrics

In [None]:
# Contraction -> expansion, applied in insertion order. The irregular forms
# come first so the generic "n't" rule does not turn "can't"/"won't" into
# "ca not"/"wo not".
_CONTRACTIONS = {
    "can't": "can not",
    "won't": "will not",
    "'s": " is",
    "n't": " not",
    "'m": " am",
    "'ll": " will",
    "'d": " would",
    "'ve": " have",
    "'re": " are",
}

_default_speller = None  # shared autocorrect Speller, built on first use


def _get_default_speller():
    """Return one shared English Speller instead of rebuilding it for every song."""
    global _default_speller
    if _default_speller is None:
        _default_speller = Speller(lang='en')
    return _default_speller


def clean_lyrics(lyrics, spell=None):
    """
    This function cleans the lyrics and leaves them as a succession of words.

    Steps, in order: newlines become spaces, the text is lower-cased,
    parentheses and web links are removed, contractions are expanded,
    characters repeated three or more times are cut down to two, and the
    result is run through a spell checker.

    parameters:
    lyrics: original string coming from Genius
    spell: optional callable taking and returning a string, used for the
           spell-check step. When omitted, a single English autocorrect
           Speller is created on first use and shared by all later calls.

    returns:
    the cleaned lyrics as one lower-case string

    Known limitation: "'s" is always expanded to " is", so possessives
    ("john's car") are expanded as well.
    """
    lyrics = lyrics.replace("\n", " ").lower()
    lyrics = lyrics.replace("(", "").replace(")", "")

    # remove the weblinks
    lyrics = re.sub(r'https?://\S+', "", lyrics)

    # Treat the typographic apostrophe like a plain one, otherwise words
    # such as "don\u2019t" would never match the contraction table.
    lyrics = lyrics.replace("\u2019", "'")

    # replace the contractions
    for contraction, expansion in _CONTRACTIONS.items():
        lyrics = lyrics.replace(contraction, expansion)
    lyrics = lyrics.strip()

    # One character should not be present more than twice in continuation.
    # Digits are excluded so numbers such as "1000" are not truncated.
    lyrics = re.sub(r'(\D)\1{2,}', r'\1\1', lyrics)

    # spell check
    if spell is None:
        spell = _get_default_speller()
    return spell(lyrics)

In [None]:
# Store a cleaned copy of every song's lyrics alongside the original text.
df_clean['lyrics_cleaned'] = df_clean['lyrics'].map(clean_lyrics)

#### Detect the language of the lyrics and keep only English songs

In [None]:
# langdetect is non-deterministic by default, so the same lyrics can be
# labelled differently from one run to the next. Fixing the seed makes the
# detected language (and therefore the English subset) reproducible.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Detection runs on the original lyrics, not on the cleaned version.
df_clean['language'] = df_clean['lyrics'].apply(detect)

In [None]:
# Number of songs per detected language, most frequent language first.
df_clean['language'].value_counts(sort=True, ascending=False)

In [None]:
# Keep an independent copy of the songs whose detected language is English.
df_english = df_clean.loc[df_clean['language'].eq('en')].copy()

#### Get all words for each song

In [None]:
def get_most_common_words(lyrics):
    """
    Count how often each word occurs in a song.

    The lyrics are tokenized with NLTK; only purely alphabetic tokens are
    kept, they are lower-cased, and English stop words are discarded.

    parameters:
    lyrics: text of the song

    returns:
    dict mapping every remaining word to its number of occurrences,
    ordered from the most to the least frequent word
    """
    alphabetic = (token.lower() for token in word_tokenize(lyrics) if token.isalpha())
    frequencies = FreqDist(word for word in alphabetic if word not in stop_words)
    return dict(frequencies.most_common())

In [None]:
# One {word: count} dictionary per song, computed from the original lyrics.
df_english['most_used'] = df_english['lyrics'].map(get_most_common_words)

#### Lyrics Polarity

In [None]:
# TextBlob sentiment polarity of each song, computed on the original lyrics.
df_english['lyrics_polarity'] = df_english['lyrics'].map(
    lambda text: TextBlob(text).sentiment.polarity
)

In [None]:
# Regression of lyrics polarity on the 'valence' column; scatter=False hides
# the individual songs so only the fitted line is drawn.
sns.regplot(
    data=df_english,
    x='valence',
    y='lyrics_polarity',
    scatter=False,
)

#### Split years by decades

In [None]:
# Bucket the release year into decades. Bins are right-inclusive, so
# (1949, 1959] -> '50s', (1959, 1969] -> '60s', and so on; years outside
# 1950-2029 get no label.
# NOTE: the last bin is (2009, 2029], so songs from the 2020s are labelled
# '10s' as well.
df_english['decade'] = pd.cut(
    df_english['year'],
    bins=[1949, 1959, 1969, 1979, 1989, 1999, 2009, 2029],
    labels=['50s', '60s', '70s', '80s', '90s', '00s', '10s'],
)

In [None]:
# Summary statistics of the lyrics polarity for each decade.
df_english.groupby('decade')['lyrics_polarity'].describe()

#### Popularity

In [None]:
# Summary statistics of the 'popularity' column for each decade.
df_english.groupby('decade')['popularity'].describe()

#### Save dataframe to csv file

In [None]:
# Persist the English-only table (with the derived columns) without the index column.
df_english.to_csv(path_or_buf='../data/songs_partial_english.csv', index=False)