Valence — A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# NLP
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# visualization
from matplotlib import pyplot as plt

### Import Data

In [2]:
def length(text):    
    '''a function which returns the length of text'''
    return len(text)

In [3]:
def substitute(text):
    regex = r'\n|\r|Hook 1'
    text = re.sub(regex, " ", text)
    text = re.sub(' +', ' ', text)
    return text

In [4]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    import string
    # replacing the punctuations with no space, 
    # which in effect deletes the punctuation marks 
    translator = str.maketrans('', '', string.punctuation) # Remove punctuation without replacing; space replace with space
    # return the text stripped of punctuation marks
    return text.translate(translator)

In [5]:
def stopwords(text):
    '''a function for removing the stopword'''
    sw = stopwords.words('english')
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)

In [6]:
def stemming(text):    
    '''a function which stems each word in the given text'''
    
    # create an object of stemming function
    stemmer = SnowballStemmer("english")

    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 

In [7]:
def createDF(fileName):
    
    # Import CSV file
    df = pd.read_csv(fileName,sep=',', index_col=[0])
    
    # Rename columns names: "seq": "lyrics", "song": "song name", "label":"valence"
    df.rename(columns={"seq": "lyrics", "song": "song name", "label":"valence"}, inplace=True)
    
    # Create binary column: 1 represent "happy" mood while 0 represent "sad column"
    df['Mood'] = np.where(df['valence'] > 0.5, 1, 0) 
    
    return df

In [8]:
def dataCleansing(df):
    
    # Substitue special regex/characters
    df['lyrics'] = df['lyrics'].apply(substitute)
    
    # Remove puncuation
    df['lyrics'] = df['lyrics'].apply(remove_punctuation)
    
    # Lowercase all words
    df['lyrics'] = df['lyrics'].apply(lambda x:x.lower())
    
    # Create 'length' column that represent the lyrics' number of words
    df['length'] = df['lyrics'].apply(length)
    
    # Keep song with lyrics length between 500 and 2000
    df = df[(df['length'] < 2000) & (df['length'] > 500)]
    
    df.drop_duplicates(subset=['lyrics'], inplace=True)
    
    # Remove StopWords
    #data['lyrics'] = data['lyrics'].apply(stopwords)
    
    # Stemming
    df['lyrics'] = df['lyrics'].apply(stemming)
    
    return df

In [9]:
fileName = 'labeled_lyrics_cleaned.csv'
lyrics_df = createDF(fileName)
lyrics_df = dataCleansing(lyrics_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(subset=['lyrics'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lyrics'] = df['lyrics'].apply(stemming)


In [10]:
lyrics_df

Unnamed: 0,artist,lyrics,song name,valence,Mood,length
0,Elijah Blake,no no i aint ever trap out the bando but oh lo...,Everyday,0.626,1,1835
2,Elijah Blake,she dont live on planet earth no more she foun...,The Otherside,0.240,0,1366
3,Elijah Blake,trippin off that grigio mobbin light low tripp...,Pinot,0.536,1,1691
4,Elijah Blake,i see a midnight panther so gallant and so bra...,Shadows & Diamonds,0.371,0,824
5,Elijah Blake,i just want to readi your mind caus ill still ...,Uno,0.321,0,1095
...,...,...,...,...,...,...
158347,Adam Green,bind me gag me take me to the bunnyranch peopl...,Bunnyranch,0.723,1,567
158348,Adam Green,and we live on borrow time but this headshot p...,Friends of Mine,0.737,1,810
158349,Adam Green,frozin in time forev carri that torch for so l...,Frozen in Time,0.482,0,645
158351,Adam Green,i want to chose to die and be buri with a rubi...,I Wanna Die,0.361,0,531


In [11]:
lyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109032 entries, 0 to 158352
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   artist     109032 non-null  object 
 1   lyrics     109032 non-null  object 
 2   song name  109032 non-null  object 
 3   valence    109032 non-null  float64
 4   Mood       109032 non-null  int32  
 5   length     109032 non-null  int64  
dtypes: float64(1), int32(1), int64(1), object(3)
memory usage: 5.4+ MB


In [12]:
lyrics_df.describe(include='all')

Unnamed: 0,artist,lyrics,song name,valence,Mood,length
count,109032,109032,109032,109032.0,109032.0,109032.0
unique,9651,108554,78301,,,
top,Elvis Presley,swing low sweet chariot comin for to carri me ...,Home,,,
freq,551,3,84,,,
mean,,,,0.500215,0.488214,1029.160934
std,,,,0.246096,0.499863,347.572502
min,,,,0.0,0.0,501.0
25%,,,,0.301,0.0,751.0
50%,,,,0.491,0.0,973.0
75%,,,,0.696,1.0,1253.0


In [19]:
lyrics_df.to_csv("Song Lyrics After Cleaning.csv")