In [167]:
import numpy as np
import pandas as pd
import os
import re
import string
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import nltk
# Fetch the NLTK 'words' corpus used below to build the vocabulary set.
nltk.download('words')
# spaCy's small English pipeline; clean_string uses it for tokenising,
# stop-word flags and lemmas.
nlp = spacy.load('en_core_web_sm')
# Vocabulary set from the NLTK 'words' corpus; clean_string uses it to drop
# alphabetic tokens that are not in the corpus.
words = set(nltk.corpus.words.words())

from alphabet_detector import AlphabetDetector
# clean_string uses this detector to filter out non-Latin characters.
ad = AlphabetDetector()

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

# nlp = English()

[nltk_data] Downloading package words to /Users/lucid75/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
# Relative path of every entry in the 'lyrics' directory, in sorted order.
file_list = sorted(f'lyrics/{name}' for name in os.listdir('lyrics'))

In [3]:
# Parse each lyrics JSON file into its own DataFrame, keeping file_list order.
df_list = [pd.read_json(path) for path in file_list]

In [4]:
# Stack the per-file frames row-wise into one frame; each frame's original
# index labels are kept as-is.
lyrics_df = pd.concat(df_list, axis = 0)

In [5]:
# Row/column counts of the combined lyrics frame.
lyrics_df.shape

(87359, 4)

In [6]:
# Music library export; supplies the 'My Rating', 'Skips' and 'Plays' columns
# used below to derive the "liked" label.
music_df = pd.read_csv('all_music.csv')

In [7]:
# List the columns available in the library export.
music_df.columns

Index(['Name', 'Artist', 'Composer', 'Album', 'Grouping', 'Work',
       'Movement Number', 'Movement Count', 'Movement Name', 'Genre', 'Size',
       'Time', 'Disc Number', 'Disc Count', 'Track Number', 'Track Count',
       'Year', 'Date Modified', 'Date Added', 'Bit Rate', 'Sample Rate',
       'Volume Adjustment', 'Kind', 'Equalizer', 'Comments', 'Plays',
       'Last Played', 'Skips', 'Last Skipped', 'My Rating', 'Location'],
      dtype='object')

In [8]:
# A missing skip or play count is treated as zero.
for count_col in ('Skips', 'Plays'):
    music_df[count_col] = music_df[count_col].fillna(0)

In [9]:
# A missing rating is stored as 0, which convert_rating treats as "unrated".
music_df['My Rating'] = music_df['My Rating'].fillna(0)

In [10]:
# modifying training data for songs that got many skips and not many plays

def convert_rating(df):
    """Derive a binary "liked" label for every row of ``df``.

    Reads the 'My Rating' and 'Skips' columns and returns a list with one
    entry per row, in row order:

    * 1    -- 'My Rating' is greater than 0
    * 0    -- 'My Rating' equals 0 and 'Skips' is at least 1
    * None -- anything else (no label)
    """
    def _label(song):
        # Any positive rating counts as liked.
        if song['My Rating'] > 0:
            return 1
        # Unrated but skipped at least once counts as not liked.
        unrated_and_skipped = (song['My Rating'] == 0) & (song['Skips'] >= 1)
        return 0 if unrated_and_skipped else None

    return [_label(song) for _, song in df.iterrows()]

In [11]:
# convert_rating returns a plain list with one label per music_df row, so this
# assignment is purely positional.
# NOTE(review): this presumes music_df rows line up one-to-one, in the same
# order, with lyrics_df rows -- confirm against how the lyrics files were built.
lyrics_df['liked'] = convert_rating(music_df)

In [12]:
# Class balance of the labelled rows (unlabelled rows are excluded by default).
lyrics_df['liked'].value_counts()

1.0    5117
0.0    3574
Name: liked, dtype: int64

In [13]:
# Replace the index carried over from the concatenated per-file frames with a
# fresh 0..n-1 range; drop=True discards the old index rather than keeping it
# as a column.
lyrics_df = lyrics_df.reset_index(drop = True)

In [14]:
# Remove the 'order' column, which is not used by anything that follows.
lyrics_df = lyrics_df.drop(columns = ['order'])

In [15]:
# lyrics_df['liked'] = music_df['My Rating']

In [16]:
# Work on an independent copy so lyrics_df stays unchanged by later steps.
final_df = lyrics_df.copy()

In [17]:
# Preview the first rows of the working frame.
final_df.head()

Unnamed: 0,lyrics,song_title,artist_name,liked
0,,Bunny,Tourist,
1,,1990,PINES,
2,,Scarlett Groove (feat. Saint Saviour),Maribou State,
3,\n\n[Verse 1]\nI can't remember\nThe words wer...,If You Want It,Jay Som,
4,\n\n[Verse 1]\nI'm not that kind of fool\nWho ...,Superbike,Jay Som,


In [18]:
# Preview songs whose lyrics are missing.
final_df[final_df['lyrics'].isna()].head()

Unnamed: 0,lyrics,song_title,artist_name,liked
0,,Bunny,Tourist,
1,,1990,PINES,
2,,Scarlett Groove (feat. Saint Saviour),Maribou State,
18,,Miss Americana,Taylor Swift,0.0
19,,Paper Hearts,Taylor Swift,


In [19]:
# Keep only songs that actually have lyrics. The explicit .copy() makes the
# result an independent frame, so adding the 'cleaned_lyrics' column to it
# later does not trigger pandas' SettingWithCopyWarning.
final_df = final_df[final_df['lyrics'].notnull()].copy()

In [20]:
# stopword_count = 0
# total_word_count = 0

# for word in nlp(final_df['lyrics'].iloc[0]):
#     if word.is_stop:
#         stopword_count += 1
#         total_word_count += 1
#     else:
#         total_word_count += 1
        
# print(stopword_count, total_word_count)

In [21]:
# final_df['lyrics'].iloc[0]

# nlped_words = [] # removing all stopwords/lemmatize

# for word in nlp(final_df['lyrics'].iloc[0]):
#     if word.is_stop == False:
#         nlped_words.append(word.lemma_)
#     else:
#         print(word)
        
# ' '.join(nlped_words)

In [22]:
# Preview the frame after rows without lyrics were removed.
final_df.head()

Unnamed: 0,lyrics,song_title,artist_name,liked
3,\n\n[Verse 1]\nI can't remember\nThe words wer...,If You Want It,Jay Som,
4,\n\n[Verse 1]\nI'm not that kind of fool\nWho ...,Superbike,Jay Som,
5,\n\n[Verse 1]\nPoint me to my chair\nMake me s...,Peace Out,Jay Som,
6,\n\n[Verse 1]\nUsed to be the one to cry\nAnd ...,Devotion,Jay Som,
7,\n\n[Verse 1]\nI'm sinking in my bed\nWe’re le...,Nighttime Drive,Jay Som,


In [23]:
def clean_string(lyric):
    """Normalise one raw lyric string into space-separated lemmas.

    Pipeline:
      1. remove bracketed section labels (e.g. "[Verse 1]") and parentheses;
      2. collapse every whitespace run into a single space;
      3. lowercase, and drop digits and all punctuation except the apostrophe;
      4. tokenise with the module-level spaCy pipeline ``nlp``, drop stop
         words and keep each remaining token's lemma;
      5. keep only tokens that are in the module-level ``words`` vocabulary
         or are not purely alphabetic;
      6. drop characters the module-level ``ad`` detector does not consider
         Latin.

    Parameters
    ----------
    lyric : str or None
        Raw lyric text.

    Returns
    -------
    str or None
        The cleaned text without leading/trailing whitespace. ``None`` when
        the input is ``None`` or not a string, or when nothing is left after
        cleaning. If the NLP stage raises, the sentinel string ``'fuck'`` is
        returned instead of propagating the error; ``clean_list`` in this
        notebook compares against that exact value, so it must not change.
    """
    # Non-string input (None, NaN, ...) cannot be cleaned.
    if not isinstance(lyric, str):
        return None

    # Non-greedy bracket match: two labels on the same line must not swallow
    # the words between them.
    lyric = re.sub(r'\[.*?\]|\(|\)', '', lyric)
    lyric = re.sub(r'\s+', ' ', lyric)

    # All of string.punctuation except the apostrophe (kept for contractions),
    # plus digits. Built once instead of once per character.
    unwanted = set(string.punctuation.replace("'", '') + string.digits)
    lyric = ''.join(ch.lower() for ch in lyric if ch not in unwanted)

    # Strip surrounding whitespace rather than blindly slicing off the first
    # character, which would eat a real letter when there is no leading space.
    lyric = lyric.strip()
    if not lyric:
        return None

    try:
        # Lemmatise and drop stop words.
        lemmas = [token.lemma_ for token in nlp(lyric) if not token.is_stop]
        lyric = ' '.join(lemmas)
        # Drop alphabetic tokens that are not known words.
        lyric = ' '.join(
            w for w in nltk.wordpunct_tokenize(lyric)
            if w.lower() in words or not w.isalpha()
        )
        # Character-level filter: remove anything that is not Latin script.
        lyric = ''.join(ch for ch in lyric if ad.is_latin(ch))
    except Exception:
        # Best-effort: flag the failure for the caller instead of crashing.
        return 'fuck'

    lyric = lyric.strip()
    return lyric or None

In [24]:
def clean_list(list_of_lyrics):
    """Clean every lyric with ``clean_string`` and return the results in order.

    Parameters
    ----------
    list_of_lyrics : iterable of (str or None)
        Raw lyric texts.

    Returns
    -------
    list
        One entry per input lyric, in input order, so the result can be
        assigned straight back as a DataFrame column. Lyrics for which
        ``clean_string`` returned its failure sentinel get ``None`` in their
        slot (instead of being dropped, which would shift every later entry
        onto the wrong row).

    Side effects
    ------------
    Prints a progress line every 10000 lyrics and, at the end, the list of
    positions for which cleaning failed.
    """
    error_indices = []
    cleaned_list = []

    for i, lyric in enumerate(list_of_lyrics):
        # clean_string runs a full NLP pass, so call it exactly once per lyric.
        cleaned = clean_string(lyric)
        if cleaned == 'fuck':
            error_indices.append(i)
            cleaned_list.append(None)  # placeholder keeps positions aligned
        else:
            cleaned_list.append(cleaned)

        if (i + 1) % 10000 == 0:
            print(f'{i+1} done...')

    print(error_indices)

    return cleaned_list

In [25]:
# Row/column counts after removing songs without lyrics.
final_df.shape

(45838, 4)

In [26]:
# Class balance among labelled songs that have lyrics.
final_df['liked'].value_counts()

1.0    2139
0.0    1844
Name: liked, dtype: int64

In [27]:
# Run the cleaning pipeline over every lyric. The returned list is assigned
# positionally, so it must contain exactly one entry per row of final_df.
final_df['cleaned_lyrics'] = clean_list(final_df['lyrics'])

10000 done...
20000 done...
30000 done...
40000 done...
[]


In [28]:
# final_df = final_df.drop(columns = ['lyrics'])

In [29]:
# Drop songs for which cleaning produced no text (clean_string returns None
# in that case).
final_df = final_df[final_df['cleaned_lyrics'].notnull()]

In [30]:
# Final label distribution; dropna=False also counts the unlabelled rows.
final_df['liked'].value_counts(dropna = False)

NaN    38071
0.0     1656
1.0     1639
Name: liked, dtype: int64

In [61]:
# Persist the cleaned frame; index=False leaves the DataFrame index out of
# the CSV.
final_df.to_csv('cleaned_lemmatized_unstopped_df.csv', index = False)