# Creating the OHCO for our Corpus of Song Lyrics

In [4]:
# Import libraries
import pandas as pd
import os
# Working directory that holds songdata.csv and receives the CSV outputs.
# Set the DS5559_PROJECT_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
os.chdir(os.environ.get(
    'DS5559_PROJECT_DIR',
    '/Users/nickbruno/Documents/spring_2019/DS5559/project/code',
))

In [5]:
# Load the raw lyrics corpus; the path is relative to the working directory set above.
data = pd.read_csv('songdata.csv')

In [6]:
# List the column names of the raw corpus.
data.columns

Index(['artist', 'song', 'link', 'text'], dtype='object')

In [7]:
# Distinct artist names, kept in order of first appearance in the corpus.
artists = data['artist'].drop_duplicates().tolist()

In [8]:
# Number of distinct artists in the corpus (643 in the recorded run).
len(artists)

643

In [9]:
# One row per artist; the names land in a single column labelled 0.
artists_df = pd.DataFrame(artists)

In [10]:
# Preview the artist table (it can be useful for creating a database later).
artists_df.head()

Unnamed: 0,0
0,ABBA
1,Ace Of Base
2,Adam Sandler
3,Adele
4,Aerosmith


In [11]:
# Create a unique sequential id (0..n-1) for each artist, as the first column.
# DataFrame.insert mutates the frame in place and raises ValueError when the
# column already exists, so the guard keeps this cell safe to re-run.
if 'artist_id' not in artists_df.columns:
    artists_df.insert(0, 'artist_id', range(len(artists_df)))

In [12]:
# Preview: artist_id is now the first column, ahead of the name column.
artists_df.head()

Unnamed: 0,artist_id,0
0,0,ABBA
1,1,Ace Of Base
2,2,Adam Sandler
3,3,Adele
4,4,Aerosmith


In [13]:
# Give the name column (labelled 0 by the DataFrame constructor) a real name.
artists_df = artists_df.rename({0: 'artist'}, axis='columns')

In [14]:
# Attach artist_id to every song row; 'artist' is the only column the two
# frames share, so it is named explicitly as the join key.
new_df = data.merge(artists_df, on='artist')

In [15]:
# The link column is not needed for the OHCO, so discard it.
new_df = new_df.drop(columns='link')

In [16]:
# Preview the merged corpus: link is gone and artist_id has been added.
new_df.head()

Unnamed: 0,artist,song,text,artist_id
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd...",0
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl...",0
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...,0
3,ABBA,Bang,Making somebody happy is a question of give an...,0
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,0


### Remove songs whose titles are duplicated
#### Every title that appears more than once is dropped entirely (all copies), so each remaining title maps to exactly one song_id

In [17]:
# Rows whose title has already appeared earlier in the corpus
# (duplicated() flags the second and later occurrences of each title).
repeated_title = new_df['song'].duplicated()
all_dup = new_df.loc[repeated_title]

In [18]:
# Each duplicated title once, as a plain Python list.
all_dup_list = all_dup['song'].drop_duplicates().tolist()

In [19]:
# Title column of the full corpus; it is filtered against the duplicate list
# in the next step (nothing is removed here).
songs = new_df['song']

In [20]:
# Create the list of song titles that are not duplicated in the corpus.
# Membership is tested against a set: probing the list form of all_dup_list for
# every row is a linear scan per song, i.e. quadratic over the whole corpus.
duplicate_titles = set(all_dup_list)
no_dup_song_list = [x for x in songs if x not in duplicate_titles]

In [21]:
# Restrict the corpus to rows whose title is not duplicated anywhere.
has_unique_title = new_df['song'].isin(no_dup_song_list)
newer_df = new_df.loc[has_unique_title]

In [22]:
# Confirm the columns are unchanged by the de-duplication filter.
newer_df.columns

Index(['artist', 'song', 'text', 'artist_id'], dtype='object')

In [23]:
# Preview the de-duplicated corpus; the original row labels are kept, so gaps
# in the index mark the rows that were removed.
newer_df.head()

Unnamed: 0,artist,song,text,artist_id
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd...",0
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl...",0
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...,0
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,0
5,ABBA,Burning My Bridges,"Well, you hoot and you holler and you make me ...",0


In [24]:
# Number of distinct song titles left after dropping duplicated titles
# (38690 in the recorded run). A lot of songs were removed, which should make
# it easier to build an OHCO that includes every word.
len(newer_df.song.unique())

38690

In [25]:
# Title column of the de-duplicated corpus; used to build the song id table.
songs = newer_df['song']

In [26]:
# One-column frame of titles; the column keeps the Series name 'song'.
songs_df = songs.to_frame()

In [27]:
# Create a unique sequential id (0..n-1) for each remaining song, as the first column.
# DataFrame.insert mutates the frame in place and raises ValueError when the
# column already exists, so the guard keeps this cell safe to re-run.
if 'song_id' not in songs_df.columns:
    songs_df.insert(0, 'song_id', range(len(songs_df)))

In [28]:
# NOTE(review): songs_df was built from the Series named 'song', so it presumably
# has no column labelled 0 and this rename leaves the frame unchanged — confirm
# and consider deleting the cell.
songs_df = songs_df.rename(columns={0: 'song'})

In [29]:
# Attach song_id to every song row; 'song' is the only column the two frames
# share, so it is named explicitly as the join key.
final_df = newer_df.merge(songs_df, on='song')

In [30]:
# Preview the cleaner dataframe: every row now carries both artist_id and song_id.
final_df.head()

Unnamed: 0,artist,song,text,artist_id,song_id
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd...",0,0
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl...",0,1
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...,0,2
3,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,0,3
4,ABBA,Burning My Bridges,"Well, you hoot and you holler and you make me ...",0,4


In [None]:
## Moving on ##

In [33]:
# Set the index to (artist_id, song_id) to create an OHCO
final_df = final_df.set_index(['artist_id', 'song_id'])

In [34]:
# The ids in the index now identify each row, so the name columns can go.
final_df = final_df.drop(columns=['artist', 'song'])

In [36]:
# Call the remaining text column what it actually holds.
final_df = final_df.rename({'text': 'lyrics'}, axis='columns')

In [37]:
# Preview the new song-level OHCO: one lyrics string per (artist_id, song_id).
final_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lyrics
artist_id,song_id,Unnamed: 2_level_1
0,0,"Look at her face, it's a wonderful face \nAnd..."
0,1,"Take it easy with me, please \nTouch me gentl..."
0,2,I'll never know why I had to go \nWhy I had t...
0,3,Making somebody happy is a question of give an...
0,4,"Well, you hoot and you holler and you make me ..."


In [38]:
# Persist the song-level OHCO table (a good starting point, e.g. for a MALLET
# analysis). to_csv writes the index by default, so artist_id and song_id are
# stored as the leading columns.
final_df.to_csv('artist_song_OHCO_df.csv')

In [44]:
# Split each song's lyrics into verses and reshape to one row per verse.
# The separator is the blank-line pattern used by the raw text ('  \n  \n').
verses = (
    final_df['lyrics']
    .str.split('  \n  \n', expand=True)
    .stack()
    .to_frame(name='Verse')
)

In [58]:
# Label the three index levels: artist, song, and verse position within the song.
verses.index = verses.index.set_names(['artist_id', 'song_id', 'verse_num'])

In [59]:
# Preview the verse-level OHCO: one row per (artist_id, song_id, verse_num).
verses.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Verse
artist_id,song_id,verse_num,Unnamed: 3_level_1
0,0,0,"Look at her face, it's a wonderful face \nAnd..."
0,0,1,"She's just my kind of girl, she makes me feel ..."
0,0,2,And when we go for a walk in the park \nAnd s...
0,0,3,"She's just my kind of girl, she makes me feel ..."
0,1,0,"Take it easy with me, please \nTouch me gentl..."


In [60]:
# Write the verse-level OHCO table out to a .csv in the working directory.
verses.to_csv('artist_song_verse_OHCO_df.csv')

In [53]:
# Split each verse into lines and reshape to one row per line.
# '  \n' (two spaces + newline) represents a line split in the raw lyrics.
# The chain is wrapped in parentheses rather than continued with backslashes:
# a comment after a trailing backslash is a SyntaxError.
lines = (
    verses.Verse.str.split('  \n', expand=True)  # one column per line of the verse
    .stack()                                     # wide -> long, adds a new innermost index level
    .to_frame()
    .rename(columns={0: 'Line'})
)

In [61]:
# Label the four index levels: artist, song, verse, and line position within the verse.
lines.index = lines.index.set_names(['artist_id', 'song_id', 'verse_num', 'line_num'])

In [62]:
# Preview the line-level OHCO: one row per (artist_id, song_id, verse_num, line_num).
lines.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Line
artist_id,song_id,verse_num,line_num,Unnamed: 4_level_1
0,0,0,0,"Look at her face, it's a wonderful face"
0,0,0,1,And it means something special to me
0,0,0,2,Look at the way that she smiles when she sees me
0,0,0,3,How lucky can one fellow be?
0,0,1,0,"She's just my kind of girl, she makes me feel ..."


In [63]:
# Write the line-level OHCO table out to a .csv in the working directory.
lines.to_csv('artist_song_verse_line_OHCO_df.csv')

### Building the final token-level OHCO (bag of words) for the full corpus is problematic because the corpus is so large
#### So we first tried it on a smaller subset of the corpus

In [85]:
# Create a subset of the line-level table to test how long tokenizing takes.
# Both bounds are inclusive, so this keeps song_id 0 through 10000
# (10,001 ids, not exactly ten thousand).
# Author's timing note: takes 7.5 minutes (presumably for tokenizing this subset).
ten_thousand_songs = lines.query('song_id >= 0 and song_id <= 10000')

In [86]:
# Tokenize every line of the subset and reshape to one row per token.
# The capturing group in the pattern makes the split keep the non-word
# separators (spaces, punctuation) as tokens of their own.
TOKEN_PAT = r'(\W+)'
tokens = (
    ten_thousand_songs['Line']
    .str.split(TOKEN_PAT, expand=True)
    .stack()
    .to_frame(name='token_str')
)

In [88]:
# Inspect the tokens of the last song in the subset (song_id 10000).
tokens.query('song_id == 10000')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
artist_id,song_id,verse_num,line_num,Unnamed: 4_level_1,Unnamed: 5_level_1
143,10000,0,0,0,Confusion
143,10000,0,1,0,Desillution
143,10000,0,2,0,No
143,10000,0,2,1,
143,10000,0,2,2,Time
143,10000,0,3,0,No
143,10000,0,3,1,
143,10000,0,3,2,time
143,10000,0,3,3,
143,10000,0,3,4,for


In [None]:
# Apply the tokenization to the whole line-level dataframe.
# Splitting all lines at once with expand=True builds one wide table with a
# column for every token position of the longest line in the corpus; the
# original single-shot version did not run because it is so large. Here the
# same split-and-stack is done one artist at a time and the long-format pieces
# are concatenated, so only one artist's wide table exists at any moment.
# NOTE(review): not re-timed on the full corpus — confirm it fits in memory.
TOKEN_PAT = r'(\W+)'  # capturing group: separators are kept as tokens too
tokens = pd.concat(
    [
        artist_lines.Line.str.split(TOKEN_PAT, expand=True).stack()
        # sort=False keeps the artists in their existing order
        for _, artist_lines in lines.groupby(level='artist_id', sort=False)
    ]
).to_frame().rename(columns={0: 'token_str'})

In [None]:
# Name the five index levels of the token table: the four inherited from
# `lines` plus the token position within the line that .stack() adds.
# `tokens` is the frame the tokenizing cells build; no `words` frame exists.
tokens.index.names = ['artist_id', 'song_id', 'verse_num', 'line_num', 'token_num']