# Music Lyrics Through the Years

#### Import (possibly) necessary libraries

In [1]:
import pandas as pd; pd.options.display.max_rows = 200
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

import re
from unicodedata import normalize

import requests as rq
from bs4 import BeautifulSoup as bs
from selenium import webdriver

import pickle
import gzip

from functions.webscraping import *

import json

%load_ext autoreload
%autoreload 2

## Scrape song title and artist

In [2]:
%%time

# Scrape one chart page per year, 1960 through 2019; every call contributes
# one entry (that year's songs) to `all_top10s`.
chart_urls = (
    f'https://top40weekly.com/{chart_year}-all-charts/' for chart_year in range(1960, 2020)
)
all_top10s = list(map(yearly_top10s, chart_urls))

# sanity check: expect one entry per year, i.e. 60
len(all_top10s)

CPU times: user 14.7 s, sys: 195 ms, total: 14.9 s
Wall time: 1min 51s


60

In [3]:
# total songs across all years, counted before cross-year duplicates are removed
sum(map(len, all_top10s))

5293

In [4]:
# Save the scraped charts to a gzip-compressed pickle so the slow scrape above
# does not have to be repeated. (Comment this out once the file exists.)
with gzip.open('data/all_top10s.pkl', 'wb') as goodbye:
    pickle.dump(all_top10s, goodbye, protocol=pickle.HIGHEST_PROTOCOL)
    
# Load the charts back from that file. Both steps are active as written, so
# running this cell writes the file and immediately reads it back.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created.
with gzip.open('data/all_top10s.pkl', 'rb') as hello:
    all_top10s = pickle.load(hello)

- Since a song may appear in the top 10 in two different years, I keep only its first appearance and drop the repeats.

In [5]:
# Flatten the per-year lists into one list of song records. The position of a
# yearly list gives its year (position 0 is 1960); each song is read as
# song[0] -> title and song[1] -> artist.
top10s = [
    {'year': chart_year, 'title': entry[0], 'artist': entry[1]}
    for chart_year, songs_in_year in enumerate(all_top10s, start=1960)
    for entry in songs_in_year
]

In [6]:
# Keep only the first row for each (title, artist) pair, then turn the
# de-duplicated frame back into a list of one dict per song.
temp_df = pd.DataFrame(top10s).drop_duplicates(subset=['title', 'artist'])
unique_top10s = temp_df.to_dict(orient='records')

In [7]:
# number of songs left after dropping repeated (title, artist) pairs
len(unique_top10s)

4735

## Scrape lyrics

In [8]:
# Read the personal API credentials from the local JSON file.
with open('data/api_keys.json', 'r') as json_file:
    creds = json.load(json_file)

# Unpack the three credential fields into their own variables.
client_id, client_secret, access_token = (
    creds[field] for field in ('client_id', 'client_secret', 'access_token')
)

In [9]:
%%time

# Look up lyrics for every unique song. `top10s` is rebuilt here as the list
# of song dicts with a 'lyrics' key added; the dicts in `unique_top10s` are
# updated in place and those same objects are appended.
top10s = []
print('Failed grabs:')

for song in unique_top10s:

    # first attempt: full title plus full artist credit
    search_term = f"{song['title']} {song['artist']}"

    try:
        song['lyrics'] = lyrics_grabber(access_token, search_term)

    # `except Exception` rather than a bare `except`, so that an interrupt
    # (KeyboardInterrupt) can still stop this long-running loop
    except Exception:
        try:
            # second attempt: only the leading run of capitals, commas, periods
            # and spaces from the title, plus the first two words of the artist.
            # re.match returns None when the title starts with any other
            # character; the resulting AttributeError is handled just below.
            search_term = f"{re.match('[A-Z,. ]+', song['title']).group()} {' '.join(song['artist'].split()[:2])}"
            song['lyrics'] = lyrics_grabber(access_token, search_term)

        except Exception:
            # both attempts failed: report the last search term tried and
            # record the lyrics as missing
            print(search_term)
            song['lyrics'] = np.nan

    top10s.append(song)

len(top10s)

Failed grabs:
(Down At) PAPA JOE’S The Dixiebelles
AN OPEN LETTER TO MY TEENAGE SON Victor Lundberg
/ 14 DOWN ON THE CORNER / FORTUNATE SON Creedence Clearwater Revival
Timeout raised and caught:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
MIDNIGHT COWBOY Ferrante and
JUNGLE FEVER The Chakachas
YOU AIN Bachman-Turner Overdrive
JUJU ON DAT BEAT  Zayion McCall
CPU times: user 5min 53s, sys: 15.4 s, total: 6min 9s
Wall time: 1h 57min 57s


4735

In [10]:
# Build the working DataFrame from the song dicts (one row per song) and
# show its (rows, columns) shape.
df = pd.DataFrame(top10s)
df.shape

(4735, 4)

In [11]:
# preview the first rows to spot-check the scraped lyrics
df.head()

Unnamed: 0,year,title,artist,lyrics
0,1960,THE VILLAGE OF ST. BERNADETTE,Andy Williams,"I've traveled far, the land and the sea\nBeaut..."
1,1960,PAPER ROSES,Anita Bryant,I realize the way your eyes deceived me\nWith ...
2,1960,IN MY LITTLE CORNER OF THE WORLD,Anita Bryant,"Oh, come along with me\nTo my little corner of..."
3,1960,O DIO MIO,Annette,"[Strofa 1]\nE un giorno lui mi disse: ""Nei tuo..."
4,1960,WONDERLAND BY NIGHT,Bert Kaempfert,Stars hang suspended\nAbove a floating yellow ...


In [12]:
# preview the last rows to spot-check the scraped lyrics
df.tail()

Unnamed: 0,year,title,artist,lyrics
4730,2019,YOU NEED TO CALM DOWN,Taylor Swift,[Verse 1]\nYou are somebody that I don't know\...
4731,2019,ME!,Taylor Swift featuring Brendon Urie,"JANUARY""Giant"" by Calvin Harris featuring Rag ..."
4732,2019,HEARTLESS,The Weeknd,"[Intro]\nYoung Metro, young Metro, young Metro..."
4733,2019,DANCE MONKEY,Tones and I,"[Verse 1]\nThey say, ""Oh my God, I see the way..."
4734,2019,HIGHEST IN THE ROOM,Travis Scott,[Chorus]\nI got room in my fumes (Yeah)\nShe f...


### Remove brackets and words contained within
- Bracketed text tends to label song sections (verse, chorus, or similar) or to contain other non-lyric words.

In [13]:
# Run every lyrics value through remove_brackets and store the result back.
# NOTE(review): rows whose lookup failed hold NaN, not a string; confirm that
# remove_brackets tolerates non-string input.
df['lyrics'] = df.lyrics.map(remove_brackets)

### Save/Load

In [14]:
# # uncomment to save
# with gzip.open('data/lyrics_df.pkl', 'wb') as goodbye:
#     pickle.dump(df, goodbye, protocol=pickle.HIGHEST_PROTOCOL)
    
# # uncomment to load
# with gzip.open('data/lyrics_df.pkl', 'rb') as hello:
#     df = pickle.load(hello)

## Rescrapes

### 1960

In [50]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (32, "because they're young duane eddy and the rebels"),
    (51, 'jackie wilson night'),
    (68, 'mister custer'),
    (75, 'the big hurt toni fisher'),
    (85, 'Only the Lonely (Know the Way I Feel)'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, search_term=term)


# more complex ones: take the first child of every non-empty <span> inside the
# 'main-panel-content' div, skip the given number of leading spans, and join
# the rest with newlines
for row_label, page_url, n_skipped in (
    (69, 'https://www.flashlyrics.com/lyrics/lolita/sailor-your-home-is-the-sea-41', 2),
    (98, 'https://www.flashlyrics.com/lyrics/the-ivy-three/yogi-79', 0),
):
    soup = soupify(page_url)
    panel = soup.find('div', attrs={'class': 'main-panel-content'})
    span_lines = [span.contents[0] for span in panel.find_all('span') if span.contents]
    df.loc[row_label, 'lyrics'] = '\n'.join(span_lines[n_skipped:])

In [52]:
# remove lines: pass row 45's lyrics through remove_n_lines with 11 and store
# the result back (presumably this trims 11 non-lyric lines; see the helper)
df.loc[45, 'lyrics'] = remove_n_lines(df.loc[45, 'lyrics'], 11)

In [53]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (38, 39, 81, 102):
    df.loc[row_label, 'lyrics'] = np.nan

### 1961

In [80]:
# rescrape: retry with hand-picked search terms, or scrape a named page element
df.loc[106, 'lyrics'] = lyrics_grabber(access_token, 'arthur lyman yellow bird')
df.loc[136, 'lyrics'] = rescrape(
    'https://www.lyricsondemand.com/d/dickanddeedeelyrics/themountainshighlyrics.html',
    name='div',
    attrs={'class': 'lcontent'},
)
df.loc[174, 'lyrics'] = lyrics_grabber(access_token, "ral donner you don't know what you've got")

# more complex ones: join the first child of every non-empty <span> inside the
# 'main-panel-content' div, one per line
for row_label, page_url in (
    (105, 'https://www.flashlyrics.com/lyrics/adam-wade/as-if-i-didnt-know-55'),
    (144, 'https://www.flashlyrics.com/lyrics/ferrante-and-teicher/tonight-12'),
):
    soup = soupify(page_url)
    panel = soup.find('div', attrs={'class': 'main-panel-content'})
    df.loc[row_label, 'lyrics'] = '\n'.join(
        span.contents[0] for span in panel.find_all('span') if span.contents
    )

In [81]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (110, 145, 161, 185, 198):
    df.loc[row_label, 'lyrics'] = np.nan

In [82]:
# remove lines: each pair is (row label, number passed to remove_n_lines)
for row_label, n_lines in ((105, 2), (174, 5), (195, 10), (208, 3)):
    df.loc[row_label, 'lyrics'] = remove_n_lines(df.loc[row_label, 'lyrics'], n_lines)

In [83]:
# update song: overwrite the title stored for row 177
df.loc[177, 'title'] = 'Corrine, Corrina'

In [85]:
# remove duplicate: drop these rows from df in place, one label at a time
for duplicate_label in (178, 182):
    df.drop(index=duplicate_label, inplace=True)

### 1962

In [100]:
# rescrape: retry the lyrics lookup for row 272 with a hand-picked search term
df.loc[272, 'lyrics'] = lyrics_grabber(access_token, 'joey dee shout')

In [101]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (214, 215, 221, 241, 242, 276, 282, 315, 316):
    df.loc[row_label, 'lyrics'] = np.nan

In [102]:
# remove lines: each entry is (row label, extra arguments for remove_n_lines);
# some rows pass one number, others two
for row_label, extra_args in (
    (226, (7, 3)),
    (256, (2,)),
    (273, (5, 1)),
    (318, (2,)),
):
    df.loc[row_label, 'lyrics'] = remove_n_lines(df.loc[row_label, 'lyrics'], *extra_args)

### 1963

In [110]:
# rescrape: retry the lyrics lookup for row 360 with a hand-picked search term
df.loc[360, 'lyrics'] = lyrics_grabber(access_token, 'fingertips stevie')

# more complex ones: collect the first child of every non-empty <span> inside
# the 'main-panel-content' div, drop the first one, and join the rest by newline
soup = soupify('https://www.flashlyrics.com/lyrics/dixie-belles/down-at-papa-joes-70')
panel = soup.find('div', attrs={'class': 'main-panel-content'})
span_lines = [span.contents[0] for span in panel.find_all('span') if span.contents]
df.loc[403, 'lyrics'] = '\n'.join(span_lines[1:])

In [111]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (324, 353, 361, 362, 397, 419):
    df.loc[row_label, 'lyrics'] = np.nan

In [112]:
# remove lines: each pair is (row label, number passed to remove_n_lines)
for row_label, n_lines in ((327, 3), (344, 2), (351, 2)):
    df.loc[row_label, 'lyrics'] = remove_n_lines(df.loc[row_label, 'lyrics'], n_lines)

### 1964

In [117]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (442, 'summer song chad'),
    (473, 'dancing in the street martha'),
    (482, 'roy orbison pretty woman'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

# row 518 is scraped from a songlyrics.com page element instead
df.loc[518, 'lyrics'] = rescrape(
    'http://www.songlyrics.com/the-larks/jerk-lyrics/',
    name='p',
    attrs={'class': 'songLyricsV14 iComment-text'},
)

In [118]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (431, 519, 534):
    df.loc[row_label, 'lyrics'] = np.nan

In [119]:
# remove lines: both rows are passed through remove_n_lines with 6
for row_label in (513, 521):
    df.loc[row_label, 'lyrics'] = remove_n_lines(df.loc[row_label, 'lyrics'], 6)

In [120]:
# update artist name: overwrite the artist credit stored for row 442
df.loc[442, 'artist'] = 'Chad and Jeremy'

In [121]:
# remove duplicate: drop row 467 from df in place
# (re-running this cell raises KeyError once the label is gone)
df.drop(index=467, inplace=True)

### 1965

In [123]:
# Checkpoint: write the corrected-so-far frame to a gzip-compressed pickle.
# This save is active as written (it is not commented out).
with gzip.open('data/lyrics_df_updated.pkl', 'wb') as goodbye:
    pickle.dump(df, goodbye, protocol=pickle.HIGHEST_PROTOCOL)
    
# # uncomment to load
# with gzip.open('data/lyrics_df_updated.pkl', 'rb') as hello:
#     df = pickle.load(hello)

In [122]:
# show the 1965 rows so their lyrics can be reviewed by eye
df[df.year == 1965]

Unnamed: 0,year,title,artist,lyrics
536,1965,"YES, I’M READY",Barbara Mason,(Barbara Mason)\n (Are you ready?)\n \n (Are y...
537,1965,EVE OF DESTRUCTION,Barry McGuire,The eastern world it is explodin'\nViolence fl...
538,1965,DOWN IN THE BOONDOCKS,Billy Joe Royal,"Down in the boondocks, down in the boondocks\n..."
539,1965,LIKE A ROLLING STONE,Bob Dylan,Once upon a time you dressed so fine\nThrew th...
540,1965,POSITIVELY 4TH STREET,Bob Dylan,You got a lot of nerve\nTo say you are my frie...
541,1965,I WILL,Dean Martin,I don't want to be the one to say I'm gonna mi...
542,1965,KEEP SEARCHIN’ (We’ll Follow the Sun),Del Shannon,No one will understand what I've gotta do\nI'v...
543,1965,MAKE THE WORLD GO AWAY,Eddy Arnold,Make the world go away\nGet it off my shoulder...
544,1965,CRYING IN THE CHAPEL,Elvis Presley,You saw me crying in the chapel\nThe tears I s...
545,1965,RESCUE ME,Fontella Bass,Rescue me\nOr take me in your arms\nRescue me\...


In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (558, 'nowhere to run martha'),
    (582, "Papa’s Got a Brand New Bag (Part 1)"),
    (583, "I Can’t Help Myself (Sugar Pie, Honey Bunch)"),
    (597, 'i got you babe'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (578, 585):
    df.loc[row_label, 'lyrics'] = np.nan

In [None]:
# remove duplicate: drop row 587 from df in place
# (re-running this cell raises KeyError once the label is gone)
df.drop(index=587, inplace=True)

In [116]:
# print the full lyrics text stored at index label 534 for manual inspection
print(df.loc[534, 'lyrics'])

"SHE STOOPS TO CONQUER"




by Oliver Goldsmith





She Stoops To Conquer; Or, The Mistakes Of A Night.

A Comedy.





To Samuel Johnson, LL.D.

Dear Sir,—By inscribing this slight performance to you, I do not mean so much to compliment you as myself. It may do me some honour to inform the public, that I have lived many years in intimacy with you. It may serve the interests of mankind also to inform them, that the greatest wit may be found in a character, without impairing the most unaffected piety.

I have, particularly, reason to thank you for your partiality to this performance. The undertaking a comedy not merely sentimental was very dangerous; and Mr. Colman, who saw this piece in its various stages, always thought it so. However, I ventured to trust it to the public; and, though it was necessarily delayed till late in the season, I have every reason to be grateful.

I am, dear Sir, your most sincere friend and admirer,

OLIVER GOLDSMITH.








Contents


PROLOGUE,

DRAMATIS P

### 1966

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
# (each row is fetched once; a repeated, identical request for row 665 was dropped)
df.loc[650, 'lyrics'] = lyrics_grabber(access_token, 'california dreamin')
df.loc[665, 'lyrics'] = lyrics_grabber(access_token, 'lady godiva peter')
df.loc[702, 'lyrics'] = lyrics_grabber(access_token, 'kicks paul revere')

In [None]:
# remove duplicate: drop row 740 from df in place
# (re-running this cell raises KeyError once the label is gone)
df.drop(index=740, inplace=True)

### 1967

In [None]:
# rescrape: fill row 845 from a songlyrics.com page, selecting the <p> element
# with the given class via the rescrape helper
df.loc[845, 'lyrics'] = rescrape('http://www.songlyrics.com/victor-lundberg/an-open-letter-to-my-teenage-son-lyrics/',
                                 name='p', attrs={'class': 'songLyricsV14 iComment-text'})

### 1968

In [None]:
# rescrape: retry the lyrics lookup for row 900 with a hand-picked search term
df.loc[900, 'lyrics'] = lyrics_grabber(access_token, 'mrs. robinson')

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (904, 909, 927, 937, 945):
    df.loc[row_label, 'lyrics'] = np.nan

### 1969

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (986, 'i want you back the jackson 5'),
    (1008, 'boxer simon garfunkel'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (1012, 1015, 1022):
    df.loc[row_label, 'lyrics'] = np.nan

In [None]:
# update artist: overwrite the stored artist credit, one (row, name) pair at a time
for row_label, artist_name in ((986, 'The Jackson 5'), (989, 'Blue Swede')):
    df.loc[row_label, 'artist'] = artist_name

### 1970

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (1068, 'bridge over troubled water simon garfunkel'),
    (1085, 'abc the jackson 5'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# update artist: both rows get the same corrected artist credit
for row_label in (1085, 1092):
    df.loc[row_label, 'artist'] = 'The Jackson 5'

### 1971

In [None]:
# instrumentals: mark the lyrics of row 1182 as missing (NaN)
df.loc[1182, 'lyrics'] = np.nan

### 1972

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
# NOTE(review): row 1182 is also blanked in the 1971 section; the repeat is
# harmless but confirm which year the row belongs to.
df.loc[1182, 'lyrics'] = np.nan
df.loc[1253, 'lyrics'] = np.nan
df.loc[1292, 'lyrics'] = np.nan

# edge case; non-lyric song (treated like an instrumental: lyrics set to NaN)
df.loc[1252, 'lyrics'] = np.nan

### 1973

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (1321, 'neither one of us gladys knight'),
    (1324, 'The Morning After (Theme From The Poseidon Adventure)'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of row 1385 as missing (NaN)
df.loc[1385, 'lyrics'] = np.nan

# edge case; non-lyric song (treated like an instrumental: lyrics set to NaN)
df.loc[1398, 'lyrics'] = np.nan

In [None]:
# remove lines: pass row 1386's lyrics through remove_n_lines with 2.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[1386, 'lyrics'] = remove_n_lines(df.loc[1386, 'lyrics'], 2)

### 1974

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (1449, 'jet wings'),
    (1453, 'band on the run wings'),
    (1488, 'jungle boogie'),
    (1514, 'hollywood swinging'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (1440, 1476, 1479):
    df.loc[row_label, 'lyrics'] = np.nan

In [None]:
# remove duplicate: drop row 1481 from df in place
# (re-running this cell raises KeyError once the label is gone)
df.drop(index=1481, inplace=True)

### 1975

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (1577, 'love rollercoaster ohio players'),
    (1587, 'yesterday carpenters'),
    (1596, 'philadelphia freedom elton john'),
    (1622, 'fire ohio players'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of row 1594 as missing (NaN)
df.loc[1594, 'lyrics'] = np.nan

### 1976

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (1625, 'sara smile hall oates'),
    (1652, 'more more more andrea true'),
    (1664, 'sing a song earth'),
    (1667, 'muskrat love'),
    (1690, 'disco duck rick dees'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (1639, 1681):
    df.loc[row_label, 'lyrics'] = np.nan

### 1977

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (1736, 'jet airliner steve miller'),
    (1775, 'eagle steve miller'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of row 1784 as missing (NaN)
df.loc[1784, 'lyrics'] = np.nan

In [None]:
# remove lines: pass row 1720's lyrics through remove_n_lines with 2.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[1720, 'lyrics'] = remove_n_lines(df.loc[1720, 'lyrics'], 2)

### 1978

In [None]:
# rescrape: retry the lyrics lookup for row 1844 with a hand-picked search term
df.loc[1844, 'lyrics'] = lyrics_grabber(access_token, 'ymca village people')

In [None]:
# instrumentals: mark the lyrics of row 1827 as missing (NaN)
df.loc[1827, 'lyrics'] = np.nan

### 1979

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (1929, 1932):
    df.loc[row_label, 'lyrics'] = np.nan

### 1980

In [None]:
# rescrape: retry the lyrics lookup for row 1961 with a hand-picked search term
df.loc[1961, 'lyrics'] = lyrics_grabber(access_token, 'another brick in the wall')

In [None]:
# remove lines: pass each row's lyrics through remove_n_lines with 3.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[1967, 'lyrics'] = remove_n_lines(df.loc[1967, 'lyrics'], 3)
df.loc[2013, 'lyrics'] = remove_n_lines(df.loc[2013, 'lyrics'], 3)

### 1981

In [None]:
# remove lines: pass row 2027's lyrics through remove_n_lines with 6.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[2027, 'lyrics'] = remove_n_lines(df.loc[2027, 'lyrics'], 6)

In [None]:
# instrumentals: mark the lyrics of row 2038 as missing (NaN)
df.loc[2038, 'lyrics'] = np.nan

### 1982

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (2117, 'rock this town stray cats'),
    (2130, 'crimson and clover'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (2153, 2160):
    df.loc[row_label, 'lyrics'] = np.nan

### 1983

In [None]:
# rescrape: retry the lyrics lookup for row 2237 with a hand-picked search term
df.loc[2237, 'lyrics'] = lyrics_grabber(access_token, 'sweet dreams eurythmics')

In [None]:
# remove lines: pass row 2179's lyrics through remove_n_lines with 8.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[2179, 'lyrics'] = remove_n_lines(df.loc[2179, 'lyrics'], 8)

In [None]:
# remove duplicate: drop row 2205 from df in place
# (re-running this cell raises KeyError once the label is gone)
df.drop(index=2205, inplace=True)

### 1984

In [None]:
# rescrape: retry the lyrics lookup for row 2266 with a hand-picked search term
# (fetched once; a repeated, identical request was dropped)
df.loc[2266, 'lyrics'] = lyrics_grabber(access_token, 'state of shock jacksons')

### 1985

In [None]:
# rescrape: retry the lyrics lookup for row 2309 with a hand-picked search term
df.loc[2309, 'lyrics'] = lyrics_grabber(access_token, 'the warrior scandal')

In [None]:
# instrumentals: mark the lyrics of these rows as missing (NaN)
for row_label in (2358, 2405):
    df.loc[row_label, 'lyrics'] = np.nan

In [None]:
# remove lines: pass row 2385's lyrics through remove_n_lines with 1.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[2385, 'lyrics'] = remove_n_lines(df.loc[2385, 'lyrics'], 1)

### 1986

In [None]:
# rescrape: retry the lyrics lookup for row 2501 with a hand-picked search term
df.loc[2501, 'lyrics'] = lyrics_grabber(access_token, 'your wildest dreams')

### 1987

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (2557, 'the one i love rem'),
    (2663, 'got the look prince'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# instrumentals: mark the lyrics of row 2646 as missing (NaN)
df.loc[2646, 'lyrics'] = np.nan

In [None]:
# update artist: overwrite the artist credit stored for row 2557
df.loc[2557, 'artist'] = 'R.E.M.'

### 1988

In [None]:
# rescrape: retry the lyrics lookup for row 2678 with a hand-picked search term
df.loc[2678, 'lyrics'] = lyrics_grabber(access_token, "she's like the wind patrick swayze")

### 1989

In [None]:
# rescrape: retry the lyrics lookup for row 2862 with a hand-picked search term
df.loc[2862, 'lyrics'] = lyrics_grabber(access_token, 'friends jody watley')

### 1990

In [None]:
# remove lines: pass row 2917's lyrics through remove_n_lines with 3.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[2917, 'lyrics'] = remove_n_lines(df.loc[2917, 'lyrics'], 3)

### 1991 - 1995

In [None]:
# rescrape: retry the lyrics lookup for row 3280 with a hand-picked search term
df.loc[3280, 'lyrics'] = lyrics_grabber(access_token, "gangsta's paradise coolio")

### 1996

In [None]:
# rescrape: retry the lyrics lookup for row 3371 with a hand-picked search term
df.loc[3371, 'lyrics'] = lyrics_grabber(access_token, 'nobody keith sweat')

In [None]:
# remove lines: pass row 3324's lyrics through remove_n_lines with 9.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[3324, 'lyrics'] = remove_n_lines(df.loc[3324, 'lyrics'], 9)

In [None]:
# instrumentals: mark the lyrics of row 3365 as missing (NaN)
df.loc[3365, 'lyrics'] = np.nan

In [None]:
# remove duplicate: drop row 3331 from df in place
# (re-running this cell raises KeyError once the label is gone)
df.drop(index=3331, inplace=True)

### 1997

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (3374, 'mmmbop'),
    (3384, 'feel so good ma$e'),
    (3395, 'gotham city r. kelly'),
    (3411, 'CAN’T NOBODY HOLD ME DOWN'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

### 1998-1999

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (3493, 'lost in you garth'),
    (3512, 'smooth santana'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

### 2000

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (3541, "destiny's child independent"),
    (3578, 'maria santana'),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# remove lines: pass row 3543's lyrics through remove_n_lines with 3.
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
df.loc[3543, 'lyrics'] = remove_n_lines(df.loc[3543, 'lyrics'], 3)

In [None]:
# remove duplicate: drop row 3579 from df in place
# (re-running this cell raises KeyError once the label is gone)
df.drop(index=3579, inplace=True)

### 2001

In [None]:
# rescrape: retry the lyrics lookup with hand-picked search terms
for row_label, term in (
    (3597, "lady marmalade Lil’ Kim, Mýa, Christina Aguilera & P!nk"),
    (3616, 'always on time ja rule'),
    (3626, "i'm real jennifer lopez"),
):
    df.loc[row_label, 'lyrics'] = lyrics_grabber(access_token, term)

In [None]:
# update artist: overwrite the artist credit stored for row 3597
df.loc[3597, 'artist'] = 'Lil’ Kim, Mýa, Christina Aguilera & P!nk'

### 2002

In [None]:
# rescrape: retry the lyrics lookup for row 3659 with a hand-picked search term
df.loc[3659, 'lyrics'] = lyrics_grabber(access_token, 'down 4 u irv gotti')

In [None]:
# show the 2002 rows so their lyrics can be reviewed by eye
df[df.year == 2002]

In [None]:
# print the full lyrics text stored at index label 3659 for manual inspection
print(df.loc[3659, 'lyrics'])

### 1998

In [None]:
# scratch: same lookup for row 1324 as in the 1973 rescrape cell
df.loc[1324, 'lyrics'] = lyrics_grabber(access_token, 'The Morning After (Theme From The Poseidon Adventure)')

In [None]:
# scratch: preview row 1386's lyrics after remove_n_lines with 2 (not stored).
# NOTE(review): arguments are given as (lyrics, n), the order used by the
# executed remove_n_lines cells for 1960-1964; confirm against the helper.
remove_n_lines(df.loc[1386, 'lyrics'], 2)

In [None]:
# show the rows whose artist is exactly 'The Larks'
df[df.artist == 'The Larks']

In [None]:
# scratch: preview the rescrape of The Larks' page (result is displayed, not stored).
# Uses the keyword form (name=..., attrs=...) of the other rescrape calls, so
# the cell no longer depends on a `soup` object created in a later cell.
rescrape('http://www.songlyrics.com/the-larks/jerk-lyrics/',
         name='p', attrs={'class': 'songLyricsV14 iComment-text'})

In [None]:
# scratch: fetch the songlyrics.com page via soupify and keep the result in
# `soup` for the inspection cells that follow
soup = soupify('http://www.songlyrics.com/the-larks/jerk-lyrics/')

In [None]:
# scratch: show the child nodes of the <p> element that carries this class
soup.find('p', attrs={'class': 'songLyricsV14 iComment-text'}).contents

In [None]:
# scratch: join the non-blank string children of the 'inner' div, stripped of
# surrounding whitespace, one per line
inner_children = soup.find('div', attrs={'class': 'inner'}).contents
'\n'.join(
    child.strip() for child in inner_children
    if isinstance(child, str) and child.strip()
)

In [None]:
# scratch: show the child nodes of the <div> with class 'inner'
soup.find('div', attrs={'class': 'inner'}).contents

In [None]:
# Deliberate stop (presumably): raising here halts "Run All" at this point, so
# the cells after it only run when executed by hand.
raise RuntimeError('please break this code')

In [None]:
# print the full lyrics value stored at index label 81 for manual inspection
print(df.loc[81, 'lyrics'])

In [None]:
# Collect the text found inside square brackets in row 1's lyrics.
# The pattern is a raw string so that `\[` and `\]` reach the regex engine as
# escaped brackets rather than being invalid Python string escapes.
regex = re.compile(r".*?\[(.*?)\]")
result = regex.findall(df.loc[1, 'lyrics'])
result

In [None]:
# scratch: apply remove_brackets to every lyrics value again and preview the
# first 100 results
df['lyrics'] = df.lyrics.map(remove_brackets)
df.lyrics.head(100)

In [None]:
# graph number of unique songs per year (line plot; x is the position of each
# year in all_top10s, with position 0 being 1960)
plt.figure(figsize=(10,6))
plt.plot([len(year) for year in all_top10s])
# One tick every five years: positions 0..55 are labelled 1960..2015. Both
# ranges have 12 entries; recent matplotlib raises ValueError when the number
# of labels differs from the number of tick positions.
plt.xticks(ticks=range(0,60,5), labels=range(1960,2020,5))
plt.xlabel('Year')
plt.ylabel('Number of unique top-10 songs')
plt.show()

In [None]:
# graph number of unique songs per year (bar plot, one bar per year)
plt.figure(figsize=(10,6))
sns.barplot(x=[year for year in range(1960,2020)], y=[len(year) for year in all_top10s])
# One tick every five bars: positions 0..55 are labelled 1960..2015. Both
# ranges have 12 entries; recent matplotlib raises ValueError when the number
# of labels differs from the number of tick positions.
plt.xticks(ticks=range(0,60,5), labels=range(1960,2020,5))
plt.xlabel('Year')
plt.ylabel('Number of unique top-10 songs')
plt.show()