## Goal: scrape various sites to build a DataFrame of songs about cities in the United States

In [56]:
from requests import get
import pandas as pd
from bs4 import BeautifulSoup
import re

### A) Wikipedia (https://en.wikipedia.org/wiki/List_of_songs_about_cities#United_States)

In [57]:
# Download the Wikipedia "List of songs about cities" article and parse it.
response = get(
    "https://en.wikipedia.org/wiki/List_of_songs_about_cities#United_States",
    timeout=30,  # give up instead of hanging forever if the server never answers
)
# Stop on an HTTP error rather than parsing an error page as if it were the article.
response.raise_for_status()
html_soup = BeautifulSoup(response.text, 'html.parser')
# Every <h3> element on the page, in document order; later cells index into this list.
h3s = html_soup.find_all("h3")

In [58]:
# Positions in `h3s` of the first heading to collect and one past the last.
# They are hard-coded against the page as it was when this notebook was run
# (presumably they bracket the United States section -- verify) and will drift
# whenever the article is edited, so re-check them before re-running.
FIRST_H3, END_H3 = 348, 497
if len(h3s) < END_H3:
    # Same exception type a bad positional lookup would raise, with a clearer message.
    raise IndexError(
        f"expected at least {END_H3} <h3> headings but found {len(h3s)}; "
        "the hard-coded range no longer matches the page"
    )

places = []
songs = []
for heading in h3s[FIRST_H3:END_H3]:
    # Heading text becomes the place; the text of the first <ul> that follows
    # it in the document becomes that place's (newline-separated) song list.
    places.append(heading.text)
    # find_next is the current name of the legacy findNext alias.
    songs.append(heading.find_next("ul").text)

In [59]:
# One row per collected heading: the heading text next to the text of its list.
data = pd.DataFrame(dict(places=places, songs=songs))

In [60]:
# Split each heading on the first comma only, so a heading with an extra comma
# cannot produce a third column and break the two-column assignment below.
place_parts = data["places"].str.split(",", n=1, expand=True)
# Trim both halves: the part after the comma otherwise keeps the space that
# follows it, which leaves every state value with a leading blank.
data[["city", "state"]] = place_parts.apply(lambda part: part.str.strip())

In [61]:
# Reshape from one row per place to one row per song line.
# Key the song text by (city, state) so both survive the reshape.
songs_by_place = data.set_index(["city", "state"])["songs"]
# One column per line of the list text; shorter lists are padded with missing values.
song_columns = songs_by_place.str.split("\n", expand=True)
# stack() turns the columns back into rows and drops the padding.
song_rows = song_columns.stack().reset_index().rename(columns={0: 'songs'})
data = song_rows[["city", "state", "songs"]]

In [62]:
# Keep only the text before the first "[" in each state value
# (presumably removing a trailing "[edit]" link label -- verify against the page).
state_pieces = data["state"].str.split("[", expand=True)
data["state"] = state_pieces[0]

In [63]:
# Spot-check the first song entry.
data.loc[0, "songs"]

'"I-95" by Fountains of Wayne'

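On the Wikipedia page, some cities have so many songs that they have their own separate wiki pages.
The scraper does not capture these, so we need to collect them manually. Presumably such a city's heading is followed by a later city's list, so its rows show up below as duplicated songs.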

In [64]:
# Flag every row whose song text appears again further down the frame
# (keep="last" leaves only the final occurrence unflagged).
repeated_later = data.duplicated(["songs"], keep="last")
# Show three example places; the fixed seed makes the sample identical on every run.
data.loc[repeated_later, ["city", "state"]].sample(3, random_state=0)

Unnamed: 0,city,state
112,Cleveland,Ohio
224,Jackson,Mississippi
387,New York City,New York


Let's drop these for now and perhaps scrape them separately later.

In [65]:
# Keep only the last occurrence of each song text and renumber the rows.
# NOTE(review): this matches on song text alone, so an identical entry listed
# under two different places would also be reduced to one row -- confirm that is intended.
seen_again_later = data.duplicated("songs", keep="last")
data_cleaned = data[~seen_again_later].reset_index(drop=True)

In [66]:
# Number of remaining song rows per state, most frequent first.
songs_per_state = data_cleaned["state"].value_counts()
songs_per_state

 Texas             86
 New Mexico        54
 Ohio              33
 Oklahoma          33
 Tennessee         33
 Pennsylvania      32
 Missouri          27
 California        25
 Minnesota         24
 Maryland          24
 Nevada            24
 Arizona           15
 Florida           14
 Virginia          12
 D.C.              11
 Kansas            10
 Colorado          10
 Wyoming            9
 Michigan           9
 Alabama            8
 Nebraska           8
 Georgia            7
 South Carolina     7
 New Jersey         6
 Illinois           6
 North Carolina     5
 Arkansas           5
 Indiana            5
 Kentucky           4
 Idaho              4
 Washington         4
 Iowa               4
 Mississippi        4
 Alaska             3
 Wisconsin          3
 Louisiana          3
 New York           3
 Maine              3
 Utah               2
 Oregon             2
 Hawaii             2
Name: state, dtype: int64

In [67]:
# Save the de-duplicated table next to the notebook.
# The row index is written too, as the first (unnamed) column of the file.
output_path = "songs.csv"
data_cleaned.to_csv(output_path)