In [470]:
import requests
import lxml.html as lx
import re
import pickle
import folium
import json
import pandas as pd
from scipy.stats import chi2_contingency

In [471]:
with open("C:/Users/Brandon Xia/Downloads/top_50_cities_provinces.pkl", 'rb') as file:
    top_50_cities_provinces = pickle.load(file)
topcities = []
for city in top_50_cities_provinces:
    if city['city_name'] == 'The Bronx': #remove identical city
        continue
    if city['city_name'] == 'Washington D.C.': #special case with stateless city
        topcities.append('Washington District of Columbia')
        continue
    citystate = city['city_name'] + ' ' + city['state'] #correct format for city_data.com
    topcities.append(citystate)

In [472]:
url = 'https://city-data.com/city/'
cityraces = {}
for city in topcities:
    cityurl = re.sub(' ', '-', city)
    response = requests.get(url+cityurl+'.html', headers = {'Accept': 'text/html'})
    html = lx.fromstring(response.text)
    racedic = {}
    races = html.xpath('//*[@id="races-graph"]/div/ul/li[2]/ul//li[@class="list-group-item"]')
    for race in races:
        percentage = race.xpath('.//span[@class="badge alert-info"]/text()')[0].strip('%') #grab percentage without '%'
        name = race.xpath('.//b/text()')[0].strip() #name of the racial category
        racedic[name] = percentage
    cityraces[city] = racedic #add dictionary of races to preexisting dictionary

In [473]:
allraces = set()
for city in topcities:
    for key in cityraces[city].keys():
        allraces.add(key)

allraces #catalog every race category

{'American Indian alone',
 'Asian alone',
 'Black alone',
 'Hispanic',
 'Native Hawaiian and Other',
 'Other race alone',
 'Two or more races',
 'White alone'}

In [474]:
stateraces = {}
response = requests.get('https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_race/ethnicity', headers = {'Accept': 'text/html'})
html = lx.fromstring(response.text)
states = html.xpath('//*[@id="mw-content-text"]/div[1]/table/tbody//tr[not(@style)]')
states = states[1:51] #remove empty header

In [475]:
with open("C:/Users/Brandon Xia/Downloads/genre_state.pkl", 'rb') as file:
    stategenre = pickle.load(file)
genres = stategenre #create copy of the original which will not be altered

In [476]:
for i, statesection in enumerate(states): #add dictionary of races to each state
    racedic = {}
    state = statesection.xpath('./td[1]/b/a/text()')[0]
    racedic['White alone'] = statesection.xpath('./td[4]/text()')[0].strip()
    racedic['Black alone'] = statesection.xpath('./td[6]/text()')[0].strip()
    racedic['Hispanic'] = statesection.xpath('./td[18]/text()')[0].strip()
    racedic['Asian alone'] = statesection.xpath('./td[10]/text()')[0].strip()
    racedic['Two or more races'] = statesection.xpath('./td[16]/text()')[0].strip()
    racedic['Native Hawaiian and Other'] = statesection.xpath('./td[12]/text()')[0].strip()
    racedic['American Indian alone'] = statesection.xpath('./td[8]/text()')[0].strip()
    racedic['Other race alone'] = statesection.xpath('./td[14]/text()')[0].strip()
    stateraces[state] = racedic
    stategenre[i]['races'] = racedic
    stategenre[i]['Population'] = statesection.xpath('./td[2]/text()')[0].strip()

In [477]:
allgenres = set()
for state in genres:
    allgenres.add(state['genre'])
allgenres = list(allgenres)
allgenres #find every genre that will need to be plotted

['Classic Rock',
 'Metal',
 'Electronic',
 'Pop',
 'Country',
 'Indie',
 'Rhythm and Blue',
 'Latin',
 'Alternative']

In [478]:
colors = ['red', 'grey', 'green', 'pink', 'orange', 'purple', 'blue', 'yellow', 'black']
gendic = dict(zip(allgenres, colors))
gendic #set each genre to a map color

{'Classic Rock': 'red',
 'Metal': 'grey',
 'Electronic': 'green',
 'Pop': 'pink',
 'Country': 'orange',
 'Indie': 'purple',
 'Rhythm and Blue': 'blue',
 'Latin': 'yellow',
 'Alternative': 'black'}

In [479]:
def getgencolor(state): #function to get the color to be used in the folium lambda function
    for i in stategenre:
        if i['state'] == state:
            return gendic[i['genre']]
        else:
            continue

In [480]:
def getgen(state): #function to get the top genre in each state
    for i in stategenre:
        if i['state'] == state:
            return i['genre']
        else:
            continue

In [481]:
def getrace(state): #function to get the races of each state to be used in the folium tooltip
    for i in stategenre:
        if i['state'] == state:
            return re.sub(r"[{}']", "", str(i['races'])).split(',')
        else:
            continue

In [482]:
boundaries = "C:/Users/Brandon Xia/Downloads/gz_2010_us_040_00_5m.json" #GeoJson boundary file
with open(boundaries, 'r') as f:
    boundaries = json.load(f)

In [483]:
def generate_tt_contentrace(feature):#create tooltip info for a given state
    state_name = feature['properties']['NAME']
    srace = getrace(state_name)
    if state_name == 'District of Columbia' or state_name == 'Puerto Rico':
        return 'Territory: ' + state_name
    sracehtml = '<br>'.join(srace)
    return f"""
    <b>State:</b> {state_name}<br>
    <b>Racial Demographics:</b><br>{sracehtml}<br>
    """#html text handles line breaks and text bolding

In [484]:
with open("C:/Users/Brandon Xia/Downloads/top_50_cities_location (1).pkl", 'rb') as file:
    toplocations = pickle.load(file) #load city coordinates

In [485]:
with open("C:/Users/Brandon Xia/Downloads/transformed_city_top10artist_data.txt", 'r') as file:
    artists = json.load(file) #load top 10 artists per city

In [486]:
def getArtists(city): #extract the top artists from the nested dictionaries, format, and number them
    for index in artists:
        if city == index['city_name']:
            return [('<b>' + str(i+1) + ': </b>' + artist['name']) for i, artist in enumerate(index['top_artists'])]

In [487]:
m = folium.Map(location = [40, -95],
               zoom_start = 4)
folium.TileLayer(
    tiles='https://server.arcgisonline.com/ArcGIS/rest/services/World_Terrain_Base/MapServer/tile/{z}/{y}/{x}',
    attr='Esri',
    name='Esri Satellite', overlay=True, control=False
).add_to(m)

<folium.raster_layers.TileLayer at 0x26620012f10>

In [488]:
legend_html = """
<div style="
    position: fixed; 
    bottom: 50px; 
    left: 50px; 
    width: 200px; 
    height: auto; 
    background-color: white; 
    border:2px solid grey; 
    z-index:9999; 
    font-size:14px;
    padding: 10px;
">
    <b>Legend</b><br>
    <i style="background: blue; width: 18px; height: 18px; display: inline-block;"></i> Rhythm & Blue<br>
    <i style="background: red; width: 18px; height: 18px; display: inline-block;"></i> Classic Rock<br>
    <i style="background: green; width: 18px; height: 18px; display: inline-block;"></i> Electronic<br>
    <i style="background: black; width: 18px; height: 18px; display: inline-block;"></i> Alternative<br>
    <i style="background: yellow; width: 18px; height: 18px; display: inline-block;"></i> Latin<br>
    <i style="background: orange; width: 18px; height: 18px; display: inline-block;"></i> Country<br>
    <i style="background: pink; width: 18px; height: 18px; display: inline-block;"></i> Pop<br>
    <i style="background: grey; width: 18px; height: 18px; display: inline-block;"></i> Metal<br>
    <i style="background: purple; width: 18px; height: 18px; display: inline-block;"></i> Indie<br>
</div>
""" #legend for state genre colors
folium.GeoJson(
    boundaries,
    name='Genres',
    style_function=lambda feature: {
        'fillColor': getgencolor(feature['properties']['NAME']),
        'color': 'black',
        'weight': 2,
        'fillOpacity': 0.85,
    },
).add_to(m)

for feature in boundaries['features']:
    state_name = feature['properties']['NAME']
    tooltip_content = generate_tt_contentrace(feature)

    # Add dynamic tooltips
    folium.GeoJson(
        feature,
        style_function=lambda x: {
            'color': None,
            'fillOpacity': 0.1,
        },
        tooltip=folium.Tooltip(tooltip_content),
    ).add_to(m)
legend = folium.Element(legend_html)
m.get_root().html.add_child(legend) #add legend to the map

<branca.element.Element at 0x26622965820>

In [489]:
for city in toplocations: #cities layered above the states
    artistlist = getArtists(city['city_name'])
    artistshtml = '<br>'.join(artistlist)
    popupcity = f"<b>City: {city['city_name']}</b><br><br><b>Top Artists:</b><br>{artistshtml}" #clickable popup for each city, html formatted
    folium.CircleMarker( #create circular markers for each city
        location=[city["latitude"], city["longitude"]],
        radius = 4,
        color = 'black',
        fill = True,
        fill_color = 'white',
        fill_opacity = 1,
        popup=folium.Popup(popupcity, max_width=300),
    ).add_to(m)

In [490]:
m.save("CityStateMap.html")

In [491]:
statelist = []  #format state data in a way that fits the chi squared test for independence
whites = []
blacks = []
hispanics = []
asians = []
nativeams = []
pacificislanders = []
others = []
twoormore = []
genre = []
pop = []
for state in stategenre: #turn string percentages into floats
    statelist.append(state['state'])
    whites.append(float(state['races']['White alone'].strip('%')))
    blacks.append(float(state['races']['Black alone'].strip('%')))
    hispanics.append(float(state['races']['Hispanic'].strip('%')))
    asians.append(float(state['races']['Asian alone'].strip('%')))
    nativeams.append(float(state['races']['American Indian alone'].strip('%')))
    pacificislanders.append(float(state['races']['Native Hawaiian and Other'].strip('%')))
    others.append(float(state['races']['Other race alone'].strip('%')))
    twoormore.append(float(state['races']['Two or more races'].strip('%')))
    genre.append(state['genre'])
    pop.append(float(state['Population'].replace(',','')))
data = {'State': statelist, 'Population': pop, 'White alone': whites, 'Black alone': blacks, 'Hispanic': hispanics, 'Asian alone': asians, 'American Indian alone': nativeams,
        'Native Hawaiian and Other': pacificislanders, 'Other race alone': others, 'Two or more races': twoormore, 'genre': genre}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,State,Population,White alone,Black alone,Hispanic,Asian alone,American Indian alone,Native Hawaiian and Other,Other race alone,Two or more races,genre
0,Alabama,5024279.0,63.12,25.64,5.26,1.51,0.46,0.05,0.29,3.67,Country
1,Alaska,733391.0,57.51,2.83,6.79,5.92,14.84,1.7,0.62,9.78,Country
2,Arizona,7151502.0,53.37,4.43,30.65,3.48,3.69,0.2,0.44,3.73,Metal
3,Arkansas,3011524.0,68.52,14.94,8.53,1.7,0.68,0.47,0.27,4.89,Country
4,California,39538223.0,34.69,5.36,39.4,15.12,0.39,0.35,0.57,4.12,Pop


In [492]:
racecols = ['White alone', 'Black alone', 'Hispanic', 'Asian alone', 'American Indian alone', 'Native Hawaiian and Other', 'Other race alone', 'Two or more races']	#turn df into a contingency table
for race in racecols:
    df[f'{race}_count'] = df['Population'] * (df[race] / 100)
df_long = df.melt(id_vars=['State', 'genre'], 
                  value_vars=racecols, 
                  var_name='Race', 
                  value_name='Count')
contingency_table = df_long.groupby(['Race', 'genre'])['Count'].sum().unstack(fill_value=0)

In [493]:
chi2, p, dof, expected = chi2_contingency(contingency_table) #chi squared test for independence

print("Contingency Table:")
print(contingency_table)
print(f"\nChi-squared: {chi2}")
print(f"p-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("\nExpected Frequencies:")
print(expected)

Contingency Table:
genre                      Alternative  Classic Rock  Country  Electronic  \
Race                                                                        
American Indian alone            16.60          3.53    20.33        0.75   
Asian alone                      11.08         48.41    35.47        8.57   
Black alone                      20.50        226.53   102.50        9.40   
Hispanic                         94.20        121.02    97.51       28.68   
Native Hawaiian and Other         0.66          0.60     3.78        0.74   
Other race alone                  2.40          5.48     4.65        0.55   
Two or more races                21.59         44.95    53.80        5.38   
White alone                     332.98        749.45   881.96       45.93   

genre                       Indie  Latin   Metal    Pop  Rhythm and Blue  
Race                                                                      
American Indian alone        7.23   0.29   25.70   0.39     