# Data on maps

This part of the project is focused on presenting collected data on maps. Main goal is to show differences between countries in respect to artists' gender, type(group/solo) and music genres, and how have those factors changed in time.

## Data preparation

Load and combine data from existing files - use main dataset and dataset with geolocalization information. 

In [1]:
import folium


In [2]:
import pandas as pd
import ast
import pickle

In [3]:
countries_dict = pickle.load(open("geonames/country_codes.pkl", 'rb'))
def translate_code(code):
    return countries_dict.get(code)

In [4]:
#dataframe with songs features for each artist
#need to change format of genres to list from string
artist_songs = pd.read_csv("data/artist_songs.csv")
artist_songs['gender'] = pd.to_numeric(artist_songs['gender'], errors='coerce')

In [5]:
artist_songs.head()

Unnamed: 0,_id,followers,mb_id,popularity,reduced_genres,gender,type,begin_date_year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,3XSyTI9ct70ZheMESAv2st,3752.0,290e5513-55d3-426f-a4ba-1ac3ce589107,61.0,['broadway'],2.0,Person,1980.0,0.5459,0.4401,6.4,-9.4249,0.8,0.1402,0.7565,3.18e-07,0.1588,0.6575,125.3684,153180.0
1,2jYHSJBXjusgmYdrNeaRmg,1361.0,09cddf26-fe4a-493e-bb1d-64723892ed8d,29.0,['jazz'],2.0,Person,1960.0,0.6975,0.5541,5.8,-11.2578,0.4,0.04146,0.311739,0.71967,0.09388,0.6947,103.1691,324042.8
2,5LmehwqsJa7a4Ya5SaqXpx,1147.0,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,37.0,['doo-wop'],0.0,Group,1956.0,0.5071,0.55695,6.1,-10.1384,1.0,0.06511,0.7269,0.09358214,0.20124,0.8238,139.8446,141725.6
3,10BFTSAfLauhKVmdby4zac,0.0,3a8ecc10-f888-48c0-9674-5c0ccd1fe93f,0.0,[],0.0,Group,1965.0,,,,,,,,,,,,
4,4hwEAtCJZa1LMgbuRpUWJB,4.0,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,0.0,[],0.0,Group,1956.0,,,,,,,,,,,,


In [6]:
#remove columns from dataframe that are not useful for current task
artist_songs = artist_songs[['_id', 'mb_id', 'reduced_genres', 'gender', 'type','begin_date_year' ]]

In [7]:
# load geolocalization data
localization_data = pd.read_csv("data/artist_localization.csv")
localization_data['mb_id'] = localization_data['_id']
localization_data = localization_data.drop(columns=['_id'])

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
data = artist_songs.merge(localization_data, how='left', left_on='mb_id', right_on='mb_id')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 739298 entries, 0 to 739297
Data columns (total 11 columns):
_id                739297 non-null object
mb_id              739298 non-null object
reduced_genres     738885 non-null object
gender             546464 non-null float64
type               546741 non-null object
begin_date_year    182520 non-null float64
country            270766 non-null object
latitude           274206 non-null float64
longtitude         274206 non-null float64
name               14089 non-null object
place              14087 non-null object
dtypes: float64(4), object(7)
memory usage: 67.7+ MB


In [10]:
data.loc[data['reduced_genres'].isna(),'reduced_genres'] = "[]"

In [11]:
data['genres'] = data['reduced_genres'].apply(ast.literal_eval)

In [12]:
data['country'] = data['country'].apply(translate_code)

In [13]:
data.head()

Unnamed: 0,_id,mb_id,reduced_genres,gender,type,begin_date_year,country,latitude,longtitude,name,place,genres
0,3XSyTI9ct70ZheMESAv2st,290e5513-55d3-426f-a4ba-1ac3ce589107,['broadway'],2.0,Person,1980.0,USA,38.0,-97.0,,,[broadway]
1,2jYHSJBXjusgmYdrNeaRmg,09cddf26-fe4a-493e-bb1d-64723892ed8d,['jazz'],2.0,Person,1960.0,USA,38.0,-97.0,,,[jazz]
2,5LmehwqsJa7a4Ya5SaqXpx,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,['doo-wop'],0.0,Group,1956.0,USA,38.0,-97.0,,,[doo-wop]
3,10BFTSAfLauhKVmdby4zac,3a8ecc10-f888-48c0-9674-5c0ccd1fe93f,[],0.0,Group,1965.0,,,,,,[]
4,4hwEAtCJZa1LMgbuRpUWJB,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,[],0.0,Group,1956.0,USA,38.0,-97.0,,,[]


## Distribution of artists around the world

In [14]:
import os
import folium
import json

print(folium.__version__)

0.5.0


In [15]:
countries = os.path.join('geonames', 'countries-land-10km.geo.json')
geo_json_data = json.load(open(countries))

In [16]:
art_by_country = data[['_id','country']].groupby('country').agg(['count'])

In [17]:
art_by_country.info()

<class 'pandas.core.frame.DataFrame'>
Index: 215 entries, ABW to ZWE
Data columns (total 1 columns):
(_id, count)    215 non-null int64
dtypes: int64(1)
memory usage: 3.4+ KB


In [18]:
art_by_country.columns = art_by_country.columns.droplevel()
art_by_country.head()

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
ABW,5
AFG,76
AGO,96
AIA,3
ALB,54


In [19]:
art_by_country.reset_index(inplace=True)
art_by_country.head()

Unnamed: 0,country,count
0,ABW,5
1,AFG,76
2,AGO,96
3,AIA,3
4,ALB,54


In [35]:
map1 = folium.Map([30, 0],zoom_start=2)

map1.choropleth(geo_json_data,
               data=art_by_country,
                name="Artists by country",
                columns=['country', 'count'],
                key_on='properties.A3',
                legend_name="Number of artists",
                threshold_scale = [1, 5000,10000,20000,30000, 40000],
                fill_color='GnBu',
                fill_opacity=0.7,
                highlight=True,
                line_opacity=0.2,)

map1.save("figures/map1.html")
map1

## Male to Female artist ratio

In [36]:
data_gender = data[['_id', 'country', 'gender']]
data_gender = data_gender[data_gender['gender'] !=0]
data_gender.dropna(inplace=True)

In [37]:
gender_counts = data_gender.groupby(['country', 'gender']).agg(['count'])
gender_counts.columns = gender_counts.columns.droplevel()

In [38]:
gender_counts.reset_index(inplace=True)

In [39]:
countries = set(gender_counts.country.tolist())

In [40]:
countries_dict = {}
for country in countries:
    temp = gender_counts[gender_counts['country']==country]
    try:
        ratio = float(temp[temp['gender']==2]['count'].values)/float(temp[temp['gender']==1]['count'].values)
        countries_dict[country] = ratio
    except:
        print(temp[temp['gender']==2]['count'].values)
        if temp[temp['gender']==2]['count'].values == None:
            countries_dict[country] = 0
        else:
            countries_dict[country] = 15
        print(temp)

[6]
    country  gender  count
146     GUY     2.0      6
[11]
    country  gender  count
132     GMB     2.0     11
[]
    country  gender  count
206     LSO     1.0      3
[1]
    country  gender  count
145     GUM     2.0      1
[7]
    country  gender  count
238     MOZ     2.0      7
[1]
    country  gender  count
328     TKM     2.0      1
[5]
  country  gender  count
0     ABW     2.0      5
[1]
    country  gender  count
330     TON     2.0      1
[1]
    country  gender  count
225     MHL     2.0      1
[]
    country  gender  count
360     VUT     1.0      1
[10]
    country  gender  count
246     MWI     2.0     10
[1]
    country  gender  count
124     GGY     2.0      1
[]
   country  gender  count
28     BDI     1.0      1


  if __name__ == '__main__':


[3]
    country  gender  count
190     KNA     2.0      3
[]
    country  gender  count
307     SOM     1.0      1
[1]
    country  gender  count
329     TLS     2.0      1
[3]
    country  gender  count
361     WSM     2.0      3
[1]
    country  gender  count
140     GRL     2.0      1
[9]
   country  gender  count
31     BEN     2.0      9
[2]
    country  gender  count
278     PRK     2.0      2
[]
    country  gender  count
104     ESH     1.0      4
[]
    country  gender  count
301     SLB     1.0      1
[15]
    country  gender  count
251     NER     2.0     15
[2]
    country  gender  count
355     VGB     2.0      2
[1]
   country  gender  count
38     BHR     2.0      1
[]
   country  gender  count
19     ATA     1.0      1
[6]
   country  gender  count
92     DMA     2.0      6
[1]
    country  gender  count
187     KGZ     2.0      1
[2]
    country  gender  count
352     VCT     2.0      2
[2]
    country  gender  count
317     SWZ     2.0      2
[3]
    country  gender  

In [42]:
g_c = pd.DataFrame.from_dict(countries_dict, orient='index')
g_c.reset_index(inplace=True)
g_c.columns = ['country', 'ratio']

In [43]:
map2 = folium.Map([30, 0],zoom_start=2)

map2.choropleth(geo_json_data,
               data=g_c,
                columns=['country', 'ratio'],
                key_on='properties.A3',
                legend_name="Male to Female artist ratio",
                fill_color='GnBu',
                fill_opacity=0.7,
                highlight=True,
                line_opacity=0.2,)

map2.save("figures/map2.html")
map2