# Web sources

This is a sandbox to explore potential web indicator data collections for EIS.

We will:

* Create a summary table
* Collect LinkedIn skills migration data
* Explore options to query Google BigQuery about:
  * GitHub
  * Python downloads
* Carry out a toy scrape of the Study portals website

## Preamble

In [None]:
%run ../notebook_preamble.ipy

from eis.utils.data_processing import *
from datetime import datetime
import seaborn as sn
from ast import literal_eval
import altair as alt
from altair_saver import save

from google.oauth2 import service_account
import pandas_gbq

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old aliases, so use whichever name this
# installation provides (falling back to the original name).
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'seaborn-whitegrid')
plt.style.use(whitegrid_style)
plt.rc('font', size=14)

In [None]:
# Read the local .env file and pick up the config path that the data
# getters need (used for the Nesta / Google BigQuery collections)
from dotenv import load_dotenv

load_dotenv()

cp = os.getenv('config_path')



## Analysis

In [None]:
# Country code lookup published by the European Environment Agency
country_codes_url = 'https://www.eea.europa.eu/data-and-maps/data/waterbase-lakes-4/country-codes-and-abbreviations-32-records/country-codes-and-abbreviations-32-records/at_download/file'
country_codes = pd.read_csv(country_codes_url)

### Table

In [None]:
# Load the indicator inventory; 'TBC' entries are parsed as missing values
inventory_path = f"{project_dir}/data/aux/eis_indicator_inventory.csv"
ind = pd.read_csv(inventory_path, na_values='TBC')

In [None]:
# Export the descriptive columns of the indicators whose method type is 'Web'
web_table_cols = ['category', 'indicator', 'source', 'description']
web_table = ind.loc[ind['method_type'] == 'Web', web_table_cols]
web_table.to_csv(f"{material_outputs}/table_4_web.csv", index=False)

### LinkedIn

In [None]:
# Read the 'Skill Migration' sheet of the public talent migration workbook
talent_migration_url = 'https://development-data-hub-s3-public.s3.amazonaws.com/ddhfiles/144635/public_use-talent-migration.xlsx'
li = pd.read_excel(talent_migration_url, sheet_name='Skill Migration')

In [None]:
# Lower-cased ISO2 codes of the countries in the lookup table
eu_codes = {code.lower() for code in country_codes['ISO2']}
# Keep only the LinkedIn rows for those countries
eu_li = li.loc[li['country_code'].isin(eu_codes)]

In [None]:
# Skill groups treated as AI-related
ai = ['Artificial Intelligence', 'Data Science', 'Natural Language Processing']
eu_ai = eu_li.loc[eu_li['skill_group_name'].isin(ai)]

In [None]:
# Country identifiers plus the yearly net-flow columns (2015 to 2018)
country_id_cols = ['country_name', 'country_code']
rel = country_id_cols + [f'net_per_10K_{yr}' for yr in range(2015, 2019)]

eu_ai_long = eu_ai[rel].melt(id_vars=country_id_cols)

# The year is the last underscore-separated token of the column name
eu_ai_long['year'] = eu_ai_long['variable'].map(lambda name: int(name.split('_')[-1]))

# Sum the AI skill groups per country and year, ordered by the 2018 value
ai_by_year = eu_ai_long.pivot_table(
    index='country_code', columns='year', values='value', aggfunc='sum')
ai_agg = ai_by_year.sort_values(2018, ascending=True)

# Two-year rolling mean over the years; dropna removes every year that
# still contains a missing value (including the first, incomplete window)
smoothed_by_year = ai_agg.T.rolling(window=2).mean().dropna()
ai_agg = smoothed_by_year.T

In [None]:
# Back to long format (one row per country and year) for plotting
ai_agg_flat = ai_agg.reset_index(drop=False)
ai_agg_long = ai_agg_flat.melt(id_vars=['country_code'], var_name='year')

In [None]:
base = alt.Chart(ai_agg_long)

# Countries are ordered by their mean value, largest first
country_order = alt.EncodingSortField('value', 'mean', order='descending')

# Points: one marker per country and year
a = base.mark_point().encode(
    x=alt.X('value', title='net flow of talent per 10K members'),
    y=alt.Y('country_code', axis=alt.Axis(grid=True, gridWidth=3), sort=country_order),
    color='year:N',
    shape='year:N')

# Dashed lines joining the markers of each country
b = base.mark_line(strokeDash=[3,1], strokeWidth=3).encode(
    x='value:Q',
    y=alt.Y('country_code:N', sort=country_order),
    detail=alt.Detail('country_code:N'))

# Vertical reference rule at zero
c = base.mark_rule().transform_calculate(zero='0').encode(x='zero:Q')

d = (a + b + c).properties(height=500)

save(d, f"{material_outputs}_v1/fig_11_linkedin.pdf")

In [None]:
# Display the layered LinkedIn chart in the notebook
d

### Meetup

In [None]:
from data_getters.meetup import select_meetup

In [None]:
# eu_meetup_groups = []

# for x in [x for x in country_codes['ISO2']]:
    
#     print(x)
#     res = select_meetup(cp, 34, x)
    
#     eu_meetup_groups.append(res)

# with open(f"{project_dir}/data/raw/eu_meetup.p",'wb') as outfile:
#     pickle.dump(eu_meetup_groups,outfile)

In [None]:
# Load the previously collected Meetup results from the local pickle.
# NOTE: pickle executes arbitrary code on load; only read files produced by this project.
meetup_pickle_path = f"{project_dir}/data/raw/eu_meetup.p"
with open(meetup_pickle_path, 'rb') as pickle_file:
    eu_meetup_groups = pickle.load(pickle_file)

In [None]:
# Stack the 'core_groups' table of every collected result into one frame
core_group_tables = [result['core_groups'] for result in eu_meetup_groups]
gs = pd.concat(core_group_tables).reset_index(drop=True)

In [None]:
# Total number of groups
# NOTE(review): as a bare expression that is not last in the cell, this value
# is presumably not displayed by the notebook
len(gs)
# Number of groups whose country code is 'AL'
np.sum(gs['country']=='AL')

In [None]:
# Sum of the 'members' column over all groups, expressed in millions
gs.members.sum()/1e6

In [None]:
#Some parsing

#Of years: 'created' is divided by 1000 before conversion, i.e. it is treated
#as an epoch timestamp in milliseconds. The builtin float replaces np.float,
#an alias that was deprecated in NumPy 1.20 and removed in 1.24.
#Note that datetime.fromtimestamp interprets the value in the local timezone.
gs['year'] = [datetime.fromtimestamp(float(x)/1000).year for x in gs['created']]

#Of topics: 'topics' holds string representations of lists of dicts;
#keep the 'urlkey' of every topic as the group's keywords
gs['topic_list'] = [literal_eval(x) for x in gs['topics']]
gs['topic_kws'] = [[x['urlkey'] for x in el] for el in gs['topic_list']]

In [None]:
# Number of distinct topic keywords across all groups
# (flatten_list presumably comes from the eis.utils.data_processing star import)
len(set(flatten_list(gs['topic_kws'])))

In [None]:
#Keyword sets used to tag meetups by emerging technology
ai = {'machine-learning', 'ai', 'deep-learning', 'data-science'}
vr = {'virtual-reality', 'augmented-reality', 'vr'}
crypto = {'cryptocurrency', 'blockchain', 'bitcoin'}

#A group gets flag 1 when at least one of its keywords is in the technology set
for flag_name, tech_set in zip(['has_ai', 'has_vr', 'has_crypto'], [ai, vr, crypto]):
    gs[flag_name] = [int(not tech_set.isdisjoint(kws)) for kws in gs['topic_kws']]

In [None]:
# Cumulative number of groups created per country over the years, in long format
groups_by_country_year = pd.crosstab(gs['country'], gs['year'])
cumulative_groups = groups_by_country_year.cumsum(axis=1).reset_index(drop=False)
meetup_long = cumulative_groups.melt(id_vars='country', value_name='groups')


In [None]:
# Small multiples: one line chart per country, ordered by the maximum number of groups
country_facet = alt.Facet(
    'country',
    columns=8,
    sort=alt.EncodingSortField('groups', 'max', order='descending'))

ch_1 = alt.Chart(meetup_long, width=60, height=70).mark_line().encode(
    x='year:O',
    y='groups',
    facet=country_facet)

ch_1.save(f'{material_outputs}_v1/fig_12_meetup.html')

In [None]:
# fig,ax = plt.subplots(figsize=(12,7),nrows=2,sharex=True)

# country_freqs =  gs['country'].value_counts()
# country_freqs.plot.bar(cmap='Purples_r',ax=ax[0])

# (100*gs.groupby('country')['has_ai'].mean()).loc[country_freqs.index].plot.bar(ax=ax[1],cmap='Purples',
#                                                                         edgecolor='purple')

# ax[0].set_ylabel('Total number of meetups')
# ax[1].set_ylabel('% of meetups in AI')

# plt.tight_layout()

# save_fig('fig_8_meetups.pdf',material_outputs)

In [None]:
tech_flag_cols = ['has_ai', 'has_vr', 'has_crypto']

# Percentage of the groups created each year that carry each technology flag
yearly_pct = 100 * gs.groupby(['year'])[tech_flag_cols].mean()
em_tech_ev = yearly_pct.reset_index(drop=False).melt(
    id_vars='year', var_name='Technology activity')

# Each country's percentage share of all flagged groups, per technology
country_counts = gs.groupby(['country'])[tech_flag_cols].sum()
country_pct = country_counts.apply(lambda col: 100 * col / col.sum())
em_tech_geo = country_pct.reset_index(drop=False).melt(
    id_vars='country', var_name='Technology', value_name='Share')

In [None]:
# Evolution: mean over a window of one year either side, per technology
windowed = alt.Chart(em_tech_ev, width=600).transform_window(
    mean_value='mean(value)', groupby=['Technology activity'], frame=[-1, +1])
ch_3 = windowed.mark_line().encode(
    x=alt.X('year:O'),
    y=alt.Y('mean_value:Q', title='% of activity accounted by year'),
    color='Technology activity')

# Geography: one circle per country and technology, sized and coloured by share
country_by_share = alt.EncodingSortField('Share', 'sum', order='descending')
ch_4 = alt.Chart(em_tech_geo).mark_circle(stroke='black', strokeWidth=1).encode(
    y=alt.Y('Technology:N', sort=['has_ai', 'has_crypto', 'has_vr']),
    x=alt.X('country', sort=country_by_share),
    size=alt.Size('Share', legend=None),
    color='Share').properties(height=100)

ch_5 = alt.vconcat(ch_3, ch_4)

ch_5

save(ch_5, f"{material_outputs}_v1/fig_14_meetup.pdf")

### Google BigQuery

In [None]:
# Service-account credentials passed to the BigQuery reads below
gbq_key_path = f"{project_dir}/gbq_eis_credentials.json"
creds = service_account.Credentials.from_service_account_file(gbq_key_path)

# Google Cloud project the queries run under
project_id = 'eis-2-275207'

In [None]:
#Count GitHub users by year of account creation and country code,
#excluding accounts flagged as fake or deleted
q1 = '''SELECT EXTRACT (YEAR FROM created_at), COUNT(id), country_code
FROM `ghtorrentmysql1906.MySQL1906.users`
WHERE fake = 0 AND deleted = 0
GROUP BY country_code, EXTRACT (YEAR FROM created_at)'''

In [None]:
# Run the GitHub registration query. The project is taken from the shared
# project_id variable rather than repeating the literal in every call.
github_reg = pandas_gbq.read_gbq(q1,
                                 project_id=project_id,
                                 credentials=creds)

In [None]:
# Preview the first rows of the query result
github_reg.head()

In [None]:
# Give the unnamed query columns (f0_, f1_) descriptive names, in place
github_col_names = {
    'f0_': 'year_created',
    'f1_': 'user_count',
    'country_code': 'country_code',
}
github_reg.rename(columns=github_col_names, inplace=True)

In [None]:
# Years as rows, country codes as columns, registrations as values
github_wide = github_reg.pivot_table(index='year_created',columns='country_code',
                                    values='user_count').fillna(0)

# Select the EU columns with a list: pandas 2.x rejects a set as an indexer.
# Restricting to codes present in the query result also avoids a KeyError
# when a country has no registrations.
eu_cols = [c for c in github_wide.columns if c in eu_codes]
top_github_eu = github_wide[eu_cols].sum().sort_values(ascending=False)

# The seven countries with most registrations; all the others are pooled
top_gh_eu_names = top_github_eu.iloc[:7].index
other_eu_cols = [c for c in eu_cols if c not in top_gh_eu_names]

eu_totals = pd.concat([github_wide[top_gh_eu_names],
                       github_wide[other_eu_cols].sum(axis=1).rename('other')],
                      axis=1).cumsum().T

#Need this to order the variables; derived from the number of rows so it
#still works when fewer than seven countries are present
eu_totals['order'] = list(range(len(eu_totals)))
eu_totals_long = eu_totals.reset_index(drop=False).melt(id_vars=['index','order'])

In [None]:
# Stacked area chart of cumulative registrations; the legend follows the
# reversed row order of eu_totals and the stacking follows the 'order' column
legend_order = list(reversed(eu_totals.index))

ch_g = alt.Chart(eu_totals_long).mark_area().encode(
    x='year_created:O',
    y=alt.Y('value', title='Registered members (cumulative)'),
    color=alt.Color('index', sort=legend_order),
    order='order').properties(width=400)

save(ch_g, f'{material_outputs}_v1/fig_15_github.pdf')

ch_g

In [None]:
# Total registrations per EU country; the unnamed Series becomes column 0
# after reset_index, so rename it to a readable label
github_count_df = top_github_eu.reset_index(drop=False).rename(columns={0:'Registered users'})

# Sort the bars by the plotted field. The first positional argument of
# EncodingSortField is the field name, and 'count' is not a column of
# github_count_df, so the original sort had no field to order by.
bar_order = alt.EncodingSortField('Registered users', order='descending')

ch_g_us = alt.Chart(github_count_df,height=200).mark_bar().encode(
    x=alt.X('country_code', sort=bar_order),
    y='Registered users')
save(ch_g_us,f"{material_outputs}_v1/fig_16_github_count.pdf")

ch_g_us

In [None]:
# EU registrations as a fraction of the registrations across all country codes
top_github_eu.sum()/github_wide.sum().sum()

### PyPI

In [None]:
# Query template: number of PyPI file downloads per country code on one day.
# The "{}" placeholder is filled with the date via str.format.
pyq_all = '''SELECT COUNT(*), country_code
FROM `the-psf.pypi.file_downloads` 
WHERE DATE(timestamp) = "{}" 
GROUP BY country_code'''

In [None]:
# Query template: number of PyPI downloads of machine learning packages per
# country code on one day ("{}" is filled with the date via str.format).
# PyTorch and scikit-learn are distributed on PyPI as 'torch' and
# 'scikit-learn'; 'pytorch' and 'sklearn' are only placeholder projects, so
# the real names are added while the legacy ones are kept for continuity.
pyq_ml = '''SELECT COUNT(*), country_code
FROM `the-psf.pypi.file_downloads` 
WHERE file.project in ('tensorflow','keras','torch','pytorch','scikit-learn','sklearn') AND DATE(timestamp) = "{}" 
GROUP BY country_code'''

In [None]:
# One result table per day for 2020-04-01 .. 2020-04-06 (all PyPI downloads).
# Uses the shared project_id instead of repeating the literal, and formats
# the day with a zero-padded format spec.
py_all = [pandas_gbq.read_gbq(pyq_all.format(f'2020-04-{day:02d}'),
                              project_id=project_id,
                              credentials=creds) for day in range(1, 7)]


In [None]:
# One result table per day for 2020-04-01 .. 2020-04-06 (ML package downloads).
# Uses the shared project_id instead of repeating the literal, and formats
# the day with a zero-padded format spec.
py_ml = [pandas_gbq.read_gbq(pyq_ml.format(f'2020-04-{day:02d}'),
                             project_id=project_id,
                             credentials=creds) for day in range(1, 7)]

In [None]:
# Average the daily counts per country for each query, then put both
# averages side by side; countries missing from one query get 0
mean_daily_downloads = []
for daily_tables, series_name in zip([py_all, py_ml], ['all_files', 'ml_packages']):
    all_days = pd.concat(daily_tables)
    country_mean = all_days.groupby('country_code')['f0_'].mean()
    mean_daily_downloads.append(country_mean.rename(series_name))

py_downloads = pd.concat(mean_daily_downloads, axis=1).fillna(0)

In [None]:
# Lower-case the country codes so they match eu_codes.
# NOTE(review): assigning a plain list also leaves the index unnamed, which is
# presumably why later cells refer to an 'index' column after reset_index.
py_downloads.index = [x.lower() for x in py_downloads.index]

In [None]:
# Keep the EU rows with a boolean mask: pandas 2.x rejects a set passed to
# .loc, and the mask also tolerates EU codes that are absent from the data.
is_eu_row = py_downloads.index.isin(eu_codes)
euro_py = py_downloads.loc[is_eu_row].sort_values('all_files',ascending=False)

# Each country's percentage share of the EU total, per download type
euro_py_share = 100*euro_py.apply(lambda x: x/x.sum())
euro_py_long = euro_py_share.reset_index(drop=False).melt(id_vars='index',
                                                          var_name='download_type',
                                                          value_name='download_share')

In [None]:
base = alt.Chart(euro_py_long)

# Shared axes: countries ordered by their summed download share
share_order = alt.EncodingSortField(field='download_share', op='sum', order='descending')
x_pos = alt.X('index', sort=share_order)
y_pos = alt.Y('download_share', title='Share of EU downloads')

# One filled marker per country and download type
p = base.mark_point(filled=True, size=50, stroke='black', strokeWidth=1).encode(
    x=x_pos,
    y=y_pos,
    color='download_type',
    shape='download_type')

# Dashed connector between the two markers of each country
l = base.mark_line(strokeDash=[1,2]).encode(x=x_pos, y=y_pos, detail='index')

f = (p + l).properties(width=500, height=200)

save(f, f"{material_outputs}_v1/fig_17_python_dloads.pdf")

f

In [None]:
# Downloads summed for EU vs non-EU countries, plus the EU share of the total
(py_downloads
 .assign(is_eu=lambda df: df.index.isin(eu_codes))
 .groupby('is_eu').sum()
 .T
 .assign(share=lambda totals: totals[True] / totals.sum(axis=1)))

In [None]:
# Downloads summed for the 'cn' country code vs all others, plus its share of the total
(py_downloads
 .assign(is_cn=lambda df: df.index == 'cn')
 .groupby('is_cn').sum()
 .T
 .assign(share=lambda totals: totals[True] / totals.sum(axis=1)))

### Stack Overflow

In [None]:
#Count Stack Overflow users by location and year of account creation
q2 = '''SELECT EXTRACT (YEAR FROM creation_date), COUNT(id), location
FROM `bigquery-public-data.stackoverflow.users` 
GROUP BY location, EXTRACT (YEAR FROM creation_date) 
'''

In [None]:
# Run the Stack Overflow query. The project is taken from the shared
# project_id variable rather than repeating the literal in every call.
stackover = pandas_gbq.read_gbq(q2,
                                project_id=project_id,
                                credentials=creds)

In [None]:
# The 20 locations with most users, summed over all creation years
users_per_location = stackover.groupby('location')['f1_'].sum()
top_stack_locs = (users_per_location
                  .sort_values(ascending=False)
                  .head(20)
                  .reset_index(drop=False)
                  .rename(columns={'f1_': 'users'}))

In [None]:
# Horizontal bars of users per location, largest first
location_axis = alt.Y('location:O',
                      sort=alt.EncodingSortField('users', order='descending'))

stack = alt.Chart(top_stack_locs).mark_bar().encode(
    y=location_axis,
    x='users:Q').properties(width=200, height=250)

save(stack, f"{material_outputs}_v1/fig_18_stack_users.pdf")

stack

### Save all the data for the analytical synthesis

In [None]:
# Meetup groups created per country and year (not cumulative here; this
# rebinds the meetup_long used for the earlier chart)
meetup_long = pd.crosstab(
    gs['country'],gs['year']).reset_index(drop=False).melt(id_vars=['country'],
                                                           value_name='tech_meetups')
# GitHub registrations per year and country code
github_long = github_wide.reset_index(drop=False).melt(
    id_vars='year_created',value_name='github_users').rename(
    columns={'year_created':'year','country_code':'country'})

# Select the EU rows with a boolean mask: pandas 2.x rejects a set passed to
# .loc, and the mask also tolerates EU codes that are absent from the data.
# NOTE(review): the download queries cover April 2020 but the rows are
# labelled year=2018 -- confirm this is intentional.
eu_python_downloads = py_downloads.loc[py_downloads.index.isin(eu_codes), 'all_files']
python_long = eu_python_downloads.reset_index(drop=False).rename(columns={
    'index':'country','all_files':'python_downloads'}).assign(year=2018)

# Stack the three sources as (country, year, variable, value) rows.
# NOTE(review): the GitHub and Python country codes are lower-case while the
# Meetup codes look upper-case (e.g. 'AL') -- confirm before joining on country.
out = pd.concat([x.melt(id_vars=['country','year']) for x in [meetup_long,github_long,python_long]])


In [None]:
# Display the combined long-format indicator table
out

In [None]:
# Write the combined indicators to the processed data folder, without the row index
out.to_csv(f"{project_dir}/data/processed/web_indicators.csv",index=False)