# Web sources

This is a sandbox to explore potential web indicator data collections for EIS.

We will:

* Create a summary table
* Collect LinkedIn skills migration data
* Explore options to query Google BigQuery for data about:
  * GitHub
  * Python downloads
* Carry out a toy scrape of the Study portals website

## Preamble

In [None]:
%run ../notebook_preamble.ipy

from eis.utils.data_processing import *
from datetime import datetime
import seaborn as sn
from ast import literal_eval

from google.oauth2 import service_account
import pandas_gbq

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old names, so use whichever spelling this installation
# offers. If neither is listed, fall back to the original name so that the
# failure is as loud as before.
_whitegrid_style = next(
    (style_name
     for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style_name in plt.style.available),
    'seaborn-whitegrid')
plt.style.use(_whitegrid_style)

# Base font size for every figure drawn below.
plt.rc('font', size=14)

In [None]:
# Credentials configuration used to collect Nesta and Google BigQuery data.
# Variables declared in a local .env file are loaded into the environment first.
from dotenv import load_dotenv

load_dotenv()

# Value of the `config_path` environment variable (None when it is not set).
cp = os.getenv('config_path')



## Analysis

In [None]:
# Country codes and abbreviations table (32 records) downloaded from the
# EEA website; its ISO2 column is used below to select countries.
eea_country_codes_url = (
    'https://www.eea.europa.eu/data-and-maps/data/waterbase-lakes-4/'
    'country-codes-and-abbreviations-32-records/'
    'country-codes-and-abbreviations-32-records/at_download/file')
country_codes = pd.read_csv(eea_country_codes_url)

### Table

In [None]:
# Load the indicator inventory; cells containing 'TBC' are read as missing.
inventory_path = f"{project_dir}/data/aux/eis_indicator_inventory.csv"
ind = pd.read_csv(inventory_path, na_values='TBC')

In [None]:
# Export the indicators whose method type is 'Web', keeping only the
# descriptive columns, without the row index.
is_web_indicator = ind['method_type'] == 'Web'
table_columns = ['category', 'indicator', 'source', 'description']
ind.loc[is_web_indicator, table_columns].to_csv(
    f"{material_outputs}/table_4_web.csv", index=False)

### LinkedIn

In [None]:
# 'Skill Migration' sheet of the public talent-migration workbook.
talent_migration_url = (
    'https://development-data-hub-s3-public.s3.amazonaws.com/'
    'ddhfiles/144635/public_use-talent-migration.xlsx')
li = pd.read_excel(talent_migration_url, sheet_name='Skill Migration')

In [None]:
# Lower-cased ISO2 codes of the countries in the EEA lookup table.
eu_codes = {iso2.lower() for iso2 in country_codes['ISO2']}

# LinkedIn rows whose country code belongs to that set.
eu_li = li.loc[li['country_code'].isin(eu_codes)]

In [None]:
# Skill groups treated as AI-related in the LinkedIn data.
ai = ['Artificial Intelligence', 'Data Science', 'Natural Language Processing']

# Keep only the rows for those skill groups.
eu_ai = eu_li.loc[eu_li['skill_group_name'].isin(ai)]

In [None]:
# Identifier columns plus the yearly net-migration columns for 2015-2018.
id_columns = ['country_name', 'country_code']
rel = id_columns + [f'net_per_10K_{yr}' for yr in range(2015, 2019)]

# Long format: one row per original row and yearly column.
eu_ai_long = eu_ai[rel].melt(id_vars=id_columns)

# The year is the last underscore-separated token of the column name.
eu_ai_long['year'] = [int(column_name.split('_')[-1])
                      for column_name in eu_ai_long['variable']]

# Sum the selected skill groups per country and year, then order the
# countries by their 2018 value (ascending).
country_by_year = eu_ai_long.pivot_table(
    index='country_code', columns='year', values='value', aggfunc='sum')
ai_agg = country_by_year.sort_values(2018, ascending=True)

# Two-year rolling mean across years; years with missing values (such as
# the first one, which has no complete window) are dropped.
years_as_rows = ai_agg.T
ai_agg = years_as_rows.rolling(window=2).mean().dropna().T

In [None]:
# Horizontal bar chart with one bar per country and remaining year column.
ax = ai_agg.plot(kind='barh', figsize=(5, 10))
ax.set_xlabel('Net gain or loss of talent with skill \n (2-year rolling average)')

save_fig("fig_7_linkedin.pdf", material_outputs)


### Meetup

In [None]:
from data_getters.meetup import select_meetup

In [None]:
# Query Meetup once per country code in the EEA lookup table and keep every
# result. NOTE(review): the literal 34 is passed straight to select_meetup;
# presumably a Meetup category id -- confirm against data_getters.meetup.
eu_meetup_groups = []

for iso2 in country_codes['ISO2']:
    print(iso2)  # progress trace, one line per country
    eu_meetup_groups.append(select_meetup(cp, 34, iso2))

In [None]:
# Pickle the raw Meetup query results into the raw data folder.
meetups_pickle_path = f"{project_dir}/data/raw/eu_meetups.p"
with open(meetups_pickle_path, 'wb') as pickle_file:
    pickle.dump(eu_meetup_groups, pickle_file)

In [None]:
# Stack the 'core_groups' table of every country result into a single frame
# with a fresh 0..n-1 row index.
core_group_frames = [country_result['core_groups'] for country_result in eu_meetup_groups]
gs = pd.concat(core_group_frames, ignore_index=True)

In [None]:
# Number of groups collected overall. NOTE: with default notebook settings
# only the last expression of a cell is displayed.
len(gs)
# Number of groups whose country code is 'AL'.
(gs['country'] == 'AL').sum()

In [None]:
# Sum of the 'members' column across all groups, expressed in millions.
gs['members'].sum() / 1e6

In [None]:
# Some parsing

# Year of creation. 'created' is divided by 1000 before conversion, i.e. it
# is handled as an epoch timestamp in milliseconds. The builtin float is
# used because the np.float alias was deprecated in NumPy 1.20 and removed
# in 1.24. NOTE: datetime.fromtimestamp converts to the machine's local
# timezone.
gs['year'] = [datetime.fromtimestamp(float(x) / 1000).year for x in gs['created']]

# Topics: each 'topics' cell is a string holding a Python literal; it is
# parsed into a list of dicts, from which the 'urlkey' values are kept.
gs['topic_list'] = [literal_eval(x) for x in gs['topics']]
gs['topic_kws'] = [[x['urlkey'] for x in el] for el in gs['topic_list']]

In [None]:
# Number of distinct topic keywords across all groups.
distinct_topic_kws = set(flatten_list(gs['topic_kws']))
len(distinct_topic_kws)

In [None]:
# Flag every meetup group by technology, based on its topic keywords.
ai = {'machine-learning', 'ai', 'deep-learning', 'data-science'}
vr = {'virtual-reality', 'augmented-reality', 'vr'}
crypto = {'cryptocurrency', 'blockchain', 'bitcoin'}

# A group is flagged when it shares at least one keyword with the set.
for flag_column, tech_set in [('has_ai', ai), ('has_vr', vr), ('has_crypto', crypto)]:
    gs[flag_column] = [not tech_set.isdisjoint(kws) for kws in gs['topic_kws']]

In [None]:
# Two stacked panels sharing the country axis.
fig, ax = plt.subplots(figsize=(12, 7), nrows=2, sharex=True)
top_panel, bottom_panel = ax

# Top panel: number of groups per country, most frequent first.
country_freqs = gs['country'].value_counts()
country_freqs.plot.bar(cmap='Purples_r', ax=top_panel)
top_panel.set_ylabel('Total number of meetups')

# Bottom panel: percentage of each country's groups flagged as AI, shown in
# the same country order as the top panel.
ai_share_pct = 100 * gs.groupby('country')['has_ai'].mean()
ai_share_pct.loc[country_freqs.index].plot.bar(
    ax=bottom_panel, cmap='Purples', edgecolor='purple')
bottom_panel.set_ylabel('% of meetups in AI')

plt.tight_layout()

save_fig('fig_8_meetups.pdf', material_outputs)

In [None]:
# Share of groups carrying each technology flag per creation year, smoothed
# with a 3-year rolling mean and expressed as a percentage.
flag_columns = ['has_ai', 'has_vr', 'has_crypto']
yearly_share = gs.groupby(['year'])[flag_columns].mean()
smoothed_share_pct = 100 * yearly_share.rolling(window=3).mean().dropna()

ax = smoothed_share_pct.plot(color=['purple', 'blue', 'orange'])
ax.set_ylabel('Share of technology meetups \n with topic')

save_fig('fig_9_meetup_topics.pdf', material_outputs)

### Google BigQuery

In [None]:
# Service-account credentials for BigQuery, read from a JSON key file kept
# in the project directory.
credentials_file = f"{project_dir}/gbq_eis_credentials.json"
creds = service_account.Credentials.from_service_account_file(credentials_file)

# Google Cloud project id.
project_id = 'eis-2-275207'

In [None]:
# Count the users in the GHTorrent `users` table by registration year and
# country code, keeping only rows with fake = 0 and deleted = 0.
# The two computed columns are not aliased in the SELECT, so they come back
# under auto-generated names (f0_, f1_) and are renamed after download.
q1 = '''SELECT EXTRACT (YEAR FROM created_at), COUNT(id), country_code
FROM `ghtorrentmysql1906.MySQL1906.users`
WHERE fake = 0 AND deleted = 0
GROUP BY country_code, EXTRACT (YEAR FROM created_at)'''

In [None]:
# Run the GitHub registrations query. The project id comes from the
# `project_id` variable defined together with `creds`, instead of repeating
# the literal in every call.
github_reg = pandas_gbq.read_gbq(q1,
                                 project_id=project_id,
                                 credentials=creds)

In [None]:
# Peek at the first five rows of the query result.
github_reg.head(n=5)

In [None]:
# Give the auto-named query columns readable names, modifying the frame in
# place. The 'country_code' entry maps the column onto itself.
github_column_names = {
    'f0_': 'year_created',
    'f1_': 'user_count',
    'country_code': 'country_code',
}
github_reg.rename(columns=github_column_names, inplace=True)

In [None]:
# Users per registration year (rows) and country code (columns); year/country
# combinations without users count as 0.
github_wide = github_reg.pivot_table(index='year_created', columns='country_code',
                                     values='user_count').fillna(0)

# Restrict to the EU codes actually present in the table. Indexing with the
# `eu_codes` set directly is rejected by pandas >= 2.0 and, on older versions,
# raises KeyError as soon as one code has no registered users.
eu_columns = [code for code in github_wide.columns if code in eu_codes]
top_github_eu = github_wide[eu_columns].sum().sort_values(ascending=False)

# The seven EU countries with the most users are shown individually; all
# remaining EU countries are summed into a single 'other' series.
top_gh_eu_names = top_github_eu[:7].index
other_columns = [code for code in eu_columns if code not in top_gh_eu_names]

eu_totals = pd.concat([github_wide[top_gh_eu_names],
                       github_wide[other_columns].sum(axis=1).rename('other')],
                      axis=1)

# Stacked area chart of cumulative registrations over the years.
ax = eu_totals.cumsum().plot.area(cmap='Accent', figsize=(7, 4))
ax.ticklabel_format(axis='y', style='sci')
ax.set_ylabel('Registered users \n (cumulative)')
ax.set_xlabel('')

plt.tight_layout()
save_fig('fig_10_github_trends.pdf', material_outputs)

In [None]:
# Bar chart of total registered users per EU country, largest first.
ax = top_github_eu.plot(kind='bar', color='Purple', figsize=(10, 4))
ax.set(xlabel='', ylabel='Total number of \n registered users')
save_fig('fig_11_github_countries.pdf', material_outputs)


In [None]:
# Share of all registered users that belong to the selected EU countries.
eu_user_total = top_github_eu.sum()
all_user_total = github_wide.sum().sum()
eu_user_total / all_user_total

### PyPI

In [None]:
# Query template: number of rows in the PyPI file-downloads table per country
# code for a single day. The `{}` placeholder is filled with the date via
# str.format before the query is sent. The count column is not aliased, so
# it is returned under an auto-generated name (read back below as f0_).
pyq_all = '''SELECT COUNT(*), country_code
FROM `the-psf.pypi.file_downloads` 
WHERE DATE(timestamp) = "{}" 
GROUP BY country_code'''

In [None]:
# Query template: same per-country daily download count as the all-files
# query, restricted to machine-learning projects. The `{}` placeholder is
# filled with the date via str.format.
# The filter uses the names under which the libraries are published on PyPI:
# PyTorch is distributed as 'torch' and scikit-learn as 'scikit-learn'. The
# previous 'pytorch' and 'sklearn' entries are not the real distributions,
# so the ML download counts were understated.
pyq_ml = '''SELECT COUNT(*), country_code
FROM `the-psf.pypi.file_downloads` 
WHERE file.project in ('tensorflow','keras','torch','scikit-learn') AND DATE(timestamp) = "{}" 
GROUP BY country_code'''

In [None]:
# One query per day for 1-6 September 2019 (all files). The date is built
# with zero-padded formatting and the project id comes from the `project_id`
# variable defined together with `creds`, instead of a repeated literal.
py_all = [
    pandas_gbq.read_gbq(pyq_all.format(f'2019-09-{day:02d}'),
                        project_id=project_id,
                        credentials=creds)
    for day in range(1, 7)]


In [None]:
# One query per day for 1-6 September 2019 (machine-learning packages). The
# date is built with zero-padded formatting and the project id comes from
# the `project_id` variable defined together with `creds`, instead of a
# repeated literal.
py_ml = [
    pandas_gbq.read_gbq(pyq_ml.format(f'2019-09-{day:02d}'),
                        project_id=project_id,
                        credentials=creds)
    for day in range(1, 7)]

In [None]:
# For each set of daily results: stack the days, average the daily count
# per country code (over the days on which the country appears) and name
# the series. Countries missing from one of the two series get 0.
daily_results = {'all_files': py_all, 'ml_packages': py_ml}
py_downloads = pd.concat(
    [pd.concat(daily_frames).groupby('country_code')['f0_'].mean().rename(label)
     for label, daily_frames in daily_results.items()],
    axis=1).fillna(0)

In [None]:
# Lower-case the country codes so they can be matched against eu_codes.
# Assigning a plain list also resets the index name.
py_downloads.index = list(map(str.lower, py_downloads.index))

In [None]:
# Rows for EU countries, largest total downloads first. A boolean mask is
# used because a set is not a valid .loc indexer in pandas >= 2.0, and
# because label-based selection raises KeyError when an EU code has no
# recorded downloads.
is_eu_row = py_downloads.index.isin(eu_codes)
euro_py = py_downloads.loc[is_eu_row].sort_values('all_files', ascending=False)

# Each country's percentage of the EU total, computed per column.
country_share_pct = 100 * euro_py.div(euro_py.sum(), axis='columns')

ax = country_share_pct.plot.bar(
    cmap='Purples_r', edgecolor='purple', figsize=(12, 5))
ax.set_ylabel('% of downloads \n accounted by the country')

save_fig('fig_12_python_dowloads.pdf', material_outputs)

In [None]:
# Total downloads inside and outside the EU code set for each series, plus
# the EU share of the overall total.
eu_split = py_downloads.assign(
    is_eu=lambda frame: frame.index.isin(eu_codes)).groupby('is_eu').sum().T
eu_split.assign(share=lambda frame: frame[True] / frame.sum(axis=1))

In [None]:
# Same split for the single country code 'cn': downloads for 'cn' versus
# all other codes, plus the 'cn' share of the overall total.
cn_split = py_downloads.assign(
    is_cn=lambda frame: frame.index == 'cn').groupby('is_cn').sum().T
cn_split.assign(share=lambda frame: frame[True] / frame.sum(axis=1))

### Stack Overflow

In [None]:
# Count the rows of the public Stack Overflow `users` table by the year of
# `creation_date` and by `location`. The computed columns are not aliased,
# so they come back under auto-generated names (the count is read below as
# f1_).
q2 = '''SELECT EXTRACT (YEAR FROM creation_date), COUNT(id), location
FROM `bigquery-public-data.stackoverflow.users` 
GROUP BY location, EXTRACT (YEAR FROM creation_date) 
'''

In [None]:
# Run the Stack Overflow users query. The project id comes from the
# `project_id` variable defined together with `creds`, instead of repeating
# the literal.
stackover = pandas_gbq.read_gbq(q2,
                                project_id=project_id,
                                credentials=creds)

In [None]:
# Total users per location across all years; keep the 20 largest and
# reverse them so the biggest bar is drawn at the top of the chart.
users_by_location = stackover.groupby('location')['f1_'].sum()
top_locations = users_by_location.sort_values(ascending=False).head(20).iloc[::-1]

ax = top_locations.plot(kind='barh', figsize=(7, 7), color='Purple')
ax.set_xlabel('Registered users')
ax.set_ylabel('')

save_fig('fig_13_stack_overflow.pdf', material_outputs)

### Save all the data for the analytical synthesis

In [None]:
# Assemble one long table (country, year, variable, value) from the three
# web sources.

# Meetup: number of groups per country and creation year.
meetup_long = pd.crosstab(
    gs['country'], gs['year']).reset_index(drop=False).melt(id_vars=['country'],
                                                           value_name='tech_meetups')

# GitHub: registered users per registration year and country code.
github_long = github_wide.reset_index(drop=False).melt(
    id_vars='year_created', value_name='github_users').rename(
    columns={'year_created': 'year', 'country_code': 'country'})

# PyPI: the 'all_files' series for EU countries, labelled with year 2019.
# A boolean mask replaces `.loc[eu_codes]`: a set is not a valid indexer in
# pandas >= 2.0, and label-based selection raises KeyError when an EU code
# has no recorded downloads.
is_eu_row = py_downloads.index.isin(eu_codes)
python_long = py_downloads.loc[is_eu_row, 'all_files'].reset_index(drop=False).rename(columns={
    'index': 'country', 'all_files': 'python_downloads'}).assign(year=2019)

# NOTE(review): the Meetup country codes are compared against upper-case
# 'AL' elsewhere in this notebook, whereas eu_codes and the PyPI index are
# lower-case, so `country` may mix cases in the combined table -- confirm
# before joining on it downstream.
out = pd.concat([x.melt(id_vars=['country', 'year'])
                 for x in [meetup_long, github_long, python_long]])


In [None]:
# Write the combined indicator table to the interim data folder, without
# the row index.
web_indicators_path = f"{project_dir}/data/interim/web_indicators.csv"
out.to_csv(web_indicators_path, index=False)