# Web sources

This is a sandbox to explore potential web indicator data collections for EIS.

We will:

* Create a summary table
* Collect LinkedIn skills migration data
* Explore options to query Google BigQuery about:
  * GitHub
  * Python downloads
* Carry out a toy scrape of the StudyPortals website

## Preamble

In [None]:
%run ../notebook_preamble.ipy

import os
import requests
from zipfile import ZipFile
from io import BytesIO

from eis.utils.data_processing import *
from datetime import datetime
from ast import literal_eval
import altair as alt
from altair_saver import save

from google.oauth2 import service_account
import pandas_gbq

#Altair saving code
w = make_altair_save()

save_dir = f"{project_dir}/reports/figures/final_report"

def save_altair_(f,n):
    """Save chart `f` under the name `n` in the final-report figures folder.

    Thin wrapper around `save_altair` that always passes the notebook-level
    `w` (the object returned by `make_altair_save()`) and `fig_path=save_dir`.
    """
    save_altair(f,n,w,fig_path=save_dir)

In [None]:
#Various credentials to collect Nesta and Google Big Query data
from dotenv import load_dotenv

load_dotenv()

cp = os.environ.get('config_path')

In [None]:
def get_country_name(code_var,lookup):
    """Map a Series of country codes to country names.

    Args:
        code_var: pandas Series of country codes in any letter case.
        lookup: mapping from lower-case country code to country name.

    Returns:
        A Series aligned with `code_var` holding the looked-up names. Codes
        that are missing, not strings, or absent from `lookup` become NaN.
    """
    # Lower-case only real strings so that a missing code (NaN/None) yields
    # NaN instead of raising AttributeError on `.lower()`.
    lowered = code_var.map(lambda code: code.lower() if isinstance(code, str) else code)

    return lowered.map(lookup)
    

## Analysis

In [None]:
# country_codes = pd.read_csv(
#     'https://www.eea.europa.eu/data-and-maps/data/waterbase-lakes-4/country-codes-and-abbreviations-32-records/country-codes-and-abbreviations-32-records/at_download/file')

# Build the {country code: country name} lookup from the auxiliary CSV
country_codes_lu = (
    pd.read_csv(f"{data_path}/aux/eu_iso_2_name_lookup.csv")
    .set_index('Unnamed: 0')['0']
    .to_dict()
)

# The lookup keys double as the set of codes used to filter other datasets
country_codes = set(country_codes_lu)

### Table

In [None]:
#Read data
ind = pd.read_csv(f"{project_dir}/data/aux/eis_indicator_inventory.csv",na_values='TBC')

In [None]:
ind.loc[ind['method_type']=='Web'][
    ['category','indicator','source','description']].to_csv(f"{material_outputs}/table_4_web.csv",index=False)

### LinkedIn

In [None]:
li = pd.read_excel('https://development-data-hub-s3-public.s3.amazonaws.com/ddhfiles/144635/public_use-talent-migration.xlsx',
                  sheet_name='Skill Migration')

In [None]:
# Keep only the LinkedIn rows whose country code is in the lookup
eu_codes = country_codes
eu_li = li.loc[li['country_code'].isin(eu_codes)]

In [None]:
# Skill groups treated as AI-related
ai = ['Artificial Intelligence','Data Science','Natural Language Processing']
eu_ai = eu_li.loc[eu_li['skill_group_name'].isin(ai)]

In [None]:
# Country identifiers plus the yearly net-flow columns
rel = ['country_name','country_code',
       'net_per_10K_2016','net_per_10K_2017','net_per_10K_2018','net_per_10K_2019']

# Long format: one row per original row and yearly column
eu_ai_long = eu_ai[rel].melt(id_vars=['country_name','country_code'])

# The year is the suffix of the melted column name (net_per_10K_2018 -> 2018)
eu_ai_long['year'] = eu_ai_long['variable'].str.split('_').str[-1].astype(int)

# Sum the selected skill groups per country and year, ordered by the 2018 value
ai_agg = (
    eu_ai_long
    .pivot_table(index=['country_code','country_name'],columns='year',values='value',aggfunc='sum')
    .sort_values(2018,ascending=True)
)

# Two-year rolling mean over time: transposing puts the years on the rows so
# that rolling() and dropna() run along the year axis
ai_agg = ai_agg.T.rolling(window=2).mean().dropna().T

In [None]:
ai_agg_long = ai_agg.reset_index(drop=False).melt(id_vars=['country_code','country_name'],
                                                  var_name='year')

In [None]:
# Dot plot of the rolling net talent flow per country, one point per year
base = alt.Chart(ai_agg_long)

# Layer 1: one point per country and year; countries ordered by their mean value
a = base.mark_point().encode(y=alt.Y('country_name',axis=alt.Axis(grid=True,gridWidth=1),
                                    sort=alt.EncodingSortField('value','mean',order='descending')),
                                     x=alt.X('value',title='Net flow of talent per 10K members'),
                             color='year:N',shape='year:N')
# Layer 2: a dashed line per country joining its yearly points (same sort as layer 1)
b = base.mark_line(strokeDash=[3,1],strokeWidth=1).encode(
    y=alt.Y('country_name:N',title=None,sort=alt.EncodingSortField('value','mean',order='descending')),
    x='value:Q',detail=alt.Detail('country_name:N'))

# Layer 3: vertical reference rule at x = 0
c = base.mark_rule().transform_calculate(zero='0').encode(x='zero:Q')

d = (a+b+c).properties(height=500)

save_altair_(d,'fig_8_linkedin')

d

### Meetup

In [None]:
# Load the locally stored meetup data from a pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files produced by this project.
# NOTE(review): `pickle` is not imported in the cells visible here - presumably
# it comes from the preamble or the star import; confirm.
with open(f"{project_dir}/data/raw/eu_meetup.p",'rb') as infile:
    eu_meetup_groups = pickle.load(infile)

In [None]:
gs = pd.concat([x['core_groups'] for x in eu_meetup_groups]).reset_index(drop=True)

In [None]:
#Some parsing

#of years
# `created` is divided by 1000 before conversion, i.e. it is handled as a
# millisecond epoch timestamp. The builtin float replaces the `np.float`
# alias, which was deprecated in NumPy 1.20 and removed in 1.24.
gs['year'] = [datetime.fromtimestamp(float(x)/1000).year for x in gs['created']]

#Of topics
# `topics` holds the string repr of a list of dicts; parse it, then keep each
# topic's `urlkey`
gs['topic_list'] = [literal_eval(x) for x in gs['topics']]
gs['topic_kws'] = [[x['urlkey'] for x in el] for el in gs['topic_list']]

In [None]:
#Tag meetups with technology keywords
ai = {'machine-learning','ai','deep-learning','data-science'}
vr = {'virtual-reality','augmented-reality','vr'}
crypto = {'cryptocurrency','blockchain','bitcoin'}

# One 0/1 column per technology: 1 when the group shares at least one topic
# keyword with that technology's keyword set
for column, tech_set in [('Artificial Intelligence',ai),('Virtual Reality',vr),('Crypto',crypto)]:
    gs[column] = [int(not tech_set.isdisjoint(kws)) for kws in gs['topic_kws']]

gs['country_name'] = get_country_name(gs['country'],country_codes_lu)

In [None]:
meetup_long = pd.crosstab(gs['country_name'],gs['year']).cumsum(axis=1).reset_index(drop=False).melt(id_vars='country_name',value_name='groups')

#meetup_long['country_name'] = get_country_name(meetup_long['country'],country_codes_lu)

In [None]:
# Small multiples: one line chart of `groups` by year per country, with the
# facets ordered by each country's maximum value
ch_1 = (
    alt.Chart(meetup_long,width=60,height=70)
    .mark_line()
    .encode(
        x='year:O',
        y='groups',
        facet=alt.Facet(
            'country_name',columns=8,title='Country',
            sort=alt.EncodingSortField('groups','max',order='descending'),
        ),
    )
)

save_altair_(ch_1,"fig_9_meetup_trends")
ch_1

In [None]:
# Yearly mean (x100) of each technology tag column, in long format
em_tech_ev = (
    (100*gs.groupby(['year'])[['Artificial Intelligence','Virtual Reality','Crypto']].mean())
    .reset_index(drop=False)
    .melt(id_vars='year',var_name='Technology activity')
)

# Each country's percentage share of the tagged groups, computed per technology
em_tech_geo = (
    gs.groupby(['country_name'])[['Artificial Intelligence','Virtual Reality','Crypto']]
    .sum()
    .apply(lambda col: 100*col/col.sum())
    .reset_index(drop=False)
    .melt(id_vars='country_name',var_name='Technology',value_name='Share')
)

# name_lookup = {'has_ai':'AI','has_crypto':['Cryptocurrencies', '& Blockchain'],'has_vr':'immersive'}

# sort_tech = list(name_lookup.values())

# em_tech_geo['tech_clean'] = em_tech_geo['Technology'].map(name_lookup)

In [None]:
ch_3 = alt.Chart(em_tech_ev,width=600).transform_window(
    mean_value='mean(value)',groupby=['Technology activity'],frame=[-1,+1]).mark_line().encode(
x=alt.X('year:O'),y=alt.Y('mean_value:Q',title='% of activity accounted by year'),color='Technology activity')


ch_4 = alt.Chart(em_tech_geo).mark_circle(stroke='black',
                                         strokeWidth=1).encode(y=alt.Y('Technology:N',
                                                                       sort=['Artificial Intelligence','Crypto',
                                                                             'Virtual Reality']),
                                         x = alt.X('country_name',sort=alt.EncodingSortField(
                                             'Share','sum',order='descending')),
                                                               size=alt.Size('Share',legend=None),
                                                               color='Share').properties(height=100)

ch_5 = alt.vconcat(ch_3,ch_4)

save_altair_(ch_5,"fig_10_meetup_trends")

ch_5

### Google BigQuery

In [None]:
creds = service_account.Credentials.from_service_account_file(
    f"{project_dir}/gbq_eis_credentials.json")

project_id = 'eis-2-275207'

In [None]:
#This query counts GitHub users by year of registration and country code,
#keeping only accounts that are neither flagged as fake nor deleted
q1 = '''SELECT EXTRACT (YEAR FROM created_at), COUNT(id), country_code
FROM `ghtorrentmysql1906.MySQL1906.users`
WHERE fake = 0 AND deleted = 0
GROUP BY country_code, EXTRACT (YEAR FROM created_at)'''

In [None]:
github_reg = pandas_gbq.read_gbq(q1, 
                project_id='eis-2-275207',
                credentials=creds)

In [None]:
github_reg.head()

In [None]:
# The two unaliased query columns come back as f0_ (year) and f1_ (count);
# country_code already has its final name
github_reg.rename(
    columns={'f0_':'year_created','f1_':'user_count'},
    inplace=True,
)

In [None]:
# Users per registration year (rows) and country code (columns)
github_wide = github_reg.pivot_table(index='year_created',columns='country_code',
                                    values='user_count').fillna(0)

# Total registrations per EU country, largest first
top_github_eu = github_wide[[x for x in eu_codes if x in github_wide.columns]].sum().sort_values(ascending=False)

# The seven largest countries are shown individually, the rest are pooled
top_gh_eu_names = top_github_eu[:7].index
top_gh_eu_set = set(top_gh_eu_names)
other_eu = [x for x in github_wide.columns if x in eu_codes and x not in top_gh_eu_set]

# Cumulative registrations over time; after the transpose each row is a
# country (or 'other') and each column a year
eu_totals = pd.concat(
    [github_wide[top_gh_eu_names],
     github_wide[other_eu].sum(axis=1).rename('other')],
    axis=1).cumsum().T

#Need this to order the variables
# Derived from the number of rows instead of a hard-coded 8, so the cell still
# runs when fewer than seven EU countries are present in the data
eu_totals['order'] = list(range(len(eu_totals)))
eu_totals_long = eu_totals.reset_index(drop=False).melt(id_vars=['index','order'])

# Codes without a lookup entry (i.e. the pooled 'other' row) are labelled 'Other'
eu_totals_long['country_name'] = [country_codes_lu.get(x,'Other') for x in eu_totals_long['index']]

In [None]:
# The colour channel encodes country *names*, so its sort list must hold names
# too. The index of eu_totals holds country codes (plus 'other'), which never
# match the encoded values; translate them with the same lookup used to build
# the country_name column.
legend_order = [country_codes_lu.get(code,'Other') for code in eu_totals.index][::-1]

# Stacked area of cumulative registrations; the `order` column fixes the
# stacking order of the countries
ch_g = alt.Chart(eu_totals_long).mark_area(stroke='Grey').encode(
    x=alt.X('year_created:O',title='Year'),
    y=alt.Y('value',title='Registered members (cumulative)'),
    color=alt.Color('country_name',title='Country',
                    sort=legend_order,scale=alt.Scale(scheme='Accent')),order='order').properties(
    width=400)

save_altair_(ch_g,'fig_11_github')

ch_g

In [None]:
# Total registered users per EU country, with readable country names
github_count_df = top_github_eu.reset_index(drop=False).rename(columns={0:'Registered users'})

github_count_df['country_name'] = github_count_df['country_code'].map(country_codes_lu)

# Sort the bars by the plotted value. The previous sort field, 'count', is not
# a column of github_count_df, so the requested ordering had no data to use.
ch_g_us = alt.Chart(github_count_df,height=200).mark_bar().encode(
    x=alt.X('country_name',
            sort=alt.EncodingSortField('Registered users',order='descending')),
    y='Registered users').properties(width=550)

save_altair_(ch_g_us,'fig_12_github_country_count')
ch_g_us

In [None]:
top_github_eu.sum()/github_wide.sum().sum()

### PyPI

In [None]:
pyq_all = '''SELECT COUNT(*), country_code
FROM `the-psf.pypi.file_downloads` 
WHERE DATE(timestamp) = "{}" 
GROUP BY country_code'''

In [None]:
# Download count per country for one day (the date is filled in with
# .format), restricted to machine-learning packages.
# PyTorch and scikit-learn are published on PyPI as `torch` and
# `scikit-learn`; `pytorch` and `sklearn` are only placeholder projects, so
# filtering on those names misses nearly all real downloads.
pyq_ml = '''SELECT COUNT(*), country_code
FROM `the-psf.pypi.file_downloads` 
WHERE file.project in ('tensorflow','keras','torch','scikit-learn') AND DATE(timestamp) = "{}" 
GROUP BY country_code'''

In [None]:
# One query per day for 1-6 April 2020; each result is a per-country count
py_all = [
    pandas_gbq.read_gbq(
        pyq_all.format(f'2020-04-0{day}'),
        project_id='eis-2-275207',
        credentials=creds,
    )
    for day in range(1,7)
]


In [None]:
# Same six daily queries, using the machine-learning package query
py_ml = [
    pandas_gbq.read_gbq(
        pyq_ml.format(f'2020-04-0{day}'),
        project_id='eis-2-275207',
        credentials=creds,
    )
    for day in range(1,7)
]

In [None]:
# Average the daily counts per country for each query type, then put the two
# averages side by side (countries missing from one of them get 0)
daily_results = {'all_files':py_all,'ml_packages':py_ml}

py_downloads = pd.concat(
    [pd.concat(frames).groupby('country_code')['f0_'].mean().rename(label)
     for label,frames in daily_results.items()],
    axis=1,
).fillna(0)

In [None]:
py_downloads.index = [x.lower() for x in py_downloads.index]

py_downloads.reset_index(drop=False,inplace=True)

In [None]:
euro_py = py_downloads.loc[py_downloads['index'].isin(country_codes)].sort_values('all_files',ascending=False)

euro_py['country_name'] = euro_py['index'].map(country_codes_lu)
euro_py = euro_py.drop('index',axis=1).set_index('country_name')

In [None]:
euro_py_long = (100*euro_py.apply(lambda x: x/x.sum())).reset_index(drop=False).melt(id_vars='country_name',
                                                                      var_name='download_type',value_name=
                                                                                     'download_share')

In [None]:
euro_py_long['download_type'] = ['All' if x=='all_files' else "Machine Learning packages" for x in euro_py_long['download_type']]

In [None]:
# Compare each country's share of all downloads with its share of ML downloads
base = alt.Chart(euro_py_long)

# Shared axis definitions so that both layers use the same country ordering
x_pos = alt.X('country_name',title='Country',
              sort=alt.EncodingSortField(field='download_share',op='sum',order='descending'))
y_pos = alt.Y('download_share',title='Share of EU downloads')

# One point per country and download type
p = base.mark_point(filled=True,size=50,stroke='black',strokeWidth=1).encode(x=x_pos,y=y_pos,color='download_type',
                             shape=alt.Shape('download_type',title='Type of download'))

# Dotted line joining the two points of each country
l = base.mark_line(strokeDash=[1,2]).encode(x=x_pos,y=y_pos,detail='country_name')

f = (p+l).properties(width=500,height=200)

save_altair_(f,"fig_13_pydownloads")

f

### Stack Overflow

In [None]:
#This query counts Stack Overflow users by location and account creation year
q2 = '''SELECT EXTRACT (YEAR FROM creation_date), COUNT(id), location
FROM `bigquery-public-data.stackoverflow.users` 
GROUP BY location, EXTRACT (YEAR FROM creation_date) 
'''

In [None]:
stackover = pandas_gbq.read_gbq(q2, 
                project_id='eis-2-275207',
                credentials=creds)

In [None]:
top_stack_locs = stackover.groupby('location')['f1_'].sum().sort_values(
    ascending=False)[:20].reset_index(drop=False).rename(columns={'f1_':'users'})

In [None]:
stack = alt.Chart(top_stack_locs).mark_bar().encode(y=
                                            alt.Y('location:O',title='Location',
                                                  sort=alt.EncodingSortField('users',order='descending')),
                                                  x='users:Q').properties(width=200,height=250)


save_altair_(stack,'fig_14_stackover')

stack

## StudyPortals

### Preamble

In [None]:
spath = f"{project_dir}/data/raw/studyportals"

In [None]:
# Create the target folder together with any missing parents; this is a no-op
# when it already exists, so no separate existence check is needed
os.makedirs(spath, exist_ok=True)

In [None]:
# Download the zipped course data. Fail here on an HTTP error instead of
# later, when the response body is opened as a zip archive.
file = requests.get("https://github.com/nestauk/eis/blob/3_studydata/data/raw/courses.zip?raw=true")
file.raise_for_status()

In [None]:
ZipFile(BytesIO(file.content)).extractall(spath)

In [None]:
with open(spath+'/courses/108/bachelor/108-bachelor-1000.json','r') as infile:
    test = json.load(infile)

In [None]:
# Read every course JSON under <spath>/courses/<entry>/<level>/ into one
# DataFrame per file
comp_courses = []

courses_root = os.path.join(spath,'courses')

# `entry` (not `file`) so the loop does not overwrite the download response
for entry in os.listdir(courses_root):
    # Skip loose files sitting next to the folders (name contains json/txt)
    if any(ext in entry for ext in ['json','txt']):
        continue
    entry_dir = os.path.join(courses_root,entry)
    # Skip anything else that is not a folder (e.g. OS metadata files), which
    # would otherwise make os.listdir raise NotADirectoryError
    if not os.path.isdir(entry_dir):
        continue
    for level in os.listdir(entry_dir):
        level_dir = os.path.join(entry_dir,level)
        for j in os.listdir(level_dir):
            # Read as UTF-8 explicitly rather than the platform default encoding
            with open(os.path.join(level_dir,j),'r',encoding='utf-8') as infile:
                courses = json.load(infile)
            comp_courses.append(pd.DataFrame(courses))
        
        
        

In [None]:
sp_df = pd.concat(comp_courses).reset_index(drop=True)

In [None]:
sp_df['venues_n'] = sp_df['venues'].apply(lambda x: len(x))

In [None]:
sp_df['country'] = [[x['country'] for x in vens][0] for vens in sp_df['venues']] 

In [None]:
print(len(set(sp_df['id'])),'  ',len(set(sp_df['country'])))

In [None]:
#We want to focus on the EU

# The file is read as ', '-separated entries of the form '<key>: <name>'
with open(f"{data_path}/aux/eu_codes_names.txt",'r') as infile:
    eu_27_other_codes = infile.read().split(', ')

# Keep the lower-cased name part of each entry (the text before any comma),
# then add two more countries by hand
eu_27_names = [entry.split(': ')[1].lower().split(',')[0] for entry in eu_27_other_codes]
eu_27_names += ['liechtenstein','macedonia (fyrom)']

# Lower-case the course country so it can be matched against the names above
sp_df['country_lower'] = [country.lower() for country in sp_df['country']]

sp_df_eu = sp_df.loc[sp_df['country_lower'].isin(eu_27_names)]

In [None]:
len(sp_df_eu)

In [None]:
sp_df_eu['country_lower'].value_counts(normalize=True).head()

In [None]:
100*sp_df_eu['level'].value_counts(normalize=True)

In [None]:
#How do we select
#Note that there are some duplicated courses because they are assigned multiple disciplines
country_discipline_activity = sp_df_eu.groupby(['country','level','discipline_title']).size().reset_index(name='course_n')

In [None]:
#Plot
# Courses by country (rows) and discipline (columns), sized by count and
# coloured by level. Each channel now uses its matching class (alt.X for x,
# alt.Y for y); the original had the two swapped.
country_chart = (alt.Chart(country_discipline_activity)
                 .mark_point(filled=False,shape='square')
                 .encode(x=alt.X('discipline_title',title='Discipline',
                                 sort=alt.EncodingSortField('course_n','sum',order='descending')),
                         y=alt.Y('country',title='Country',
                                 sort=alt.EncodingSortField('course_n','sum',order='descending')),
                         size=alt.Size('course_n',title='Number of courses'),
                         color=alt.Color('level',title='Level',
                                         scale=alt.Scale(scheme='Dark2')))).properties(width=275)

country_chart

In [None]:
country_discipline = country_discipline_activity.groupby(
    ['country','discipline_title'])['course_n'].sum().reset_index(drop=False)

In [None]:
# Stacked horizontal bars: courses per country, split by discipline.
# The y channel now uses alt.Y; the original passed an alt.X object to y.
country_levels = (alt.Chart(country_discipline)
                 .mark_bar()
                 .encode(y=alt.Y('country',sort=alt.EncodingSortField('course_n',order='descending')),
                         x=alt.X('course_n'),
                         color=alt.Color('discipline_title',
                                         sort=alt.EncodingSortField('course_n',order='descending'),
                                         scale=alt.Scale(scheme='category20')),
                         order=alt.Order('course_n',sort='descending'))).properties(height=450,width=200)

save_altair_(country_levels,"fig_15_country_courses")

country_levels

In [None]:
discipline_level = sp_df_eu.groupby(['discipline_title','level']).size().reset_index(name='course_n')

d = (alt.Chart(discipline_level)
     .mark_bar()
     .encode(y=alt.Y('discipline_title',sort=alt.EncodingSortField('course_n',order='descending'),title='Discipline'),
             x=alt.X('course_n',title='Number of courses'),
             color=alt.Color('level',title='Level',sort=alt.EncodingSortField('course_n')))).properties(width=250)

save_altair_(d,"fig_16_discipline_level")

d

### Save all the data for the analytical synthesis

In [None]:
# Reshape the three web indicators to a common (country, year, variable, value)
# layout and stack them.

# Meetup groups created per country code and year (not cumulative here).
# NOTE(review): gs['country'] is used as-is, whereas get_country_name
# lower-cases it before the lookup - confirm these codes have the same case as
# the GitHub and PyPI country codes below.
meetup_long = pd.crosstab(
    gs['country'],gs['year']).reset_index(drop=False).melt(id_vars=['country'],
                                                           value_name='tech_meetups')
# GitHub registrations per year and country code, for every column of github_wide
github_long = github_wide.reset_index(drop=False).melt(
    id_vars='year_created',value_name='github_users').rename(
    columns={'year_created':'year','country_code':'country'})

# Average daily downloads for the countries in the lookup.
# NOTE(review): the rows are labelled year=2018 although the download queries
# above cover 1-6 April 2020 - confirm this label is intended.
python_long = py_downloads.loc[py_downloads['index'].isin(country_codes)][['index','all_files']].rename(columns={
    'index':'country','all_files':'python_downloads'}).assign(year=2018)

# Melt each frame on (country, year) so the indicator name ends up in `variable`
out = pd.concat([x.melt(id_vars=['country','year']) for x in [meetup_long,github_long,python_long]])


In [None]:
out.to_csv(f"{project_dir}/data/processed/web_indicators.csv",index=False)