## StudyPortals

### Preamble

In [None]:
%run ../notebook_preamble.ipy

import os
import requests
from zipfile import ZipFile
from io import BytesIO
import altair as alt

# Directory that the StudyPortals course archive is extracted into.
# NOTE(review): `project_dir` is not defined in this notebook; presumably it is
# set by notebook_preamble.ipy — confirm.
spath = f"{project_dir}/data/raw/studyportals"

In [None]:
# Create the raw-data directory, including any missing parent directories.
# exist_ok makes this a no-op when the directory is already there.
os.makedirs(spath, exist_ok=True)

In [None]:
# Download the zipped course data from the project's GitHub repository.
# The timeout stops the cell from hanging indefinitely on a stalled connection.
file = requests.get(
    "https://github.com/nestauk/eis/blob/3_studydata/data/raw/courses.zip?raw=true",
    timeout=60,
)
# Fail here on an HTTP error instead of later, when the error body would be
# handed to ZipFile as if it were the archive.
file.raise_for_status()

In [None]:
# Unpack the downloaded archive (held in memory) into the raw-data directory.
# The context manager closes the archive once extraction has finished.
with ZipFile(BytesIO(file.content)) as archive:
    archive.extractall(spath)

In [None]:
# Load one course file as a sample of the raw JSON structure.
sample_path = spath + '/courses/108/bachelor/108-bachelor-1000.json'
with open(sample_path) as sample_file:
    test = json.load(sample_file)

In [None]:
# Read every course file found under courses/<directory>/<level>/ into its own
# DataFrame and collect the frames in comp_courses.
comp_courses = []

courses_root = spath + '/courses'

# Sorted listings make the load order (and therefore the row order of the
# combined table) reproducible; os.listdir returns entries in arbitrary order.
for entry in sorted(os.listdir(courses_root)):
    entry_path = f'{courses_root}/{entry}'
    # Only directories are descended into. Testing the filesystem directly is
    # safer than guessing from the name whether an entry is a loose file.
    if not os.path.isdir(entry_path):
        continue
    for level in sorted(os.listdir(entry_path)):
        level_path = f'{entry_path}/{level}'
        # Skip stray files sitting next to the level directories.
        if not os.path.isdir(level_path):
            continue
        for j in sorted(os.listdir(level_path)):
            with open(f'{level_path}/{j}', 'r') as infile:
                courses = json.load(infile)
            # Build the frame after the file handle has been released.
            comp_courses.append(pd.DataFrame(courses))
        
        
        

In [None]:
# Stack the per-file frames into one table with a fresh 0..n-1 index.
sp_df = pd.concat(comp_courses, ignore_index=True)

In [None]:
# Disabled: de-duplicate courses by id (note that `sp_df_` is not defined anywhere in view).
#sp_df = sp_df_.drop_duplicates('id')

In [None]:
# Length of each course's `venues` entry.
sp_df['venues_n'] = sp_df['venues'].apply(len)

In [None]:
# Country of each course's first venue. Indexing the first venue directly
# avoids building a list of every venue's country only to keep one of them.
# A course with an empty venue list gets None instead of raising IndexError,
# so later string operations on this column must tolerate missing values.
sp_df['country'] = [
    vens[0]['country'] if vens else None
    for vens in sp_df['venues']
]

In [None]:
# Display the column labels of the combined course table.
sp_df.columns

In [None]:
# We want to focus on the EU.

# The lookup file is read as one ', '-separated list of '<code>: <name>'
# entries (presumably country codes and names — confirm against the file).
# Stripping first keeps a trailing newline out of the last entry.
with open(f"{data_path}/aux/eu_codes_names.txt", 'r') as infile:
    eu_27_other_codes = infile.read().strip().split(', ')

# Keep the lower-cased name part of each entry, dropping anything after a
# comma and any stray whitespace. Two further names are appended by hand.
eu_27_names = [
    entry.split(': ')[1].lower().split(',')[0].strip()
    for entry in eu_27_other_codes
] + ['liechtenstein', 'macedonia (fyrom)']

# Vectorised lower-casing leaves a missing country as missing instead of
# raising AttributeError the way a per-row x.lower() would.
sp_df['country_lower'] = sp_df['country'].str.lower()

# Take an explicit copy so later column assignments on the EU subset do not
# operate on a slice of sp_df.
sp_df_eu = sp_df.loc[sp_df['country_lower'].isin(eu_27_names)].copy()

In [None]:
# Number of distinct (non-missing) country names in the EU subset.
sp_df_eu['country_lower'].nunique()

In [None]:
# How do we select?
# Note: a course assigned to several disciplines appears once per discipline,
# so the row counts below include those duplicates.
country_discipline_activity = (
    sp_df_eu
    .groupby(['country', 'level', 'discipline_title'])
    .size()
    .rename('course_n')
    .reset_index()
)

In [None]:
# Plot: one open square per (discipline, country, level) combination, sized by
# the number of courses and coloured by level.

# Both axes share the same ordering: summed course count, largest first.
by_total_courses = alt.EncodingSortField(field='course_n', op='sum', order='descending')

country_chart = (
    alt.Chart(country_discipline_activity)
    .mark_point(filled=False, shape='square')
    .encode(
        # x is wrapped in alt.X and y in alt.Y so each channel class matches
        # the axis it is bound to.
        x=alt.X('discipline_title', title='Discipline', sort=by_total_courses),
        y=alt.Y('country', title='Country', sort=by_total_courses),
        size=alt.Size('course_n', title='Number of courses'),
        color=alt.Color('level', title='Level', scale=alt.Scale(scheme='Dark2')),
    )
    .properties(width=275)
)
country_chart
                 

In [None]:
# Row counts per discipline and level in the EU subset.
discipline_level = (
    sp_df_eu
    .groupby(['discipline_title', 'level'])
    .size()
    .rename('course_n')
    .reset_index()
)

# Bar chart with disciplines on the y axis and course counts on the x axis,
# coloured by level.
d = (
    alt.Chart(discipline_level)
    .mark_bar()
    .encode(
        y=alt.Y(
            'discipline_title',
            title='Discipline',
            sort=alt.EncodingSortField(field='course_n', order='descending'),
        ),
        x=alt.X('course_n', title='Number of courses'),
        color=alt.Color(
            'level',
            title='Level',
            sort=alt.EncodingSortField(field='course_n'),
        ),
    )
    .properties(width=250)
)

d

In [None]:
# Row counts for each `level` value in the EU subset.
sp_df_eu['level'].value_counts()