# Load MAG data

This is the arXiv corpus after it has been matched with GRID.

We want to use it in an institutional analysis of top research trends.

## Preamble

In [None]:
%run notebook_preamble.ipy

In [None]:
def flatten_list(my_list):
    '''
    Return a single list holding the items of every element of my_list.

    Only one level of nesting is removed, so each element of my_list must
    itself be iterable.
    '''
    flat = []
    for inner in my_list:
        flat.extend(inner)
    return flat

## Load data

In [None]:
from data_getters.inspector import get_schemas
# Path to the database config file; reused by the other data getters in this notebook
config_path = '../mysqldb_team.config'

# presumably a mapping of dataset name -> schema; only its keys are used here
schemas = get_schemas(conf_path=config_path)
# Show the available datasets (the keys of the returned schemas)
print(schemas.keys())

In [None]:
import pandas as pd
from data_getters.core import get_engine

In [None]:
# Database handle built from the config file; passed to pd.read_sql_table below
con = get_engine(config_path=config_path)

In [None]:
# Read the 'arxiv_articles' table lazily, 1000 rows at a time, rather than in one go
chunks = pd.read_sql_table('arxiv_articles', con , chunksize=1000)

# Collects the chunk DataFrames until they are concatenated into arxiv_df
df_container = []

In [None]:
# Drain the chunk iterator, keeping every chunk for the concatenation step
df_container.extend(chunks)

In [None]:
# Stack all chunks into one DataFrame (the index is rebuilt in the next cell)
arxiv_df = pd.concat(df_container)

In [None]:
# Replace the index with a fresh 0..n-1 range, in place.
# NOTE(review): drop=False keeps the pre-concat index as an extra column, which
# then travels into the merged frame and the saved CSV; confirm this is wanted,
# otherwise use drop=True.
arxiv_df.reset_index(drop=False,inplace=True)

In [None]:
# Publication year, taken from the `.year` attribute of each 'created' value
arxiv_df['year'] = arxiv_df['created'].map(lambda stamp: stamp.year)

In [None]:
# Sanity check: (rows, columns) of the full table after loading
arxiv_df.shape

## Some data processing

#### Extract affiliations from the author dict

In [None]:
# First remove papers without MAG metadata, i.e. rows whose 'mag_authors' is
# not a list.
# Take an explicit copy: columns are added to arxiv_df_2 further down, and
# writing to a slice of arxiv_df would trigger pandas' SettingWithCopyWarning.
has_mag_authors = [isinstance(x, list) for x in arxiv_df['mag_authors']]
arxiv_df_2 = arxiv_df.loc[has_mag_authors].copy()

In [None]:
# Sanity check: (rows, columns) left after dropping papers without MAG authors
arxiv_df_2.shape

In [None]:
# Peek at the author records of the first remaining paper.
# Positional access is required: the row labelled 0 may have been removed by
# the MAG-metadata filter, in which case a label lookup ([0]) raises KeyError.
arxiv_df_2['mag_authors'].iloc[0]

In [None]:
# Now build, for each paper, one list per author field ('<field>_list').
# Authors whose record lacks the field contribute NaN, so every list has one
# entry per author.
for field in ('author_name', 'author_affiliation'):
    arxiv_df_2[field + '_list'] = [
        [person.get(field, np.nan) for person in authors]
        for authors in arxiv_df_2['mag_authors']
    ]



In [None]:
# Eyeball the first rows, including the new '<field>_list' columns
arxiv_df_2.head()

In [None]:
# Sanity check: (rows, columns) after adding the author list columns
arxiv_df_2.shape

#### I want to label these papers with AI ids

I will use the papers identified by Kostas in his Women in AI analysis

In [None]:
# Externally labelled paper ids. 'paper_id' is forced to str
# (presumably to match the type of the arXiv 'id' column used in the merge below - verify)
ai_ids = pd.read_csv('../data/external/dl_paper_ids.csv',dtype={'paper_id':str})

In [None]:
# Unique labelled paper ids.
# NOTE(review): not referenced again in the visible part of this notebook
ai_labelled = set(ai_ids['paper_id'])

In [None]:
# Keep only the papers that carry an AI label (positive or negative): the merge
# is pandas' default inner join, so papers absent from ai_ids are dropped.
ai_columns = ['paper_id', 'terms', 'is_AI', 'number_of_terms']
arxiv_df_3 = arxiv_df_2.merge(ai_ids[ai_columns], left_on='id', right_on='paper_id')

In [None]:
# Lower-case the label column name ('is_AI' -> 'is_ai'), in place
arxiv_df_3.rename(columns={'is_AI':'is_ai'},inplace=True)

Quick check of institutions

In [None]:
# Save the labelled arXiv/MAG table, stamped with today_str
# (today_str is not defined in the visible cells; presumably set by the preamble).
# NOTE(review): the file is zip-compressed although its name ends in .csv, so
# readers must pass compression='zip' explicitly; pandas cannot infer it from the
# extension. The DataFrame index is also written as the first column.
arxiv_df_3.to_csv(f'../data/external/{today_str}_arxiv_mag.csv',compression='zip')

### MAG-GRID data

I am also downloading this data for the geographical analysis of arXiv. As a bonus, the data comes with the GRID id, so we can use that for the institutional analysis.

In [None]:
from data_getters.arxiv_grid import get_arxiv_grid
#df = get_arxiv_grid_deep_change(config_path)

In [None]:
#Run the query for the arXiv-GRID matches
# (all_articles=True presumably returns every article rather than a subset - verify against get_arxiv_grid)
df = get_arxiv_grid(conf_path=config_path,all_articles=True)

In [None]:
# Sanity check: (rows, columns) of the query result
df.shape

In [None]:
#Save the query result, stamped with today_str
# NOTE(review): the file is zip-compressed although its name ends in .csv, so
# readers must pass compression='zip' explicitly; pandas cannot infer it from the
# extension. The DataFrame index is also written as the first column.
df.to_csv(f'../data/external/{today_str}_paper_institute_locations.csv',compression='zip')

### Some small explorations

In [None]:
# Number of papers per year for 1986-2018, as a one-column DataFrame indexed by year.
# reindex is used instead of .loc with a list of labels: .loc raises KeyError
# when any requested year has no papers, whereas reindex fills those years with 0.
# The explicit rename keeps the count column called 'year' (the name the
# growth-rate cell reads), whatever name value_counts gives its result.
arxiv_year_counts = (
    arxiv_df['year']
    .value_counts()
    .reindex(np.arange(1986, 2019), fill_value=0)
    .rename('year')
    .to_frame()
)

In [None]:
# Year-on-year growth (in %) of the yearly paper counts.
# The first year has no predecessor, and a year preceded by a zero count has no
# defined growth rate; both are recorded as NaN.
yearly_counts = arxiv_year_counts['year']
previous_counts = yearly_counts.shift(1)
# Mask zero denominators so the division yields NaN rather than inf
previous_counts = previous_counts.where(previous_counts != 0)
growth_rate = (100 * ((yearly_counts / previous_counts) - 1)).tolist()

arxiv_year_counts['growth_rate']=growth_rate

In [None]:
%matplotlib inline

# Plot the 3-year rolling mean of the growth rate for 2000-2018
recent_growth = arxiv_year_counts['growth_rate'].loc[np.arange(2000,2019)]
recent_growth.rolling(window=3).mean().plot()