# Notebook Template

This is a template notebook containing only a preamble with a few useful imports. Add or delete lines as needed to set up the notebook for your own project!

## Preamble

In [None]:
%load_ext autoreload
%autoreload 2
# install im_tutorial package
!pip install git+https://github.com/nestauk/im_tutorials.git

In [None]:
# useful Python tools
import ast
import collections
import itertools

# matplotlib for static plots
import matplotlib.pyplot as plt
# networkx for networks
import networkx as nx
# numpy for mathematical functions
import numpy as np
# pandas for handling tabular data
import pandas as pd
# seaborn for pretty statistical plots
import seaborn as sns

# basic bokeh imports for an interactive scatter plot or line chart
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Circle, Line

# NB: If using Google Colab, output_notebook() must be called at
# the end of every cell in which you want to display a bokeh plot.
# If using Jupyter, it only needs to be called once, at the
# start of the notebook.
output_notebook()

from im_tutorials.data import datasets

In [None]:
datasets.gateway_to_research_projects??

In [None]:
def double_eval(x):
    '''double_eval
    Evaluate a doubly-encoded Python literal.

    Applies `ast.literal_eval` twice: the first pass must yield a string,
    and the second pass evaluates that string into the final Python object
    (e.g. '"[1, 2]"' -> '[1, 2]' -> [1, 2]).

    Args:
        x (`str`): String containing the quoted source of a Python literal.
    Returns:
        The Python object produced by the second evaluation.
    Raises:
        ValueError, SyntaxError: If either pass is given something that is
            not a valid Python literal.
    '''
    return ast.literal_eval(ast.literal_eval(x))

def arxiv_papers(year=2017):
    '''arxiv_papers
    Get arXiv papers csv for a single year and return as dataframe.

    The csv is streamed from the `innovation-mapping-tutorials` S3 bucket
    over HTTPS. While parsing, the first column is used as the index, the
    `authors` column is decoded with `double_eval`, and `created` is parsed
    as dates. After parsing, a `year_created` column is derived from
    `created`, and `category_ids` is split on commas into lists.

    Args:
        year (`int`): Year of the dataset.
    Returns:
        arxiv_df (`pd.DataFrame`): Parsed dataframe of arXiv papers.
    '''
    bucket = 'innovation-mapping-tutorials'
    arxiv_key = f'arxiv_{year}/arxiv_{year}.csv'
    url = f'https://s3.us-east-2.amazonaws.com/{bucket}/{arxiv_key}'

    # Open the remote file in a `with` block so the underlying stream is
    # closed once pandas has finished reading (pandas does not close handles
    # that it did not open itself).
    # NOTE(review): newer smart_open releases deprecate `smart_open.smart_open`
    # in favour of `smart_open.open` -- confirm the installed version before
    # switching.
    with smart_open.smart_open(url) as csv_file:
        arxiv_df = pd.read_csv(
            csv_file,
            index_col=0,
            converters={
                'authors': double_eval,
            },
            parse_dates=['created'],
        )

    arxiv_df['year_created'] = arxiv_df['created'].dt.year
    arxiv_df['category_ids'] = arxiv_df['category_ids'].str.split(',')
    return arxiv_df

In [None]:
import smart_open

In [None]:
# Load the arXiv papers dataset for the default year (2017).
df = arxiv_papers()

In [None]:
df.head()

In [None]:
df.shape

## Import Data

In [None]:
# The im_tutorials datasets module can be used to easily load datasets.
# For example, to load Gateway to Research projects:
# gtr_projects_df = datasets.gateway_to_research_projects()