# RWJF open data analysis

We have access to three datasets about the projects that RWJF supports:

1. Pioneers dataset: Information about grants awarded as part of the Pioneers programme, which focuses on innovations in the USA
2. Global dataset: Grants awarded as part of the Global programme, which focuses on innovations outside the USA
3. Open dataset: With information about all their grants

1 and 2 are relatively unstructured but contain rich text, whereas 3 is well structured but doesn't have a lot of text.

We want to rapidly process these data and analyse them to understand: 

* What is RWJF's funding portfolio: what topics are they supporting? Where?
* How can we enrich these data with additional information (for example from GRID or CrunchBase) to map collaboration networks?



## Preamble

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#Additional imports
import os
import ratelim
import re
import io
import urllib
import codecs
import bs4
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
from nltk.corpus import stopwords

from analysis.src.nlp.lda_pipeline import LdaPipeline, CleanTokenize
from analysis.src.data.readnwrite import get_data_dir

# NLTK stores each stopword list under a lowercase file id ('english'); the
# capitalised spelling only resolves on case-insensitive filesystems.
stop = stopwords.words('english')

In [None]:
%matplotlib inline
# Standard project directories

# Root of the data folder; every other location hangs off it
data_path = get_data_dir()

# Sub-folders for external, raw, processed and interim data, plus figures
ext_data, raw_data, proc_data, inter_data, fig_path = (
    os.path.join(data_path, folder)
    for folder in ('external', 'raw', 'processed', 'interim', 'figures')
)

# Date stamp (day_month_year, no zero padding) used when saving files
today = datetime.today()
today_str = "_".join(map(str, (today.day, today.month, today.year)))

## 1. Load data

### 1.1 Load the Global and Pioneers data

In [None]:
def get_project_meta(project):
    '''
    Extract the name and grant id from the raw text of a single project.

    Returns a two-element list [name, grant_id]. When the text contains no
    'ID' marker the grant id is np.nan and the name is the first line.
    '''
    if 'ID' not in project:
        # No ID marker: the name is the first line of the stripped text
        first_line = project.strip().split('\n')[0]
        return [first_line.strip(), np.nan]

    # Everything before the first 'ID' marker is the name; the id sits on the
    # rest of that line, between the first marker and any later one
    pieces = project.split('ID')
    name = pieces[0].strip()
    id_line = pieces[1].split('\n')[0].strip()
    grant_id = re.sub('[#:]', '', id_line).strip()
    return [name, grant_id]

def flatten_list(my_list):
    '''
    Flatten one level of nesting: concatenate the items of each inner
    iterable, in order, into a single new list.
    '''
    flat = []
    for inner in my_list:
        flat.extend(inner)
    return flat

In [None]:
def read_rwjf_data(file):
    '''
    Read a raw RWJF project list and tidy it up.

    Args:
        file: name of a text file inside the raw data directory.

    Returns:
        A list with one [name, grant_id, description] entry per project, where
        the description is the project text lowercased with line breaks and
        bullet markers (an asterisk followed by a space) removed.
    '''
    # Build the path with os.path.join like the rest of the file, and decode
    # explicitly so the result does not depend on the platform default.
    # NOTE(review): assumes the raw exports are UTF-8 - confirm.
    with open(os.path.join(raw_data, file), 'r', encoding='utf-8') as myfile:
        data = myfile.read()

    # Split on the project separator and drop the first chunk (links at the top)
    projects = data.split('\n________________\n')[1:]

    records = []
    for project in projects:
        name, grant_id = get_project_meta(project)
        # Raw string for the pattern: a backslash-asterisk in a normal literal
        # is an invalid escape sequence and triggers a warning
        description = re.sub(r'\* ', '', re.sub('\n', '', project)).lower()
        records.append([name, grant_id, description])
    return records

In [None]:
# Parse the Pioneers and Global grantee lists from the raw data folder
pio = read_rwjf_data(file='pioneer_grantees.txt')
glob = read_rwjf_data(file='global_grantees.txt')

In [None]:
# Stack both project lists into one table, tagging each row with the
# programme it came from
rw_df = pd.DataFrame(
    [record + [programme]
     for programme, records in (('pioneers', pio), ('global', glob))
     for record in records],
    columns=['project', 'code', 'description', 'source_id'],
)

# Keep an interim copy on disk
rw_df.to_csv(os.path.join(inter_data, 'rwjf_pioneer_and_global_projects.csv'), index=False)

rw_df.head()

### 1.2 Load the RWJF open grant data

In [None]:
# Load the open grants table; os.path.join keeps path handling consistent
# with the other paths built in this file
grant_data = pd.read_csv(os.path.join(raw_data, 'rwjf_grants.csv'))

In [None]:
# Number of rows in the open grants table
grant_data.shape[0]

In [None]:
# Preview the first five rows of the open grants table
grant_data.head(n=5)

In [None]:
#Unfortunately they don't have the grant ids in their open dataset!

# Ten most frequent non-stopword tokens across the Pioneers descriptions
pd.Series(
    [token for record in pio for token in record[2].split(' ') if token not in stop]
).value_counts()[:10]
