# Processing GDB

The grants database is fairly harmonised, but there are some 

## Preamble

In [3]:
%load_ext autoreload
%autoreload 2

In [188]:
#Additional imports
import os
import ratelim
import re
import io
import urllib
import codecs
import bs4
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import OrderedDict

from datetime import datetime
from nltk.corpus import stopwords

from analysis.src.nlp.lda_pipeline import LdaPipeline, CleanTokenize
from analysis.src.data.readnwrite import get_data_dir
from analysis.src.data.data_utilities import dataframe_health_report

stop = stopwords.words('English')

In [275]:
%matplotlib inline
# Open a standard set of directories

# Paths

# Get the top path
data_path = get_data_dir()
# Create the path for external data
ext_data = os.path.join(data_path, 'external')
# Raw data
raw_data = os.path.join(data_path, 'raw')
# And external data
proc_data = os.path.join(data_path, 'processed')
# And interim data
inter_data = os.path.join(data_path, 'interim')
# And figures
fig_path = os.path.join(data_path, 'figures')

# Get date for saving files
today = datetime.utcnow()

today_str = "_".join([str(x) for x in [today.year, today.month, today.day]])

In [283]:
gdb_df = pd.read_csv(os.path.join(raw_data, 'gdb.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


In [284]:
gdb_df.head(2)

Unnamed: 0,gh_user_id,gdb_dataset_id,source_id,start_date,status,gh_user_creation_date,description,funding,row_id,currency,...,cb_category_list,cb_role,end_date,gh_valid_description,name,administrative_area_level_1,administrative_area_level_2,lng,lat,country
0,,gtr,GDB,2009-04-01 00:00:00,Closed,,The past decade has seen a renaissance in acce...,2313330.0,http://gtr.rcuk.ac.uk:80/gtr/api/projects/00C9...,GBP,...,,,,,University College London,England,Greater London,-0.132718,51.524469,United Kingdom
1,,gtr,GDB,2009-10-01 00:00:00,Closed,,Humans and animals are made up of millions of ...,1520860.0,http://gtr.rcuk.ac.uk:80/gtr/api/projects/00E4...,GBP,...,,,,,University College London,England,Greater London,-0.132718,51.524469,United Kingdom


In [195]:
health_gdb_df = dataframe_health_report(gdb_df, norm=True)

In [196]:
health_gdb_df

Unnamed: 0,dtype,non_null_count,null_count,NaN_count,unique_values,modal_value,modal_value_count,string_length_min,string_length_max,string_length_median,string_empty_count
gh_user_id,float64,0.098745,0.901255,0.901255,13827,1.19786e+07,0.000334035,,,,
gdb_dataset_id,object,0.672986,0.327014,0.327014,7,gtr,0.184033,3.0,14.0,4.0,0.0
source_id,object,1.0,0.0,0.0,3,GDB,0.672986,3.0,10.0,3.0,0.0
start_date,object,0.84556,0.15444,0.15444,22294,2017-01-01,0.0138522,8.0,23.0,13.0,0.0
status,object,0.566265,0.433735,0.433735,14,operating,0.187967,3.0,17.0,6.0,0.0
gh_user_creation_date,object,0.098745,0.901255,0.901255,13827,2016-03-09 09:02:48 UTC,0.000334035,23.0,23.0,23.0,0.0
description,object,0.999993,6.81705e-06,6.81705e-06,102186,The Innovative Manufacturing and Construction ...,0.000920302,6.0,9811.0,1745.0,0.0
funding,object,0.679046,0.320954,0.320954,54004,50000,0.00508552,1.0,15.0,8.0,0.0
row_id,object,1.0,0.0,0.0,117285,http://gtr.rcuk.ac.uk:80/gtr/api/projects/FDEB...,0.000920302,6.0,78.0,15.0,0.0
currency,object,0.82247,0.17753,0.17753,3,USD,0.484474,3.0,3.0,3.0,0.0


## 1 Normalising

### 1.1 Dates

In [263]:
dates = []
for s in gdb_df['start_date']:
    if pd.isnull(s):
        dates.append(np.nan)
    else:
        dates.append(s.replace('"', '').replace('=', ''))
dates = pd.to_datetime(dates, infer_datetime_format=True, errors='coerce')

In [266]:
years = [d.year for d in dates]

In [270]:
gdb_dates_df = pd.DataFrame({'date': dates, 'year': years})

In [272]:
dataframe_health_report(gdb_dates_df)

Unnamed: 0,dtype,non_null_count,null_count,NaN_count,unique_values,modal_value,modal_value_count,+inf_count,-inf_count,min,max,zeros_count,mean,25%,50%,75%
date,datetime64[ns],124029,22662,22662,18894,,0,,,,,,,,,
year,float64,124029,22662,22662,211,2015.0,17534,0.0,0.0,1744.0,2019.0,0.0,2010.7,2009.0,2013.0,2016.0


In [271]:
gdb_dates_df.to_csv(os.path.join(inter_data, 'gdb_dates.csv'), index=False)

### 1.2 Funding Currency

TODO: use `forex_python`

In [269]:
gdb_df['currency'].value_counts()

USD    71068
GBP    32060
EUR    17521
Name: currency, dtype: int64

## 2. Export

In [289]:
gdb_doc_ids = gdb_df.index.values
gdb_doc_id_df = pd.DataFrame({'doc_id': gdb_doc_ids})

In [290]:
gdb_doc_id_df.to_csv(os.path.join(inter_data, 'gdb_doc_ids.csv'),index=False)