# Processing GDB

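The grants database is fairly harmonised, but there are some fields that still need normalising before analysis: the column names, the start dates and the funding currency.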

## Preamble

In [None]:
# Load IPython's autoreload extension and select mode 2, which re-imports all
# modules before each cell is executed, so edits to imported project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#Additional imports
import os
import ratelim
import re
import io
import urllib
import codecs
import bs4
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import OrderedDict

from datetime import datetime
from nltk.corpus import stopwords

from analysis.src.nlp.lda_pipeline import LdaPipeline, CleanTokenize
from analysis.src.data.readnwrite import get_data_dir
from analysis.src.data.data_utilities import dataframe_health_report

# English stop-word list from NLTK. The corpus file id is lower case
# ('english'); the capitalised spelling only resolves on case-insensitive
# filesystems and fails to find the file elsewhere.
stop = stopwords.words('english')

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Standard project directories, all resolved from the top-level data directory

# Top-level data directory
data_path = get_data_dir()

# Sub-directories, in order: external data, raw data, processed data,
# interim data and figures
ext_data, raw_data, proc_data, inter_data, fig_path = (
    os.path.join(data_path, subdir)
    for subdir in ('external', 'raw', 'processed', 'interim', 'figures')
)

# Date stamp for saved files, formatted day_month_year without zero padding
today = datetime.today()
today_str = "{}_{}_{}".format(today.day, today.month, today.year)

In [None]:
# Load the raw grants database (GDB) export. The path is built with
# os.path.join rather than string concatenation so it uses the platform's
# separator and matches how the other paths in this notebook are built.
gdb_df = pd.read_csv(os.path.join(raw_data, 'gdb.csv'))

In [None]:
# Preview the first two rows of the raw table
gdb_df.head(2)

In [None]:
# Run the project's dataframe health report on the raw table and keep the result.
# NOTE(review): `norm=True` presumably returns normalised figures (e.g.
# proportions) rather than raw counts - confirm in dataframe_health_report.
health_gdb_df = dataframe_health_report(gdb_df, norm=True)

In [None]:
# Display the health report for the raw table
health_gdb_df

## 1 Normalising

### 1.1 Column Names

In [None]:
# Normalise column names to lower case with underscores in place of spaces
gdb_df.columns = [
    column_name.lower().replace(' ', '_') for column_name in gdb_df.columns
]

### 1.2 Dates

In [None]:
# Remove the '"' and '=' characters that wrap the raw start-date strings,
# leaving missing entries as NaN so they stay aligned with the source rows.
_strip_wrapping = str.maketrans('', '', '"=')
cleaned_start_dates = [
    np.nan if pd.isnull(raw_value) else raw_value.translate(_strip_wrapping)
    for raw_value in gdb_df['start_date']
]

# Parse to datetimes; values that cannot be parsed are coerced to NaT
# instead of raising.
dates = pd.to_datetime(
    cleaned_start_dates,
    infer_datetime_format=True,
    errors='coerce',
)

In [None]:
# Year component of each parsed start date, in the same order as `dates`
years = [parsed_date.year for parsed_date in dates]

In [None]:
# Two-column table pairing each parsed start date with its year
date_columns = {'date': dates, 'year': years}
gdb_dates_df = pd.DataFrame(date_columns)

In [None]:
# Run the health report on the date/year table; the result is only
# displayed as the cell output, not stored.
dataframe_health_report(gdb_dates_df)

In [None]:
# Save the date/year table to the interim data directory, without the row index
gdb_dates_path = os.path.join(inter_data, 'gdb_dates.csv')
gdb_dates_df.to_csv(gdb_dates_path, index=False)

### 1.3 Funding Currency

TODO: use `forex_python` to convert funding amounts into a single common currency.

In [None]:
# Count the rows for each distinct value in the `currency` column
gdb_df['currency'].value_counts()