SDG Labelling Data Preparation
==============================

Cleaning of data scraped from [Partnerships for the SDGs](https://sustainabledevelopment.un.org/partnership/browse/) and [RELX Group SDG Resource Centre](https://sdgresources.relx.com/articles).

In [None]:
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
import ast
import json
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import defaultdict, Counter
from datetime import datetime, date

from analysis.src.data.readnwrite import get_data_dir
from analysis.src.data.data_utilities import flatten, eval_column, grouper

pd.options.display.max_columns = 99

In [None]:
%matplotlib inline

# Paths
# Root of the data directory tree
data_path = get_data_dir()

# Sub-directories of the data root: external, raw, processed and interim
# data, plus figures
ext_data, raw_data, proc_data, inter_data, fig_path = (
    os.path.join(data_path, sub_dir)
    for sub_dir in ('external', 'raw', 'processed', 'interim', 'figures')
)

# Date stamp for naming saved files: UTC year_month_day, not zero-padded
today = datetime.utcnow()
today_str = "_".join(map(str, (today.year, today.month, today.day)))

## 1. Load Data

We have one raw dataset to load for each of the two scraped sites.

In [None]:
# Locations of the two scraped raw datasets
partnership_csv = os.path.join(raw_data, 'sdg_partnership_projects_scraped.csv')
relx_json = os.path.join(raw_data, 'sdg_relx_articles.json')

# NOTE: the `partnernship_df` spelling is kept because later cells use it
partnernship_df = pd.read_csv(partnership_csv)
relx_df = pd.read_json(relx_json)

In [None]:
partnernship_df.head(2)

In [None]:
relx_df.head(2)

## 2. Cleaning

### 2.1 Partnership Data

#### Goals

In [None]:
goals_partner = eval_column(partnernship_df, 'goals')

In [None]:
# One 17-slot int8 indicator row per partnership project: slot k-1 is set
# to 1 for every goal number k listed for that project.
goals_binary_partner = []
for project_goals in goals_partner:
    indicator = np.zeros(17, dtype='int8')
    for goal in project_goals:
        indicator[int(goal) - 1] = 1
    goals_binary_partner.append(indicator)

# Stack the rows into a frame with columns goal_1 ... goal_17
ohe_goals_partner = pd.DataFrame(goals_binary_partner)
ohe_goals_partner.columns = ['goal_{}'.format(number) for number in range(1, 18)]

In [None]:
# Preview the one-hot encoded partnership goals (`ohe` was never defined)
ohe_goals_partner.head(1)

#### Content

In [None]:
content_partnership = list(partnernship_df['content'].values)

In [None]:
content_partnership[0]


Looks like the main text cleaning needed is replacing new lines, tabs and carriage returns, then collapsing the repeated spaces that leaves behind.

In [None]:
# Treat newlines, tabs and carriage returns as spaces, squeeze every run of
# such characters down to a single space, and trim both ends of each text.
content_partnership = [
    re.sub(r'[ \n\t\r]+', ' ', text).strip()
    for text in content_partnership
]

#### Date

In [None]:
tf = partnernship_df['timeframe'].values[0]

In [None]:
# NOTE: the malformed date '1/2///2/0/1/9' this parser used to produce came
# from rebuilding `start` by joining the *characters* of `end`
# ('/'.join('12/2019')). That is fixed below, but the parser has not been
# validated against every scraped timeframe.

def _expand_two_digit_year(value):
    """Prefix '20' to a two-character final component of a '/'-separated date."""
    if '/' not in value:
        return value
    parts = value.split('/')
    if len(parts[-1]) == 2:
        parts[-1] = '20' + parts[-1]
    return '/'.join(parts)


def parse_timeframe(tf):
    """Split a timeframe string into a start date and an end date.

    The string is expected to look like ``'<label>: <start> - <end>'``: it is
    split once on ``' - '`` and the start is whatever follows the first
    ``': '`` of the left-hand part.

    Args:
        tf: Timeframe text, e.g. ``'Timeline: 1/2/19 - ongoing'``.

    Returns:
        A ``(start, end)`` tuple of ``datetime.date`` objects. An end of
        ``'ongoing'`` or ``'-'`` is returned as 2030-01-01.

    Raises:
        ValueError: If ``tf`` does not split into exactly two parts on
            ``' - '``, or if pandas cannot parse one of the dates.
        IndexError: If the left-hand part contains no ``': '``.
    """
    start, end = tf.split(' - ')
    start = start.split(': ')[1]

    # Ensure a space after every comma before handing the text to pandas
    start = start.replace(',', ', ')
    end = end.replace(',', ', ')

    start = _expand_two_digit_year(start)
    end = _expand_two_digit_year(end)

    if end in ('ongoing', '-'):
        # Open-ended timeframes get a fixed placeholder end date
        # (presumably the SDG target year -- TODO confirm).
        end = date(year=2030, month=1, day=1)
    else:
        end = pd.to_datetime(end).date()
    start = pd.to_datetime(start).date()
    return start, end

### 2.2 RELX Data

#### Goals

In [None]:
goals_relx = relx_df['sdg_goals'].values

In [None]:
# One 17-slot int8 indicator row per RELX article: slot k-1 is set to 1 for
# every goal number k listed for that article.
goals_binary_relx = []
for article_goals in goals_relx:
    indicator = np.zeros(17, dtype='int8')
    for goal in article_goals:
        indicator[int(goal) - 1] = 1
    goals_binary_relx.append(indicator)

# Stack the rows into a frame with columns goal_1 ... goal_17
ohe_goals_relx = pd.DataFrame(goals_binary_relx)
ohe_goals_relx.columns = ['goal_{}'.format(number) for number in range(1, 18)]

In [None]:
ohe_goals_relx.head(1)

#### Content

In [None]:
content_relx = relx_df['content'].values
content_relx[0]

Looks like there are a fair number of special (non-ASCII) characters here. Let's replace them with spaces.

In [None]:
# Swap every non-ASCII character for a space, then squeeze runs of spaces
# down to one and trim both ends of each article.
non_ascii = re.compile(r'[^\x00-\x7f]')
space_run = re.compile(' +')
content_relx = [
    space_run.sub(' ', non_ascii.sub(r' ', article)).strip()
    for article in content_relx
]

## 3. Joining and Exporting

In [None]:
# For each source: pair the cleaned text with a constant source label, then
# attach that source's one-hot goal columns by row index.
partner_clean_df = pd.DataFrame(
    {'content': content_partnership, 'source': 'un_sdg_partnerships'}
).join(ohe_goals_partner)

relx_clean_df = pd.DataFrame(
    {'content': content_relx, 'source': 'relx'}
).join(ohe_goals_relx)

In [None]:
# Stack both sources into one frame. Each input has its own index starting
# at 0, so renumber the rows to avoid duplicate index labels in the result.
clean_df = pd.concat([partner_clean_df, relx_clean_df], ignore_index=True)

In [None]:
print("Number of projects:", len(clean_df))

In [None]:
# Report how many rows are tagged with each goal (sum of each 0/1 column)
print("Number of projects for each goal:")
goal_columns = [col for col in clean_df.columns if 'goal_' in col]
for col in goal_columns:
    print('{:7} {:>5}'.format(col, sum(clean_df[col])))

In [None]:
clean_df.to_csv(os.path.join(inter_data, 'sdg_projects_and_goals.csv'), index=False)