# Create feature unions based on previous work (EDA, feature selection, time series)

## 0.0 Libraries

In [3]:
%load_ext autoreload
%autoreload 2

#system
import os
import sys
from os.path import join as pj
module_path = os.path.abspath(pj('..','..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# utils
from src.d00_utils import print_helper_functions as phf

# ipython
import warnings
warnings.simplefilter('ignore')

# executing code
import click
import logging

# type annotations
from typing import List, Set, Dict, Tuple, Optional
from collections.abc import Iterable

# configuring
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import configparser

# feature extractor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline, make_union, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# data
import numpy as np
import pandas as pd
import re

# stats
import statsmodels.api as sm

# viz
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
sns.set(font_scale=1.5)
plt.style.use('bmh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# file handling
import tempfile
import joblib
import botocore
import boto3
from os.path import join as pj
import pickle as pkl
sys.path.append('../..')
from src.d01_data.build_features import FeatureExtractorText, FeatureExtractorOHE, FeatureExtractorNumber, CustomImputer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1.0 Feature Extractor

### 1.1 Load data

In [4]:
# Repository root: two levels above the current working directory (the
# directory the notebook kernel was started in). Used below to build data paths.
project_dir = Path().resolve().parents[1]
print(project_dir)

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables
_ = load_dotenv(find_dotenv())

# The data sub-directory names are read from the environment. Index os.environ
# directly so that a missing variable raises a KeyError naming it, rather than
# os.environ.get() returning None and pj() failing with an opaque TypeError.
raw_dir = pj(project_dir, 'data', os.environ['RAW_DIR'])
interim_dir = pj(project_dir, 'data', os.environ['INTERIM_DIR'])
processed_dir = pj(project_dir, 'data', os.environ['PROCESSED_DIR'])

/Users/marclawson/repositories/grantnav_10k_predictor


In [5]:
# Read the interim post-2015 CSV; its first column is used as the index.
interim_csv_path = pj(interim_dir, 'grantnav_data_post2015.csv')
data = pd.read_csv(interim_csv_path, index_col=0)

### 1.2 Create feature extractors

#### 1.2.1 Feature Extractors

#### 1.2.2 Feature unions

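Some funders are not included in the training set, so funders with fewer than 200 grants are grouped under a single 'GB-OTHER' identifier (named 'Other Funder') in new `funding_org_identifier_revised` and `funding_org_name_revised` columns to cater for this.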

In [32]:
# Number of rows per funder identifier.
funder_counts = data['funding_org_identifier'].value_counts()

# Funders with fewer than 200 rows are all relabelled as 'GB-OTHER'.
rare_funder_ids = list(funder_counts[funder_counts < 200].index)
other_funders = dict.fromkeys(rare_funder_ids, 'GB-OTHER')

# Identity mapping for every identifier present in the data...
all_funders = {funder_id: funder_id for funder_id in data['funding_org_identifier']}

# ...with the rare funders' entries overridden by 'GB-OTHER'.
new_funders_dict = {**all_funders, **other_funders}

In [33]:
# Apply the identifier mapping built above to get the revised identifier column.
revised_ids = data['funding_org_identifier'].map(new_funders_dict)
data['funding_org_identifier_revised'] = revised_ids

# Rows folded into 'GB-OTHER' get a generic name; all others keep their own.
is_other_funder = data['funding_org_identifier_revised'] == 'GB-OTHER'
data['funding_org_name_revised'] = np.where(
    is_other_funder,
    'Other Funder',
    data['funding_org_name'],
)

In [34]:
# Persist the frame, including the new *_revised columns.
# NOTE: this writes to the same file the data was loaded from.
revised_csv_path = pj(interim_dir, 'grantnav_data_post2015.csv')
data.to_csv(revised_csv_path)

In [9]:
# One pipeline per input column. The FeatureExtractor* classes come from
# src.d01_data.build_features; presumably each selects the named column from
# the input frame (TODO confirm against that module).
#
# TfidfVectorizer and OneHotEncoder return scipy sparse matrices by default,
# and StandardScaler raises a ValueError when asked to centre sparse input.
# The pipelines that end in a sparse step therefore scale by variance only
# (with_mean=False), which also keeps the matrices sparse.

# Free-text description -> TF-IDF -> variance scaling.
description_pipe = make_pipeline(
    FeatureExtractorText('description'),
    TfidfVectorizer(),
    StandardScaler(with_mean=False)
)

# Grant title -> TF-IDF -> variance scaling.
title_pipe = make_pipeline(
    FeatureExtractorText('title'),
    TfidfVectorizer(),
    StandardScaler(with_mean=False)
)

# Planned duration in months -> standardised.
# NOTE(review): assumes FeatureExtractorNumber yields a dense array, so the
# default centring scaler is kept here - confirm.
duration_pipe = make_pipeline(
    FeatureExtractorNumber('planned_dates_duration_months'),
    StandardScaler()
)

# Days since award -> standardised (same dense-output assumption as above).
award_date_pipe = make_pipeline(
    FeatureExtractorNumber('days_since_award'),
    StandardScaler()
)

# Revised funder identifier -> one-hot (first category dropped) -> variance
# scaling.
funder_pipe = make_pipeline(
    FeatureExtractorOHE('funding_org_identifier_revised'),
    OneHotEncoder(drop='first'),
    StandardScaler(with_mean=False)
)

In [10]:
# Combine the per-column pipelines into a single FeatureUnion; the order here
# is the order in which make_union receives them.
feature_pipes = [
    description_pipe,
    title_pipe,
    duration_pipe,
    award_date_pipe,
    funder_pipe,
]
feature_union = make_union(*feature_pipes)

### 1.3 Save feature union 

In [11]:
filename = 'feature_union.jlib'
# Write through a context manager so the handle is flushed and closed before
# anything tries to read the file back.
with open(pj(processed_dir, filename), 'wb') as _file:
    # Do not assign the result back to feature_union: joblib.dump returns
    # nothing for a file object, which would replace the union with None.
    joblib.dump(feature_union, _file, compress=1)

In [12]:
# load features
# joblib.load unpickles the file, so only load artefacts this project wrote.
filename = 'feature_union.jlib'
# Context manager closes the read handle once the object has been loaded.
with open(pj(processed_dir, filename), 'rb') as _file:
    feature_union = joblib.load(_file)