# Create feature unions based on previous work (EDA, feature selection, time series)

## 0.0 Libraries

In [245]:
%load_ext autoreload
%autoreload 2

#system
import os
import sys
from os.path import join as pj
module_path = os.path.abspath(pj('..','..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# utils
from src.d00_utils import print_helper_functions as phf

# ipython
import warnings
warnings.simplefilter('ignore')

# executing code
import click
import logging

# type annotations
from typing import List, Set, Dict, Tuple, Optional
from collections.abc import Iterable

# configuring
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import configparser

# feature extractor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline, make_union, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer

# data
import numpy as np
import re

# stats
import statsmodels.api as sm

# viz
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
sns.set(font_scale=1.5)
plt.style.use('bmh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# file handling
import tempfile
import joblib
import botocore
import boto3
from os.path import join as pj
import pickle as pkl

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1.0 Feature Extractor

### 1.1 Load data

In [3]:
# repository root: two directory levels above the current working directory;
# every data path below is built from it
project_dir = Path().resolve().parents[1]
print(project_dir)

# walk up the directory tree until a .env file is found and export its
# entries as environment variables
_ = load_dotenv(find_dotenv())

# data sub-directories; their names come from the environment loaded above
raw_dir = pj(project_dir, 'data', os.getenv('RAW_DIR'))
interim_dir = pj(project_dir, 'data', os.getenv('INTERIM_DIR'))
processed_dir = pj(project_dir, 'data', os.getenv('PROCESSED_DIR'))

/Users/marclawson/repositories/grantnav_10k_predictor


In [306]:
# `pd` is not imported by the Libraries cell, so import pandas here to keep
# this cell runnable when the notebook is executed top to bottom.
import pandas as pd

# Load the interim CSV; its first column is used as the DataFrame index.
data = pd.read_csv(pj(interim_dir, 'grantnav_data_post2015.csv'), index_col=0)

### 1.3 Create feature extractors

In [None]:
class FeatureExtractorText(BaseEstimator, TransformerMixin):
    """Select ``columns`` from the input and hand back the raw values.

    The selection is returned exactly as ``.values`` yields it, with no
    reshaping applied.

    Parameters
    ----------
    columns
        Key used to index the input in ``transform`` (``X[columns]``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, *args):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, *args):
        """Return ``X[self.columns].values``; extra positional args are ignored."""
        selected = X[self.columns]
        return selected.values

    
class FeatureExtractorOHE(BaseEstimator, TransformerMixin):
    """Select ``columns`` from the input and return it as a single-column array.

    Parameters
    ----------
    columns
        Key used to index the input in ``transform`` (``X[columns]``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, *args):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, *args):
        """Return the selected values reshaped to ``(-1, 1)``.

        Extra positional arguments are accepted and ignored.
        """
        raw_values = X[self.columns].values
        # one column, as many rows as there are values
        return raw_values.reshape(-1, 1)
              
              
class FeatureExtractorNumber(BaseEstimator, TransformerMixin):
    """Select ``columns`` from the input and return it as a single-column array.

    Parameters
    ----------
    columns
        Key used to index the input in ``transform`` (``X[columns]``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, *args):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, *args):
        """Return the selected values reshaped to ``(-1, 1)``.

        Extra positional arguments are accepted and ignored.
        """
        column_values = X[self.columns].values
        # one column, as many rows as there are values
        return column_values.reshape(-1, 1)

    
class CustomImputer(BaseEstimator, TransformerMixin):
    """Replace missing values (NaN) in the input with the constant 0.

    The transformer keeps no state of its own: ``fit`` is a no-op and every
    ``transform`` call builds and applies a fresh ``SimpleImputer``.
    """

    def fit(self, X, *args):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, *args):
        """Return ``X`` with NaN entries filled with 0.

        Extra positional arguments are accepted and ignored.
        """
        # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
        # AttributeError there, while np.nan works on every NumPy version.
        imputer = SimpleImputer(missing_values=np.nan,
                                strategy='constant',
                                fill_value=0)
        # The fill value is fixed by the arguments above, so the imputer is
        # fitted and applied in a single call on the data being transformed.
        return imputer.fit_transform(X)

In [None]:
@click.command()
@click.option('--use-s3', 'use_s3', default=True)
def main(use_s3):
    """ Builds features for modelling."""

    s3 = boto3.resource('s3')
    bucket = s3.Bucket(os.environ.get('BUCKET'))
    processed_dir = pj(project_dir, 'data', os.environ.get('PROCESSED_DIR'))

    activities_pipe = make_pipeline(
        FeatureExtractorText('activities'),
        CountVectorizer(),
        StandardScaler()
    )

    objects_pipe = make_pipeline(
        FeatureExtractorText('objects'),
        CountVectorizer(),
        StandardScaler()
    )

    income_pipe = make_pipeline(
        FeatureExtractorNumber('income_3y_mean'),
        StandardScaler()
    )

    title_pipe = make_pipeline(
        FeatureExtractorText('name'),
        CountVectorizer(),
        StandardScaler()
    )

    region_pipe = make_pipeline(
        FeatureExtractorOHE('EER'),
        OneHotEncoder(drop='first'),
        StandardScaler()
    )

    ru_pipe = make_pipeline(
        FeatureExtractorOHE('RU'),
        OneHotEncoder(drop='first'),
        StandardScaler()
    )

    funders_pipe = make_pipeline(
        FeatureExtractorNumber('Funders'),
        CustomImputer(),
        StandardScaler()
    )

    trustees_pipe = make_pipeline(
        FeatureExtractorNumber('Trustees'),
        CustomImputer(),
        StandardScaler()
    )

    selfclass_pipe = make_pipeline(
        FeatureExtractorText('self_class'),
        CountVectorizer(),
        StandardScaler()
    )

    feature_union = make_union(activities_pipe, objects_pipe, income_pipe, title_pipe, region_pipe, 
                    ru_pipe, trustees_pipe, selfclass_pipe)