## This notebook takes the processed data for the three datasets and creates machine-learning-ready datasets

In [2]:
import scipy.sparse
import sklearn.linear_model

"""Define graph forming functions"""
# Import packages

import joblib
import itertools
import numpy as np
import pandas as pd
from datetime import date
from tqdm import tqdm
from functools import partial

from sklearn import tree
import sklearn.svm as svm
from sklearn import ensemble
import sklearn.metrics as skm
import sklearn.preprocessing as pp
import sklearn.linear_model as lms
import sklearn.neural_network as skl_nn
import sklearn.neighbors as neighbors
from sklearn.naive_bayes import GaussianNB
import sklearn.model_selection as model_sel

import torch

import torch_sparse
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

In [3]:
# Global settings shared by the cells below
random_state = 2  # seed for NumPy's global RNG; also used when subsampling the test set
np.random.seed(random_state)
# Raw string: the backslashes are literal Windows path separators. Written as a
# normal string, "\d" and "\c" are invalid escape sequences and trigger a warning.
data_path = r"..\data\classification_inputs/"

# Upper bound on the number of test rows evaluated by gap_interpolation
max_test_set = 100000

In [4]:
def filter_for_start_yr(df, start_col, end_col) -> pd.DataFrame:
    """Keep only the rows whose 'Year' lies between the start and end year.

    Args:
        df: Frame with a 'Year' column plus the two columns named below.
            It is not modified; a filtered copy is returned.
        start_col: Name of the column holding the first valid year.
        end_col: Name of the column holding the last valid year.

    Returns:
        A new frame containing the rows with ``start <= Year <= end``, an
        added integer 'Age' column (``Year - start``), and ``start_col`` /
        ``end_col`` cast to int.
    """
    # Work on a copy so the caller's frame does not silently gain an 'Age' column.
    out = df.copy()

    # Get rid of rows for years before the start year
    out['Age'] = out['Year'].astype(int) - out[start_col].astype(int)
    out = out[out['Age'] >= 0]

    # Get rid of rows for years after the end year
    years_to_go = out[end_col].astype(int) - out['Year'].astype(int)
    out = out[years_to_go >= 0]

    # Cast the columns that were actually passed in, not hard-coded names.
    out = out.astype({start_col: int, end_col: int})

    # 'ToGo' used to be a temporary helper column that was never returned;
    # keep that guarantee even if the input happens to contain such a column.
    return out.drop(columns=['ToGo'], errors='ignore')

def pivot_data_dfs(df:pd.DataFrame, time_col) -> [pd.DataFrame, pd.DataFrame]:
    """Spread the long-format 'Emissions' values into one column per time step.

    Every column other than ``time_col`` and 'Emissions' is treated as a
    feature column and used as the pivot index.

    Returns:
        A pair ``(wide_frame, feature_cols)``: the pivoted frame with the
        feature columns restored as ordinary columns, and the list of feature
        column names.
    """
    non_features = (time_col, 'Emissions')
    feature_cols = [name for name in df.columns if name not in non_features]

    wide = df.pivot(index=feature_cols, columns=time_col, values='Emissions')
    return wide.reset_index(), feature_cols


def melt_data_dfs(df:pd.DataFrame, feature_cols, time_col) -> pd.DataFrame:
    """Unpivot a wide frame into one row per (feature combination, time step).

    Columns not listed in ``feature_cols`` are melted into ``time_col`` (cast
    to int) and 'Emissions'; rows without an 'Emissions' value are dropped.
    When the input has both 'START_YR' and 'END_YR' columns the result is
    additionally passed through ``filter_for_start_yr``.
    """
    long_df = df.melt(id_vars=feature_cols, var_name=time_col, value_name='Emissions')
    long_df = long_df.dropna(subset=['Emissions'])
    long_df[time_col] = long_df[time_col].astype(int)

    has_lifetime_cols = 'START_YR' in df.columns and 'END_YR' in df.columns
    if not has_lifetime_cols:
        return long_df

    return filter_for_start_yr(long_df, 'START_YR', 'END_YR')

## Convert train and test sets into ML ready sets
def series_to_bins(series:pd.Series, bins:list=None, labels:list=None, positive:bool=True):
    """Convert a continuous pandas Series into discrete bins with ``pd.cut``.

    Args:
        series: Values to discretise.
        bins: Bin edges. When omitted, five edges are derived from the data:
            ``min(lowest value, 0) - 0.01``, the 25 %, 50 % and 75 % quantiles,
            and ``highest value + 0.01``.
        labels: One label per bin. Defaults to ``0 .. len(bins) - 2``.
        positive: When True, zeros are excluded from the data used to derive
            the default edges. Zeros are still binned in the returned series.

    Returns:
        A pair ``(bins, binned)``: the edges that were used and the
        categorical series produced by ``pd.cut``.
    """
    if bins is None:
        bin_series = series[series != 0] if positive else series
        lower_edge = min(bin_series.min(), 0) - 0.01
        upper_edge = bin_series.max() + 0.01
        quartiles = [bin_series.quantile(q) for q in (0.25, 0.5, 0.75)]
        bins = [lower_edge] + quartiles + [upper_edge]

    if labels is None:
        labels = list(range(len(bins) - 1))

    # Call pd.cut directly: wrapping it in a scikit-learn FunctionTransformer
    # adds nothing for a stateless function applied once to a 1-D Series.
    return bins, pd.cut(series, bins=bins, labels=labels, retbins=False)


def preprocess_yearly(train_set, test_set, y_col='Emissions'):
    """Turn melted train/test frames into numeric feature matrices and targets.

    Targets:
        * Classification labels come from ``series_to_bins`` applied to
          ``y_col``. The test labels reuse the interior bin edges found on the
          training data; only the outermost edges are replaced by the test
          minimum - 0.01 and the test maximum + 0.01.
        * Regression targets are the raw ``y_col`` values.

    Features:
        * Object-dtype columns are cast to str and ordinal-encoded with an
          encoder fitted on train and test together.
        * Float columns are cast to int.
        * 'LATITUDE' / 'LONGITUDE' (when both are present) are shifted by
          +90 / +180.
        * Integer columns are shifted by the column-wise minimum over train
          and test together.
        * Columns of any other dtype do not appear in the output matrices.

    Returns:
        ``(X_train, X_test, y_train_clf, y_test_clf, y_train_reg, y_test_reg,
        x_enc)`` where the X arrays hold the encoded string columns followed
        by the shifted integer columns, and ``x_enc`` is the fitted encoder.
    """

    # Create Y
    bins, y_train_clf = series_to_bins(train_set[y_col])
    # NOTE(review): if the test minimum lies above the first interior training
    # edge (or the test maximum below the last one) these edges are no longer
    # increasing - confirm this cannot happen for the datasets in use.
    _, y_test_clf = series_to_bins(test_set[y_col], bins=[test_set[y_col].min()-0.01]+bins[1:-1]+[test_set[y_col].max()+0.01])

    y_train_reg, y_test_reg = train_set[y_col], test_set[y_col]
    # drop() returns new frames, so the assignments below do not touch the
    # frames passed in by the caller.
    train_set, test_set = train_set.drop(columns=[y_col]), test_set.drop(columns=[y_col])

    # Create X
    # Deal with string columns
    x_enc = pp.OrdinalEncoder()
    string_cols = list(train_set.select_dtypes(include='object').columns)
    train_set[string_cols] = train_set[string_cols].astype(str)
    test_set[string_cols] = test_set[string_cols].astype(str)
    # Fit on train and test together so every category seen in either has a code.
    x_strings = pd.concat((train_set[string_cols], test_set[string_cols]))
    x_enc.fit(x_strings)

    # Make float columns into int columns
    # NOTE(review): presumably these columns contain no NaNs - confirm.
    float_cols = list(train_set.select_dtypes(include='float').columns)
    train_set[float_cols], test_set[float_cols] = train_set[float_cols].astype(int), test_set[float_cols].astype(int)

    # Shift coordinates by +90 / +180 (presumably so they are non-negative - confirm).
    if 'LATITUDE' in list(train_set.columns) and 'LONGITUDE' in list(train_set.columns):
        train_set[['LATITUDE', 'LONGITUDE']] = (train_set[['LATITUDE', 'LONGITUDE']].astype(int)+[90, 180])
        test_set[['LATITUDE', 'LONGITUDE']] = (test_set[['LATITUDE', 'LONGITUDE']].astype(int)+[90, 180])

    # Selected after the casts above, so former float columns are included here.
    int_cols = list(train_set.select_dtypes(include='integer').columns)
    # Subtract the joint minimum so every integer feature starts at 0.
    x_ints_min = pd.concat((train_set[int_cols], test_set[int_cols])).min().values
    x_ints_train = train_set[int_cols] - x_ints_min
    x_ints_test = test_set[int_cols] - x_ints_min

    # Column layout of X: encoded string columns first, then integer columns.
    X_train = np.concatenate((x_enc.transform(train_set[string_cols]),
                              x_ints_train.values), axis=1)
    X_test = np.concatenate((x_enc.transform(test_set[string_cols]),
                              x_ints_test.values), axis=1)



    return X_train, X_test, y_train_clf, y_test_clf, y_train_reg, y_test_reg, x_enc

def save_decoded_X(X, x_enc, cols, used, name, min_year=1978):
    """Undo the feature encoding of ``X`` and write the result to ``name + '.csv'``.

    All but the last four columns of ``X`` are passed to
    ``x_enc.inverse_transform``. Of the last four, the first two are shifted
    back by the minimum 'START_YR' found in ``used`` and by ``min_year``
    respectively and cast to int; the final two are written unchanged.

    Args:
        X: 2-D feature array.
        x_enc: Fitted encoder exposing ``inverse_transform``.
        cols: Column names for the output, without the 'Year' column; 'Year'
            is inserted before the last two names.
        used: Frame with a 'START_YR' column, used to recover the start-year offset.
        name: Output path without the '.csv' extension.
        min_year: Offset added back to the second-to-last shifted column
            (previously hard-coded to 1978).
    """
    min_years = [used['START_YR'].astype(int).min(), min_year]

    decoded_strings = x_enc.inverse_transform(X[:, :-4])
    restored_years = (X[:, -4:-2] + min_years).astype(int)
    X_inv = np.concatenate((decoded_strings, restored_years, X[:, -2:]), axis=1)

    # The original referenced an undefined name `columns` here; the header must
    # come from the `cols` argument.
    header = list(cols[:-2]) + ['Year'] + list(cols[-2:])
    pd.DataFrame(X_inv, columns=header).to_csv(name + '.csv')

# Randomly divide the rows of one group into a train part and a test part
def split_rows(group, test_fraction):
    """Split ``group`` into ``(train_df, test_df)``.

    The test part holds ``test_fraction`` of the rows, clamped so that both
    parts contain at least one row. A group with a single row cannot be split
    and yields ``(None, None)``.
    """
    n_rows = len(group)
    if n_rows == 1:
        # Exclude groups with only one sample
        return None, None

    # At least one test row, and at least one row left for training
    at_least_one = max(test_fraction * n_rows, 1)
    n_test = int(min(at_least_one, n_rows - 1))

    held_out = group.sample(n=n_test)
    remaining = group.drop(held_out.index)

    return remaining, held_out

In [5]:
def create_yearly_data(data, input_data, gap_filling_level, timesteps, test_size):
    """Split the wide table into train/test parts and melt both to long format.

    The split depends on ``gap_filling_level``:

    * 1 - unknown year fill: in every row a random ``test_size`` share of the
      ``timesteps`` columns is hidden from the training set and kept only in
      the test set.
    * 2 - unknown plant fill: whole rows are divided between train and test by
      ``split_rows`` within groups defined by ``inference_cols`` (only its
      first entry when it has two or fewer entries).
    * 3 - unknown country/product fill: as level 2, but the groups are formed
      from ``inference_cols`` without ``divider``.

    ``time_col``, ``inference_cols`` and ``divider`` are read from module
    scope. ``input_data`` is accepted for call compatibility but not used.

    Args:
        data: Wide frame with one column per entry of ``timesteps``.
        input_data: Dataset name (unused).
        gap_filling_level: 1, 2 or 3, see above.
        timesteps: Names of the time-step columns in ``data``.
        test_size: Fraction of entries (level 1) or rows (levels 2, 3) held out.

    Returns:
        ``(feature_cols, train_yearly, test_yearly)``: the non-timestep column
        names and the melted train and test frames.

    Raises:
        ValueError: If ``gap_filling_level`` is not 1, 2 or 3.
    """
    # Fail early: the original printed a hint and then crashed with an
    # UnboundLocalError at the return statement.
    if gap_filling_level not in (1, 2, 3):
        raise ValueError('Please choose a gap filling level.')

    if gap_filling_level == 1:
        train_set, test_set = data.copy(), data.copy()

        # Calculate the number of entries to mask in each row
        num_rows, num_entries = data[timesteps].shape
        num_masked_entries = int(test_size * num_entries)

        # True marks an entry that belongs to the test set. The draw is done
        # row by row so the sequence of random numbers stays as before.
        mask = np.zeros((num_rows, num_entries), dtype=bool)
        for i in range(num_rows):
            indices = np.random.choice(num_entries, size=num_masked_entries, replace=False)
            mask[i, indices] = True

        train_set[timesteps] = train_set[timesteps].mask(mask)
        test_set[timesteps] = test_set[timesteps].mask(~mask)

    else:
        if gap_filling_level == 2:
            grouping_cols = inference_cols if len(inference_cols) > 2 else [inference_cols[0]]
        else:
            grouping_cols = [i for i in inference_cols if i != divider]

        split_rows_partial = partial(split_rows, test_fraction=test_size)
        grouped_df = data.groupby(grouping_cols)

        # apply() yields one (train, test) pair per group; zip regroups them
        # into all train parts and all test parts.
        train_dfs, test_dfs = zip(*grouped_df.apply(split_rows_partial))
        train_set, test_set = pd.concat(train_dfs), pd.concat(test_dfs)

    feature_cols = [i for i in train_set.columns if i not in timesteps]
    train_yearly = melt_data_dfs(train_set, feature_cols, time_col)
    test_yearly = melt_data_dfs(test_set, feature_cols, time_col)

    return feature_cols, train_yearly, test_yearly

In [6]:
def custom_interpolate(row, order=None):
    """Fill the gaps of one row (a pandas Series) by interpolation.

    Polynomial interpolation of the given ``order`` is used when the row has
    enough known values; otherwise linear interpolation is used.

    Args:
        row: Series to interpolate. Polynomial interpolation uses the index
            values as x-coordinates, so the index must be numeric.
        order: Polynomial order. When omitted, a module-level variable named
            ``order`` is used, which is what this function relied on before
            the parameter existed.

    Returns:
        The interpolated Series.

    Raises:
        NameError: If ``order`` is neither passed nor defined at module level.
    """
    if order is None:
        # Backward compatibility with callers that only pass `row`.
        try:
            order = globals()['order']
        except KeyError:
            raise NameError("name 'order' is not defined; pass order=... explicitly") from None

    # A polynomial of degree `order` needs at least order + 1 known points; the
    # previous fixed threshold of 3 was only sufficient for order <= 2.
    min_points = max(3, order + 1)
    if row.count() >= min_points:
        return row.interpolate(method='polynomial', order=order, limit_direction='both')
    return row.interpolate(method='linear', limit_direction='both')

def metrics(y_true, y_pred, model_type='clf'):
    """Compute a dictionary of evaluation scores with ``sklearn.metrics``.

    Args:
        y_true: Ground-truth targets.
        y_pred: Predicted targets.
        model_type: 'clf' for classification scores (confusion matrix,
            accuracy, balanced accuracy, Cohen's kappa, weighted Jaccard) or
            'reg' for regression scores (R2, MAE, MSE).

    Returns:
        Dict mapping score name to value.

    Raises:
        ValueError: If ``model_type`` is neither 'clf' nor 'reg'.
    """
    if model_type == 'clf':
        return {'confusion': skm.confusion_matrix(y_true, y_pred),
                'overall_acc': skm.accuracy_score(y_true, y_pred),
                'average_acc': skm.balanced_accuracy_score(y_true, y_pred),
                'kappa': skm.cohen_kappa_score(y_true, y_pred),
                'IoU': skm.jaccard_score(y_true, y_pred, average='weighted')}

    if model_type == 'reg':
        return {'r2': skm.r2_score(y_true, y_pred),
                'mae': skm.mean_absolute_error(y_true, y_pred),
                'mse': skm.mean_squared_error(y_true, y_pred)}

    # Raising a bare string is itself a TypeError in Python 3.
    raise ValueError(f"Incorrect model type: {model_type!r} (expected 'clf' or 'reg')")

In [7]:
def gap_interpolation(train_yearly:pd.DataFrame, test_yearly:pd.DataFrame, years:list, feature_cols:list, inference_cols:list, gap_filling_level:int, interpolation:str='linear', order:int=None, divider:str=None):
    """Baseline gap filling: predict test emissions from the training data.

    * Level 1: the training data is pivoted to one row per feature
      combination and the missing time steps are interpolated along each row
      (polynomial of ``order`` when given and enough points are known,
      otherwise ``interpolation``); remaining gaps at the row ends are filled
      forwards and then backwards.
    * Other levels: the prediction is the mean training emission per
      ``inference_cols`` and time step. ``divider`` is removed from
      ``inference_cols`` for level 3 or when ``inference_cols`` equals
      ``feature_cols``.

    ``time_col``, ``max_test_set`` and ``random_state`` are read from module
    scope.

    Args:
        train_yearly: Melted training frame with an 'Emissions' column.
        test_yearly: Melted test frame with an 'Emissions' column.
        years: Time-step columns of the pivoted training frame.
        feature_cols: Columns identifying one pivoted row.
        inference_cols: Columns the level 2/3 group means are computed over.
        gap_filling_level: 1 selects interpolation, anything else group means.
        interpolation: pandas interpolation method used when ``order`` is None.
        order: Polynomial order for level 1, or None.
        divider: Column excluded from ``inference_cols``, see above.

    Returns:
        ``(y_true, y_pred)`` as two aligned Series, restricted to test rows
        for which a prediction exists.
    """
    if gap_filling_level == 1:
        # Only level 1 needs the wide layout, so the pivot is done here.
        df_train = train_yearly.pivot(index=feature_cols, columns=time_col, values='Emissions').reset_index()

        if order is not None:
            # A polynomial of degree `order` needs at least order + 1 known points.
            min_points = max(3, order + 1)

            def _interpolate_row(row):
                # Interpolate one row; fall back to linear when too few points are known.
                if row.count() >= min_points:
                    return row.interpolate(method='polynomial', order=order, limit_direction='both')
                return row.interpolate(method='linear', limit_direction='both')

            train_years = df_train[years].copy()
            # Polynomial interpolation uses the labels as x-values, so they must be numeric.
            train_years.columns = train_years.columns.astype(int)
            df_train[years] = train_years.transpose().apply(_interpolate_row).transpose()
        else:
            # The original passed an undefined name `model` as the method.
            df_train[years] = df_train[years].interpolate(method=interpolation, order=order, axis=1, limit_direction='both')

        # Fill whatever is still missing at the row ends.
        # ffill/bfill replace the deprecated interpolate(method='pad'/'bfill').
        df_train[years] = df_train[years].ffill(axis=1).bfill(axis=1)
        pred_yearly = melt_data_dfs(df_train, feature_cols, time_col)
        inference_cols = feature_cols

    else:
        if gap_filling_level == 3 or inference_cols == feature_cols:
            inference_cols = [item for item in inference_cols if item != divider]
        # Select 'Emissions' before averaging so non-numeric feature columns
        # are never passed to mean().
        pred_yearly = train_yearly.groupby(inference_cols+[time_col])['Emissions'].mean().reset_index()

    test_cleared = test_yearly.dropna(subset=['Emissions'])
    if len(test_cleared) > max_test_set:
        test_cleared = test_cleared.sample(n=max_test_set, random_state=random_state)

    # After the merge 'Emissions_x' is the true test value and 'Emissions_y'
    # the prediction; rows without a prediction are dropped.
    preds_merged = test_cleared.merge(pred_yearly, how='left', on=inference_cols+[time_col]).dropna(subset=['Emissions_y'])

    return preds_merged['Emissions_x'], preds_merged['Emissions_y']

In [8]:
from torch_geometric.utils import remove_self_loops
from scipy.sparse import coo_matrix, diags

def pd_to_adj_matrix(df:pd.DataFrame, columns:list, weights:list = False, remove_self_conns:bool = True, max_edges = 100, verbose:bool = True):
    """Build a ``torch_sparse.SparseTensor`` adjacency from shared column values.

    Rows of ``df`` that share a value in any of ``columns`` are connected.
    For a group of ``m`` rows the index values are tiled and repeated
    ``min(m, max_edges)`` times: groups no larger than ``max_edges`` yield
    every ordered pair (self-pairs included), larger groups yield
    ``max_edges`` entries per member.

    The DataFrame index values are cast to int and used as node ids.
    ``remove_self_conns`` and ``verbose`` are accepted but not used.
    Edge values are 1.0 (float) when ``weights`` is False, otherwise
    ``ones * weights`` stored as long.
    """
    # Index values of every group, for every requested column
    members = []
    for col in columns:
        for _, grp in df.groupby(col):
            members.append(grp.index.values.astype(int))

    row_parts, col_parts = [], []
    for g in members:
        flat = g.flatten()
        reps = min(len(g), max_edges)
        row_parts.append(np.tile(flat, reps))
        col_parts.append(np.repeat(flat, reps))

    rows = torch.tensor(np.concatenate(row_parts), dtype=torch.long)
    cols = torch.tensor(np.concatenate(col_parts), dtype=torch.long)

    unit = np.ones(len(rows), dtype=int)
    if weights is False:
        weight_vector = torch.tensor(unit, dtype=torch.float)
    else:
        weight_vector = torch.tensor(unit*weights, dtype=torch.long)

    return torch_sparse.SparseTensor(row=rows, col=cols, value=weight_vector)

def _balance_split(X, y, col_name):
    """Downsample one (X, y) pair so every class keeps as many rows as the rarest class."""
    labels = pd.Series(np.asarray(y), name=col_name).reset_index()
    # size().min() is a plain scalar; the original handed a 1-element array to sample().
    per_class = int(labels.groupby(col_name).size().min())
    sampled = labels.groupby(col_name).sample(n=per_class)
    # After reset_index() the frame index is the row position in X and y.
    positions = sampled.index.to_numpy()
    return X[positions], torch.tensor(sampled[col_name].values)


def balance_classes_pt(X_train, X_test, y_train, y_test, col_name = 'Emissions'):
    """Balance the classes of the train and the test split by random downsampling.

    Each split is handled independently: every class is reduced to the size of
    the rarest class of that split. Sampling uses pandas' default random
    state; no seed is passed.

    Args:
        X_train, X_test: Feature tensors or arrays, indexable by an integer array.
        y_train, y_test: 1-D class labels aligned with the rows of X.
        col_name: Name given to the temporary label column.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with the labels as torch tensors.
    """
    X_train, y_train = _balance_split(X_train, y_train, col_name)
    X_test, y_test = _balance_split(X_test, y_test, col_name)

    return X_train, X_test, y_train, y_test

In [None]:
## Inputs
# Raw strings: the backslashes are literal Windows path separators. Written as
# normal strings, sequences such as "\l" or "\P" are invalid escapes and
# trigger a warning.
# Input data
data_path = r"C:/Users\lukec\PycharmProjects\emissions-tracking-conda\emissions-tracking\data\classification_inputs/"
# Output data
model_path = r'C:/Users\lukec\PycharmProjects\emissions-tracking-conda\emissions-tracking\models/datasets/'

## Parameters
max_test_set = 100000
random_state = 2
test_size = 0.3
regression = False

# Per-dataset settings. `divider` is the column removed for gap level 3,
# `inference_cols` the grouping columns, `time_col` / `timesteps` the name and
# the labels of the time axis, and `graph_cols` / `max_edges` the inputs of the
# (currently disabled) graph construction at the end of the loop.
dataset_settings = {
    'CT_manufacturing': {
        'divider': 'iso3_country',
        'inference_cols': ['iso3_country', 'original_inventory_sector', 'asset_type'],
        'time_col': 'Timestep',
        'timesteps': [str(i) for i in range(0, 90)],
        'graph_cols': [0, 1, 3, 5],
        'max_edges': 100,
    },
    'petrochemicals': {
        'divider': 'COUNTRY/TERRITORY',
        'inference_cols': ['PRODUCT', 'COUNTRY/TERRITORY'],
        'time_col': 'Year',
        'timesteps': [str(i) for i in range(1978, 2051)],
        'graph_cols': [0, 3],
        'max_edges': 30,
    },
    'unfccc': {
        'divider': 'Party',
        'inference_cols': ['Party', 'Category'],
        'time_col': 'Year',
        'timesteps': [str(i) for i in range(1990, 2021)],
        'graph_cols': [0],
        'max_edges': 100,
    },
}

for input_data, settings in dataset_settings.items():
    print(input_data)

    # create_yearly_data reads divider, inference_cols and time_col from module
    # scope, so they have to be (re)bound here for every dataset.
    divider = settings['divider']
    inference_cols = settings['inference_cols']
    time_col = settings['time_col']
    timesteps = settings['timesteps']
    graph_cols, max_edges = settings['graph_cols'], settings['max_edges']

    data = pd.read_csv(data_path+input_data+'.csv')
    # Gap level
    for gap_filling_level in [1,2,3]:
        print('Gap level'+str(gap_filling_level))
        # Suffix shared by every file written for this dataset / gap level
        tag = input_data+'-'+str(gap_filling_level)

        feature_cols, train_yearly, test_yearly = create_yearly_data(data, input_data, gap_filling_level, timesteps, test_size)
        # Digitise train & test sets
        X_train_unscaled, X_test_unscaled, y_train_clf, y_test_clf, y_train_reg, y_test_reg, x_enc = preprocess_yearly(train_yearly, test_yearly)

        pd.DataFrame(y_train_clf).to_csv(model_path+'y_train-'+tag+'.csv')
        pd.DataFrame(y_test_clf).to_csv(model_path+'y_test-'+tag+'.csv')

        # Scale datasets & save as .npy (the scaler is fitted on the training set only)
        scaler = pp.StandardScaler().fit(X_train_unscaled)
        X_train, X_test = scaler.transform(X_train_unscaled), scaler.transform(X_test_unscaled)

        np.save(model_path+'X_train_unscaled-'+tag+'.npy', X_train_unscaled)
        np.save(model_path+'X_test_unscaled-'+tag+'.npy', X_test_unscaled)

        np.save(model_path+'X_train-'+tag+'.npy', X_train)
        np.save(model_path+'X_test-'+tag+'.npy', X_test)

        # Create pytorch tensors (class labels are stored as long)
        X_train_pt, X_test_pt, y_train_pt, y_test_pt = [torch.Tensor(array) for array in [X_train, X_test, y_train_clf.values, y_test_clf.values]]
        y_train_pt, y_test_pt = [tensor.to(torch.long) for tensor in [y_train_pt, y_test_pt]]

        torch.save(X_train_pt, model_path+'X_train_pt-'+tag+'.pt')
        torch.save(X_test_pt, model_path+'X_test_pt-'+tag+'.pt')
        torch.save(y_train_pt, model_path+'y_train_pt-'+tag+'.pt')
        torch.save(y_test_pt, model_path+'y_test_pt-'+tag+'.pt')

        # Balance datasets for graphs
        X_train_pt, X_test_pt, y_train_pt, y_test_pt = balance_classes_pt(X_train_pt, X_test_pt, y_train_pt, y_test_pt)

        # # Define graph
        # graph = Data()
        # graph.x = torch.cat((X_train_pt, X_test_pt))
        # graph.y = torch.cat((y_train_pt, y_test_pt))
        #
        # # Train/test division
        # graph.train_mask = torch.tensor([True]*len(X_train_pt)+[False]*len(X_test_pt))
        # graph.test_mask = ~graph.train_mask
        #
        # # Edge creation
        # input_df = pd.DataFrame(np.concatenate((X_train_unscaled[:,graph_cols].astype(int), X_test_unscaled[:,graph_cols].astype(int))))
        # graph.edge_index = pd_to_adj_matrix(input_df, columns=list(range(len(graph_cols))), max_edges=max_edges)
        #
        # torch.save(graph, model_path+'graph-'+tag+'.pt')