In [1]:
#!pip install torch_sparse

In [2]:
import scipy.sparse
import sklearn.linear_model

"""Define graph forming functions"""
# Import packages

import math
import joblib
import itertools
import numpy as np
import pandas as pd
from datetime import date
from tqdm import tqdm
from functools import partial
#from google.colab import files

from sklearn import tree
import sklearn.svm as svm
from sklearn import ensemble
import sklearn.metrics as skm
import sklearn.preprocessing as pp
import sklearn.linear_model as lms
import sklearn.neural_network as skl_nn
import sklearn.neighbors as neighbors
from sklearn.naive_bayes import GaussianNB
import sklearn.model_selection as model_sel

import torch

import torch_sparse
import torch.nn as nn
from torch_geometric.data import Data
# from torch_geometric.nn import GCNConv

In [3]:
# Input data
random_state = 2
np.random.seed(random_state)
data_path = 'C:/Users\lukec\PycharmProjects\emissions-tracking-conda\emissions-tracking\models\datasets/'
#"/content/datasets/"

max_test_set = 100000

In [4]:
def filter_for_start_yr(df, start_col, end_col) -> pd.DataFrame:
    """Convert dataframe of plants with entry for each year into dataframe with row for each year"""
    # Get rid of emissions for years before start year
    df['Age'] = df['Year'].astype(int) - df[start_col].astype(int)
    df = df[df['Age'] >= 0]

    # Get rid of emissions for years after end years
    df['ToGo'] = df[end_col].astype(int) - df['Year'].astype(int)
    df = df[df['ToGo'] >= 0]

    df[['START_YR', 'END_YR']] = df[['START_YR', 'END_YR']].astype(int)

    return df.drop(columns=['ToGo'])

def pivot_data_dfs(df:pd.DataFrame, time_col) -> [pd.DataFrame, pd.DataFrame]:
    """Pivot data dataframes to get each entry and all year/month values in rows"""
    feature_cols = [i for i in df.columns if i not in [time_col, 'Emissions']]
    df_pivoted = df.pivot(index=feature_cols,
                          columns=time_col,
                          values = 'Emissions').reset_index()
    return df_pivoted, feature_cols


def melt_data_dfs(df:pd.DataFrame, feature_cols, time_col) -> pd.DataFrame:
    """Melt data dataframes to get row per date entry for each facility"""
    melted = df.melt(id_vars=feature_cols, var_name=time_col, value_name='Emissions').dropna(subset=['Emissions'])
    melted[time_col] = melted[time_col].astype(int)
    
    if 'START_YR' in df.columns and 'END_YR' in df.columns:
        melted = filter_for_start_yr(melted, 'START_YR', 'END_YR')

    return melted

## Convert train and test sets into ML ready sets
def series_to_bins(series:pd.Series, bins:list=None, labels:list=None, positive:bool=True):
    # Convert a continuous pandas dataframe column into discrete bins
    if bins is None:
        bin_series = series[series!=0] if positive else series
        bins = [min(bin_series.min(),0)-0.01, bin_series.quantile(0.25), bin_series.quantile(0.5), bin_series.quantile(0.75), bin_series.max()+0.01]
    if labels is None: labels = list(range(len(bins)-1))

    transformer = pp.FunctionTransformer(
        pd.cut, kw_args={'bins': bins, 'labels': labels, 'retbins': False}
    )
    return bins, transformer.fit_transform(series)


def preprocess_yearly(train_set, test_set, y_col='Emissions'):
    """Digitise train and test sets"""

    # Create Y
    bins, y_train_clf = series_to_bins(train_set[y_col])
    _, y_test_clf = series_to_bins(test_set[y_col], bins=[test_set[y_col].min()-0.01]+bins[1:-1]+[test_set[y_col].max()+0.01])

    y_train_reg, y_test_reg = train_set[y_col], test_set[y_col]
    train_set, test_set = train_set.drop(columns=[y_col]), test_set.drop(columns=[y_col])

    # Create X
    # Deal with string columns
    x_enc = pp.OrdinalEncoder()
    string_cols = list(train_set.select_dtypes(include='object').columns)
    train_set[string_cols] = train_set[string_cols].astype(str)
    test_set[string_cols] = test_set[string_cols].astype(str)
    x_strings = pd.concat((train_set[string_cols], test_set[string_cols]))
    x_enc.fit(x_strings)

    # Make float columns into int columns
    float_cols = list(train_set.select_dtypes(include='float').columns)
    train_set[float_cols], test_set[float_cols] = train_set[float_cols].astype(int), test_set[float_cols].astype(int)

    if 'LATITUDE' in list(train_set.columns) and 'LONGITUDE' in list(train_set.columns):
        train_set[['LATITUDE', 'LONGITUDE']] = (train_set[['LATITUDE', 'LONGITUDE']].astype(int)+[90, 180])
        test_set[['LATITUDE', 'LONGITUDE']] = (test_set[['LATITUDE', 'LONGITUDE']].astype(int)+[90, 180])

    int_cols = list(train_set.select_dtypes(include='integer').columns)
    x_ints_min = pd.concat((train_set[int_cols], test_set[int_cols])).min().values
    x_ints_train = train_set[int_cols] - x_ints_min
    x_ints_test = test_set[int_cols] - x_ints_min

    X_train = np.concatenate((x_enc.transform(train_set[string_cols]),
                              x_ints_train.values), axis=1)
    X_test = np.concatenate((x_enc.transform(test_set[string_cols]),
                              x_ints_test.values), axis=1)



    return X_train, X_test, y_train_clf, y_test_clf, y_train_reg, y_test_reg, x_enc

def save_decoded_X(X, x_enc, cols, used, name):
    min_years = [used['START_YR'].astype(int).min(), 1978]
    X_inv = np.concatenate((x_enc.inverse_transform(X[:,:-4]), (X[:,-4:-2]+min_years).astype(int), X[:,-2:]), axis=1)
    pd.DataFrame(X_inv, columns=list(columns[:-2]+['Year']+columns[-2:])).to_csv(name+'.csv')

# Function to split rows into two DataFrames
def split_rows(group, test_fraction):

    num_rows = group.shape[0]
    if num_rows == 1:
        return None, None  # Exclude groups with only one sample

    num_sampled_rows = int(min(max(test_fraction * num_rows, 1), num_rows-1))  # At least one sample for each group

    test_df = group.sample(n=num_sampled_rows)
    train_df = group.drop(test_df.index)

    return train_df, test_df

In [5]:
def custom_interpolate(row):
    if row.count() >= 3:  # Check if there are enough values for polynomial interpolation
        return row.interpolate(method='polynomial', order=order, limit_direction='both')
    else:
        return row.interpolate(method='linear', limit_direction='both')

def metrics(y_true, y_pred, model_type='clf'):
    if model_type == 'clf':
        metric_dict = {'confusion': skm.confusion_matrix(y_true, y_pred),
                       'overall_acc': skm.accuracy_score(y_true, y_pred),
                       'average_acc': skm.balanced_accuracy_score(y_true, y_pred),
                       'kappa': skm.cohen_kappa_score(y_true, y_pred),
                       'IoU': skm.jaccard_score(y_true, y_pred, average='weighted')}
    elif model_type == 'reg':
        metric_dict = {'r2': skm.r2_score(y_true, y_pred),
                       'mae': skm.mean_absolute_error(y_true, y_pred),
                       'mse': skm.mean_squared_error(y_true, y_pred)}

    else: raise 'Incorrect model type'

    return metric_dict

def pd_to_adj_matrix(df:pd.DataFrame, columns:list, weights:list = False, remove_self_conns:bool = True, max_edges = 100, verbose:bool = True):
    """Form pytorch COO format adjacency matrix from pandas dataframe columns"""
    groups = [group.index.values.astype(int) for col in columns for _, group in df.groupby(col)]

    rows = torch.tensor(np.concatenate([np.tile(g.flatten(), min(len(g), max_edges)) for g in groups]), dtype=torch.long)
    cols = torch.tensor(np.concatenate([np.repeat(g.flatten(), min(len(g), max_edges)) for g in groups]), dtype=torch.long)

    if weights is False:
        weight_vector = torch.tensor(np.ones(len(rows), dtype=int), dtype=torch.float)
    else: weight_vector = torch.tensor(np.ones(len(rows), dtype=int)*weights, dtype=torch.long)

    adjacency = torch_sparse.SparseTensor(row=rows, col=cols, value=weight_vector)

    return adjacency


def balance_classes_pt(X_train, X_test, y_train, y_test, col_name = 'Emissions', X_train_unscaled=False, X_test_unscaled=False):
    y_train_pd = pd.Series(y_train, name=col_name)
    min_count = y_train_pd.reset_index().groupby(col_name).count().min()
    y_train_df = y_train_pd.reset_index().groupby(col_name).sample(min_count.values)
    y_train = y_train_df[col_name].values
    X_train = X_train[y_train_df.index]

    y_test_pd = pd.Series(y_test, name=col_name)
    min_count = y_test_pd.reset_index().groupby(col_name).count().min()
    y_test_df = y_test_pd.reset_index().groupby(col_name).sample(min_count.values)
    y_test = y_test_df[col_name].values
    X_test = X_test[y_test_df.index]

    if X_train_unscaled is not False:
      X_train_unscaled = X_train_unscaled[y_train_df.index]
      X_test_unscaled = X_test_unscaled[y_test_df.index]
      return X_train, X_test, torch.tensor(y_train), torch.tensor(y_test), X_train_unscaled, X_test_unscaled
    else:
      return X_train, X_test, torch.tensor(y_train), torch.tensor(y_test)

In [None]:
## Create all graphs
learning_rates = [0.01]
hidden_sizes = [64]

balance=True
for input_data in ['CT_manufacturing','petrochemicals','unfccc']:
    print(input_data)
     # Output data
    #data_path = 'C:/Users\lukec\PycharmProjects\emissions-tracking-conda\emissions-tracking\models/datasets/'
    model_path = 'C:/Users\lukec\PycharmProjects\emissions-tracking-conda\emissions-tracking\models/'+input_data+'/' # '/content/models/'

    # # Define divider for level 3
    if input_data == 'CT_manufacturing':
        divider = 'iso3_country'
        inference_cols = ['iso3_country', 'original_inventory_sector', 'asset_type']
        time_col = 'Timestep'
        timesteps = [str(i) for i in range(0,90)]
        graph_cols, max_edges = [0,1,3,5], 100
    elif input_data == 'petrochemicals':
        divider = 'COUNTRY/TERRITORY'
        inference_cols = ['PRODUCT', 'COUNTRY/TERRITORY']
        time_col = 'Year'
        timesteps = [str(i) for i in range(1978,2051)]
        graph_cols, max_edges = [0,3], 15
    elif input_data == 'unfccc':
        divider='Party'
        inference_cols = ['Party', 'Category']
        time_col = 'Year'
        timesteps = [str(i) for i in range(1990,2021)]
        graph_cols, max_edges = [0], 100

    ## Parameters
    # max_test_set = 100000
    # random_state = 2
    # test_size = 0.3
    regression = False

    # Gap level
    for gap_filling_level in [1,2,3]:
        print('Gap level '+str(gap_filling_level))
        X_train_pt = torch.load(data_path+'X_train_pt-'+input_data+'-'+str(gap_filling_level)+'.pt')#.cuda()
        X_test_pt = torch.load(data_path+'X_test_pt-'+input_data+'-'+str(gap_filling_level)+'.pt')#.cuda()
        y_train_pt = torch.load(data_path+'y_train_pt-'+input_data+'-'+str(gap_filling_level)+'.pt')#.cuda()
        y_test_pt = torch.load(data_path+'y_test_pt-'+input_data+'-'+str(gap_filling_level)+'.pt')#.cuda()

        X_train_unscaled = np.load(data_path+'X_train_unscaled-'+input_data+'-'+str(gap_filling_level)+'.npy')
        X_test_unscaled = np.load(data_path+'X_test_unscaled-'+input_data+'-'+str(gap_filling_level)+'.npy')

        if balance:
          X_train_pt, X_test_pt, y_train_pt, y_test_pt, X_train_unscaled, X_test_unscaled = balance_classes_pt(X_train_pt, X_test_pt, y_train_pt, y_test_pt, X_train_unscaled=X_train_unscaled, X_test_unscaled=X_test_unscaled)

        # Define graph
        graph = Data()
        graph.x = torch.cat((X_train_pt, X_test_pt))
        graph.y = torch.cat((y_train_pt, y_test_pt))

        # Train/test division
        graph.train_mask = torch.tensor([True]*len(X_train_pt)+[False]*len(X_test_pt))
        graph.test_mask = ~graph.train_mask

        # Edge creation
        input_df = pd.DataFrame(np.concatenate((X_train_unscaled[:,graph_cols].astype(int), X_test_unscaled[:,graph_cols].astype(int))))
        graph.edge_index = pd_to_adj_matrix(input_df, columns=list(range(len(graph_cols))), max_edges=max_edges)

        num_features = graph.x.shape[1]
        num_classes = 4

        for hidden_dim in hidden_sizes:
            print(hidden_dim)
            if input_data=='CT_manufacturing' and gap_filling_level==1:
                models = [
                   #('GCN', GCN(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),#.cuda()),
                    # ('SAGE', GraphSAGE(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),#.cuda()),
                    # ('GIN', GIN(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),#.cuda()),
                    # ('GAT', GAT(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),
                    ('GCLSTM', GC_LSTM(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),#.cuda()),
                    ('GUNET', GraphUNet(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes))#.cuda())
                    ]
            else:
                models = [
                   #('GCN', GCN(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),#.cuda()),
                    ('SAGE', GraphSAGE(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),#.cuda()),
                    ('GIN', GIN(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),#.cuda()),
                    # ('GAT', GAT(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),
                    ('GCLSTM', GC_LSTM(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes)),#.cuda()),
                    ('GUNET', GraphUNet(num_features=num_features, hidden_dim=hidden_dim, num_classes=num_classes))#.cuda())
                        ]
            # # Define a list of models to train

            # best_accuracy = {}
            # best_learning_rate = {}
            # best_hidden_size = {}

            for model_name, model in models:

                # best_accuracy[model_name] = 0
                # best_learning_rate[model_name] = 0
                # best_hidden_size[model_name] = 0

                print(f"Training {model_name}")
                for learning_rate in learning_rates:
                    print(learning_rate)

                    model_type = 'reg' if regression else 'clf'
                    model_file = model_path + model_type + '_' + model_name + '_l' + str(str(gap_filling_level)) + '_' + date.today().strftime("%y%m%d")

                    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
                    criterion = nn.CrossEntropyLoss()
                    epochs = 50

                    loss, model, optimizer = train_graph(model, graph, optimizer, criterion, epochs=epochs)

                    torch.save(model.state_dict(), model_file+'.pt')
                    #
                    if regression:
                        y_pred = test_graph(model, graph)
                    else:
                        _, y_pred = test_graph(model, graph)

                    scores = metrics(graph.y[graph.test_mask], y_pred, model_type)

                    np.save(model_file+'_'+str(learning_rate)+'_'+str(hidden_size)+'_20iter.npy', scores)

                    # # Check if the current combination of hyperparameters is the best
                    # if scores['average_acc'] > best_accuracy[model_name]:
                    #     best_accuracy[model_name] = scores['average_acc']
                    #     best_learning_rate[model_name] = learning_rate
                    #     best_hidden_size[model_name] = hidden_size

        # Train the best models with the best hyperparameters for 100 epochs
        # for model_name, model in models:
        #     best_lr = best_learning_rate[model_name]
        #     best_hs = best_hidden_size[model_name]
        #     model_type = 'reg' if regression else 'clf'
        #     model_file = model_path + model_type + '_' + model_name + '_l' + str(str(gap_filling_level)) + '_' + date.today().strftime("%y%m%d")
        #
        #     optimizer = torch.optim.Adam(model.parameters(), lr=best_lr)
        #     model.hidden_dim = best_hs
        #     criterion = nn.CrossEntropyLoss()
        #     epochs = 100
        #
        #     criterion = torch.nn.CrossEntropyLoss()
        #
        #     loss, model, optimizer = train_graph(model, graph, optimizer, criterion, epochs=epochs)
        #
        #     torch.save(model.state_dict(), model_file+'.pt')
        #
        #     if regression:
        #         y_pred = test_graph(model, graph)
        #     else:
        #         _, y_pred = test_graph(model, graph)
        #
        #     scores = metrics(graph.y[graph.test_mask], y_pred, model_type)
        #
        #     np.save(model_file+'_'+str(best_lr)+'_'+str(best_hs)+'_100iter.npy', scores)

CT_manufacturing
Gap level 1
64
Training GCLSTM
0.01
in
Train activated
0
zeroed
out
loss
Epoch 0: train loss: 1.3850598335266113
1
zeroed
out
loss
Epoch 1: train loss: 1.3434926271438599
2
zeroed
out
loss
Epoch 2: train loss: 1.2638555765151978
3
zeroed
out
loss
Epoch 3: train loss: 1.0503066778182983
4
zeroed
out
loss
Epoch 4: train loss: 0.7897329330444336
5
zeroed
out
loss
Epoch 5: train loss: 0.6696930527687073
6
zeroed
out
loss
Epoch 6: train loss: 1.5732266902923584
7
zeroed
out
loss
Epoch 7: train loss: 0.5507981777191162
8
zeroed
out
loss
Epoch 8: train loss: 0.48783057928085327
9
zeroed
out
loss
Epoch 9: train loss: 0.43939098715782166
10
zeroed
out
loss
Epoch 10: train loss: 0.41586723923683167
11
zeroed
out
loss
Epoch 11: train loss: 0.4298539459705353
12
zeroed
out
loss
Epoch 12: train loss: 0.4459995925426483
13
zeroed
out
loss
Epoch 13: train loss: 0.5271507501602173
14
zeroed
out
loss
Epoch 14: train loss: 0.37927868962287903
15
zeroed
out
loss
Epoch 15: train loss: 0.7

In [43]:
scores

{'confusion': array([[ 7972,  5030,  5578,  2328],
        [ 2878,  9261,  7109,  1660],
        [ 1206,  5932, 11457,  2313],
        [  829,  1949,  6823, 11307]], dtype=int64),
 'overall_acc': 0.47824995217141764,
 'average_acc': 0.47824995217141764,
 'kappa': 0.3043332695618902,
 'IoU': 0.32040123437967016}

In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv, GINConv, GATConv

# Graph Convolutional Network (GCN)
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# class GCN(nn.Module):
#     def __init__(self, num_features, hidden_dim, num_classes):
#         super(GCN, self).__init__()
#         self.conv1 = GCNConv(num_features, hidden_dim)
#         self.conv2 = GCNConv(hidden_dim, num_classes)
#
#     def forward(self, data):
#         x, edge_index = data.x, data.edge_index
#         x = F.relu(self.conv1(x, edge_index))
#         x = F.dropout(x, training=self.training)
#         x = self.conv2(x, edge_index)
#         return F.log_softmax(x, dim=1)


#GraphSAGE
class GraphSAGE(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(num_features, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


# Graph Isomorphism Network (GIN)
class GIN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GIN, self).__init__()
        self.conv1 = GINConv(torch.nn.Sequential(torch.nn.Linear(num_features, hidden_dim), torch.nn.ReLU(),
                                                 torch.nn.Linear(hidden_dim, hidden_dim)))
        self.conv2 = GINConv(torch.nn.Sequential(torch.nn.Linear(hidden_dim, hidden_dim), torch.nn.ReLU(),
                                                 torch.nn.Linear(hidden_dim, num_classes)))

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


# Graph Attention Network (GAT)
class GAT(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GAT, self).__init__()
        self.conv1 = GATConv(num_features, hidden_dim, heads=8, dropout=0.6)
        self.conv2 = GATConv(hidden_dim * 8, num_classes, heads=1, concat=False, dropout=0.6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


# Graph Convolutional LSTM (GC-LSTM)
class GC_LSTM(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GC_LSTM, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.lstm = torch.nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.lin = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = x.unsqueeze(0)
        x, _ = self.lstm(x)
        x = x.squeeze(0)
        x = self.lin(x)
        return F.log_softmax(x, dim=1)


# Graph U-Net
class GraphUNet(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GraphUNet, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x1 = self.conv1(x, edge_index)
        x1 = F.relu(x1)
        x2 = self.conv2(x1, edge_index)
        x2 = F.relu(x2)
        x3 = self.conv3(x2, edge_index)
        return F.log_softmax(x3, dim=1)

from torch_geometric.nn import GCNConv

# class GCN(nn.Module):
#     def __init__(self, num_features, hidden_dim, num_classes):
#         super(GCN, self).__init__()
#         self.conv1 = GCNConv(num_features, hidden_dim)
#         self.conv2 = GCNConv(hidden_dim, num_classes)
#
#     def forward(self, data):
#         x, edge_index = data.x, data.edge_index
#         x = F.relu(self.conv1(x, edge_index))
#         x = F.dropout(x, training=self.training)
#         x = self.conv2(x, edge_index)
#         return F.log_softmax(x, dim=1)

from torch_geometric.nn import SAGEConv
#
class SAGE(nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(SAGE, self).__init__()
        self.conv1 = SAGEConv(num_features, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


# Create a random graph dataset with 100000 nodes and 9 features
# x_train = torch.randn(100000, 9)
# edge_index_train = torch.randint(0, 100000, (2, 100000))
#
# data = Data(x=x_train, edge_index=edge_index_train)
#
# # Instantiate the graph neural network models
# gcn_model = GCN(num_features=9, hidden_dim=64, num_classes=10)
# graphsage_model = GraphSAGE(num_features=9, hidden_dim=64, num_classes=10)
# gin_model = GIN(num_features=9, hidden_dim=64, num_classes=10)
# gat_model = GAT(num_features=9, hidden_dim=64, num_classes=10)
# gclstm_model = GC_LSTM(num_features=9, hidden_dim=64, num_classes=10)
# graphunet_model = GraphUNet(num_features=9, hidden_dim=64, num_classes=10)
#
# # Perform forward pass
# output_gcn = gcn_model(data)
# output_graphsage = graphsage_model(data)
# output_gin = gin_model(data)
# output_gat = gat_model(data)
# output_gclstm = gclstm_model(data)
# output_graphunet = graphunet_model(data)
#
# # Print the output shapes
# print("GCN output shape:", output_gcn.shape)
# print("GraphSAGE output shape:", output_graphsage.shape)
# print("GIN output shape:", output_gin.shape)
# print("GAT output shape:", output_gat.shape)
# print("GC-LSTM output shape:", output_gclstm.shape)
# print("Graph U-Net output shape:", output_graphunet.shape)


In [7]:
## Define torch geometric functions and models
import torch.nn.functional as F

from torch_geometric.nn import GCNConv

# class GCN(nn.Module):
#     def __init__(self, num_features, hidden_dim, num_classes):
#         super(GCN, self).__init__()
#         self.conv1 = GCNConv(num_features, hidden_dim)
#         self.conv2 = GCNConv(hidden_dim, num_classes)
#
#     def forward(self, data):
#         x, edge_index = data.x, data.edge_index
#         x = F.relu(self.conv1(x, edge_index))
#         x = F.dropout(x, training=self.training)
#         x = self.conv2(x, edge_index)
#         return F.log_softmax(x, dim=1)

# from torch_geometric.nn import SAGEConv
#
# class SAGE(nn.Module):
#     def __init__(self, num_features, hidden_dim, num_classes):
#         super(SAGE, self).__init__()
#         self.conv1 = SAGEConv(num_features, hidden_dim)
#         self.conv2 = SAGEConv(hidden_dim, num_classes)
#
#     def forward(self, data):
#         x, edge_index = data.x, data.edge_index
#         x = F.relu(self.conv1(x, edge_index))
#         x = F.dropout(x, training=self.training)
#         x = self.conv2(x, edge_index)
#         return F.log_softmax(x, dim=1)


def train_graph(model, data, optimizer, criterion=torch.nn.CrossEntropyLoss(), epochs=100, verbose=True):
    """Training function for pytorch models"""
    print('in')
    model.train()
    print('Train activated')
    for epoch in range(epochs):
        print(str(epoch))
        optimizer.zero_grad()  # Clear gradients
        print('zeroed')
        out = model(data) # Forward pass
        print('out')
        loss = criterion(out[data.train_mask], data.y[data.train_mask]) # Compute loss
        print('loss')
        if verbose:
            print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
        loss.backward()  # Derive gradients
        optimizer.step()  # Update parameters based on gradients

    return loss, model, optimizer

def test_graph(model, data, categorical=True):
    """Test function for pytorch models"""
    model.eval()
    pred = model(data) # Forward pass
    y_pred = pred[data.test_mask]

    if categorical: # Get category with the highest probability
        _, y_pred_cats = torch.max(y_pred, dim = 1)
        return y_pred, y_pred_cats
    else: # Return raw prediction
        return y_pred

In [8]:
from torch_geometric.utils import remove_self_loops
from scipy.sparse import coo_matrix, diags

def pd_to_adj_matrix(df:pd.DataFrame, columns:list, weights:list = False, remove_self_conns:bool = True, max_edges = 100, verbose:bool = True):
    """Form pytorch COO format adjacency matrix from pandas dataframe columns"""
    groups = [group.index.values.astype(int) for col in columns for _, group in df.groupby(col)]

    rows = torch.tensor(np.concatenate([np.tile(g.flatten(), min(len(g), max_edges)) for g in groups]), dtype=torch.long)
    cols = torch.tensor(np.concatenate([np.repeat(g.flatten(), min(len(g), max_edges)) for g in groups]), dtype=torch.long)

    if weights is False:
        weight_vector = torch.tensor(np.ones(len(rows), dtype=int), dtype=torch.float)
    else: weight_vector = torch.tensor(np.ones(len(rows), dtype=int)*weights, dtype=torch.long)

    adjacency = torch_sparse.SparseTensor(row=rows, col=cols, value=weight_vector)
        #coo_matrix((rows, cols), weight_vector, size=(len(df), len(df)))

    #if remove_self_conns:
        #adjacency = remove_self_loops(adjacency)
        #adjacency -= diags(adjacency.diagonal(k=0))

    #if verbose:
        # print(f"Size of adjacency in memory: {(adjacency.data.nbytes + adjacency.indptr.nbytes + adjacency.indices.nbytes) / 8 / 1024 ** 3:.3f}GB")
        #print(f"Numb edges: {len(adjacency.indices)}")
    return adjacency

# ## Attempts for full edges (3 billion) - other solution, kneighbors graph, faiss
# for group in tqdm(groups[10:20]):
#     edges += list(product(group, repeat=2))
#
#     all_pairs = pd.DataFrame()
#
#
# x_arr = np.array(graph.x[:,0]).reshape(-1,1)
# unique_vals = np.unique(x_arr)
# for u_val in tqdm(unique_vals):
#     locs = locate(x_arr, lambda x: x==u_val)
#     df = pd.DataFrame(locs)
#     vals = df.values.astype(int)
#     pairs = product(vals,vals)
#     pair_df = pd.DataFrame(pairs).astype(int)
#     all_pairs = pd.concat((all_pairs, pair_df))

In [None]:
## torch geometric classifier training - Graph definition

In [None]:
## Loop over names and model definitions

regression = False

input_dim = X_train.shape[1]
output_dim = 4
hidden_dim = 64

names = ['GCN-h'+str(hidden_dim), 'SAGE-h'+str(hidden_dim)] #['GCN-h'+str(hidden_dim), 'GAT-h'+str(hidden_dim), 'SAGE-h'+str(hidden_dim), 'GIN-h'+str(hidden_dim)]
models = [GCN(input_dim, hidden_dim, output_dim), SAGE(input_dim, hidden_dim, output_dim)] #[GCN(input_dim, hidden_dim, output_dim), GAT(input_dim, hidden_dim, output_dim), SAGE(input_dim, hidden_dim, output_dim), GIN(input_dim, hidden_dim, output_dim)]

criterion = torch.nn.CrossEntropyLoss()

for model_name, model in zip(names, models):
    model_type = 'reg' if regression else 'clf'
    model_file = model_path+model_type+'_'+model_name+'_l'+str(gap_filling_level)+'_'+date.today().strftime("%y%m%d")

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    loss, model, optimizer = train_graph(model, graph, optimizer, criterion, epochs=10)

    torch.save(model.state_dict(), model_file+'.pt')

    if regression:
        y_pred = test_graph(model, graph)
    else:
        _, y_pred = test_graph(model, graph)

    scores = metrics(y_test_pt, y_pred, model_type)

    np.save(model_file+'.npy', scores)

Epoch 0: train loss: 1.5003718137741089
Epoch 1: train loss: 1.3746167421340942
Epoch 2: train loss: 1.2647819519042969
Epoch 3: train loss: 1.1701548099517822
Epoch 4: train loss: 1.0908373594284058
Epoch 5: train loss: 1.0272905826568604
Epoch 6: train loss: 0.9780631065368652
Epoch 7: train loss: 0.942402184009552
Epoch 8: train loss: 0.9171971678733826
Epoch 9: train loss: 0.8992316722869873
Epoch 0: train loss: 1.356438398361206
Epoch 1: train loss: 1.1591078042984009
Epoch 2: train loss: 1.067864179611206
Epoch 3: train loss: 1.0133190155029297
Epoch 4: train loss: 0.9641536474227905
Epoch 5: train loss: 0.9199181795120239
Epoch 6: train loss: 0.885001540184021
Epoch 7: train loss: 0.8621838688850403
Epoch 8: train loss: 0.847547173500061
Epoch 9: train loss: 0.8411931991577148
