# Report

In this notebook, you will learn how to generate our submission from the computed features of the dataset.

First, import the necessary packages

In [None]:
import sys
import os
import pickle
import numpy as np
np.random.seed(0)
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
from typing import List, Tuple, Dict, Union

# from src.utils import fit_lr_classifier, infer_lr_classifier, calculate_score, calculate_score_raw

Define some helper functions. They can also be found in `src.utils.fitter` and `src.utils.io`.

In [None]:
def calculate_score_raw(Y_dev: np.ndarray, 
                        Y_pred: np.ndarray):
    """Report log loss, accuracy and F1 for a vector of predicted scores.

    The log loss is computed on the scores exactly as given. Accuracy and F1
    are computed on hard 0/1 decisions obtained by thresholding the scores
    at 0.5 (strictly greater than 0.5 counts as class 1). The three metrics
    are also printed on one line.

    Args:
        Y_dev (np.ndarray): ground-truth labels
        Y_pred (np.ndarray): predicted scores for class 1

    Returns:
        Tuple[float, float, float]: (loss, accuracy, f1_score) - note the
        order: accuracy comes before the F1 score.
    """
    # Loss first, on the untouched scores.
    loss_val = metrics.log_loss(Y_dev, Y_pred)

    # Everything below works on thresholded decisions.
    hard_labels = (Y_pred > 0.5).astype(int)
    f1_score = metrics.f1_score(
        Y_dev,
        hard_labels,
        labels=None,
        pos_label=1,
        average='binary',
        sample_weight=None,
    )
    acc = metrics.accuracy_score(Y_dev, hard_labels)

    summary = (loss_val, acc, f1_score)
    print(f"Loss: {loss_val}, Accuracy: {acc}, F1-score: {f1_score}")
    return summary


def calculate_score(clf: LogisticRegression,
                    X_dev: np.ndarray, 
                    Y_dev: np.ndarray):
    """Score a fitted classifier on a labelled set.

    Takes column 1 of ``clf.predict_proba(X_dev)`` as the predicted score and
    hands it, together with the labels, to ``calculate_score_raw`` (which also
    prints the metrics).

    Args:
        clf (LogisticRegression): fitted classifier
        X_dev (np.ndarray): input features
        Y_dev (np.ndarray): ground-truth labels

    Returns:
        Tuple[float, float, float]: (loss, accuracy, f1_score) as returned by
        ``calculate_score_raw``
    """
    # Column index 1 of predict_proba, i.e. the second class of the classifier.
    class1_scores = clf.predict_proba(X_dev)[:, 1]
    return calculate_score_raw(Y_dev, class1_scores)


def fit_lr_classifier(X_train: np.ndarray,
                      Y_train: np.ndarray,
                      X_dev: np.ndarray = None,
                      Y_dev: np.ndarray = None,
                      *args,
                      **kwargs):
    """Train a logistic-regression classifier and optionally score it.

    Any extra positional and keyword arguments are forwarded unchanged to the
    ``LogisticRegression`` constructor, so they select the hyperparameters.
    When both ``X_dev`` and ``Y_dev`` are supplied, the fitted model is
    evaluated on them via ``calculate_score`` (metrics are printed, not
    returned).

    Args:
        X_train (np.ndarray): train input
        Y_train (np.ndarray): train label
        X_dev (np.ndarray, optional): dev input. Defaults to None.
        Y_dev (np.ndarray, optional): dev label. Defaults to None.

    Returns:
        LogisticRegression: trained classifier
    """
    model = LogisticRegression(*args, **kwargs)
    model.fit(X_train, Y_train)

    # Evaluation needs both halves of the dev split; skip it otherwise.
    dev_split_missing = X_dev is None or Y_dev is None
    if not dev_split_missing:
        calculate_score(model, X_dev, Y_dev)

    return model


def infer_lr_classifier(clf, X_test):
    """Use a fitted classifier to score new samples.

    Args:
        clf (LogisticRegression): fitted classifier exposing ``predict_proba``
        X_test (np.ndarray): input features

    Returns:
        np.ndarray: column 1 of ``clf.predict_proba(X_test)``, one score per
        input row (a 1-D array of length N)
    """
    return clf.predict_proba(X_test)[:, 1]

def generate_submission(output_dir: str, 
                        pred: np.ndarray, 
                        tag: str = "",
                        expected_len: int = 106692):
    """Dump a prediction vector to a submission CSV.

    The file has a header line ``id,predicted`` followed by one
    ``<row index>,<prediction>`` line per element of ``pred``. It is written to
    ``<output_dir>/<tag>_submission.csv``, or ``<output_dir>/submission.csv``
    when ``tag`` is empty. Shape and length problems are reported with a
    printed warning only; the file is still written.

    Args:
        output_dir (str): directory of output; created if it does not exist
        pred (np.ndarray): prediction, either 1-D of length N or 2-D of shape
            (N, 1). An (N, 1) column is flattened so that each row is written
            as a plain number rather than as a one-element array.
        tag (str, optional): tag to prepend to the filename. Defaults to "".
        expected_len (int, optional): number of rows the submission should
            have; a warning is printed on mismatch. Pass None to skip the
            check. Defaults to 106692.
    """
    pred = np.asarray(pred)

    is_column = pred.ndim == 2 and pred.shape[1] == 1
    if is_column:
        # Without this, every row would be formatted as "[0.5]" instead of "0.5".
        pred = pred.reshape(-1)
    elif pred.ndim != 1:
        print(f"Expect pred to be a vector")

    if expected_len is not None and pred.shape[0] != expected_len:
        print(f"Expect pred to have length {expected_len} but have {pred.shape[0]}")

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    filename = f'{"_".join([tag, "submission"])}.csv' if len(tag) > 0 else 'submission.csv'
    with open(os.path.join(output_dir, filename), 'w') as f:
        f.write('id,predicted\n')
        for idx, value in enumerate(pred):
            f.write(f"{idx},{value}\n")

You have to provide the features and put them under the `features` directory. Refer to `README.md` for instructions.

| Name                                    | Content                                           |
| --------------------------------------- | ------------------------------------------------- |
| `features/baseline-enhanced.pkl`        | from `{PROJECT_ROOT}/tasks/run_new_baseline`      |
| `features/author_graph_lr_features.pkl` | from `{PROJECT_ROOT}/tasks/run_authorgraph_lr`    |
| `features/graphsage_essay_features.pkl` | from `{PROJECT_ROOT}/tasks/train_graphsage_essay` |
| `features/uv_list.pkl`                  | from `{PROJECT_ROOT}/tasks/generate_dataset`      |

> `author_graph_lr_features.pkl` and `graphsage_essay_features.pkl` are optional

There are two ways to get these features:

**1. Download**

Download the pre-computed features from [https://github.com/davidliyutong/ICE6407P-260-M01/releases/tag/submission](https://github.com/davidliyutong/ICE6407P-260-M01/releases/tag/submission)

Unzip the `features.zip`, then put all `*.pkl` files under the `./features` directory



In [None]:
# Run the project's startup script in a shell. Presumably this fetches/unpacks the
# pre-computed features described above - check startup.sh to confirm.
!bash ./startup.sh

**2. Generation**

By running a set of Python scripts, you can generate these features. See `manual_generation.md` for details.

## Enhanced baseline method

The quickest way to run the enhanced baseline method is to run `{PROJECT_ROOT}/tasks/run_new_baseline/run_new_baseline.py`

In [None]:
# Put the directory two levels up on the import path (presumably {PROJECT_ROOT}, so
# that the script's project-local imports resolve - confirm against the repo layout).
sys.path.append('../../')
# Execute the enhanced-baseline script from this notebook.
%run ../run_new_baseline/run_new_baseline.py

Load dataset from serialized datasets

In [None]:
def _load_pickle(path):
    """Deserialize one pickle file and close the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load feature files that
    # come from a source you trust.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# One entry per feature source; each entry is indexed later by
# 'X_train' / 'X_dev' / 'X_test'.
features = {
    'author_graph_lr': _load_pickle('./features/author_graph_lr_features.pkl'),
    'graphsage_essay': _load_pickle('./features/graphsage_essay_features.pkl'),
    'baseline_enhanced': _load_pickle('./features/baseline-enhanced.pkl'),
}
# Labels are read from this object under 'train_y' and 'dev_y'.
uv_list = _load_pickle('./features/uv_list.pkl')

In [None]:
def _unpack_splits(feature_set):
    """Return (train, dev, test, train+dev stacked row-wise) for one feature source."""
    train, dev, test = (feature_set[key] for key in ('X_train', 'X_dev', 'X_test'))
    return train, dev, test, np.concatenate([train, dev], axis=0)


# Suffix 0: enhanced baseline, 1: GraphSAGE essay, 2: author-graph LR.
X_train_0, X_dev_0, X_test_0, X_whole_0 = _unpack_splits(features['baseline_enhanced'])
X_train_1, X_dev_1, X_test_1, X_whole_1 = _unpack_splits(features['graphsage_essay'])
X_train_2, X_dev_2, X_test_2, X_whole_2 = _unpack_splits(features['author_graph_lr'])

# Suffix 3: baseline and author-graph features side by side (column-wise).
X_train_3, X_dev_3, X_test_3, X_whole_3 = (
    np.concatenate([baseline_part, author_part], axis=1)
    for baseline_part, author_part in (
        (X_train_0, X_train_2),
        (X_dev_0, X_dev_2),
        (X_test_0, X_test_2),
        (X_whole_0, X_whole_2),
    )
)

# Labels; the "whole" labels follow the same train-then-dev order as X_whole_*.
Y_train, Y_dev = uv_list['train_y'], uv_list['dev_y']
Y_whole = np.concatenate([Y_train, Y_dev], axis=0)

Train the LR classifiers

In [None]:
# Settings shared by every model, and the extra ones used by clf0 / clf3.
_LR_BASE = dict(tol=1e-5, verbose=1)
_LR_LBFGS_PARALLEL = dict(solver='lbfgs', n_jobs=8, **_LR_BASE)

# NOTE(review): every model is fitted on X_whole_* / Y_whole, which are the train
# and dev splits concatenated. The dev metrics printed by fit_lr_classifier are
# therefore computed on rows the model has already seen, not on held-out data.
clf0 = fit_lr_classifier(X_whole_0, Y_whole, X_dev_0, Y_dev, max_iter=1000, **_LR_LBFGS_PARALLEL)
clf1 = fit_lr_classifier(X_whole_1, Y_whole, X_dev_1, Y_dev, max_iter=600, **_LR_BASE)
clf2 = fit_lr_classifier(X_whole_2, Y_whole, X_dev_2, Y_dev, max_iter=600, **_LR_BASE)
clf3 = fit_lr_classifier(X_whole_3, Y_whole, X_dev_3, Y_dev, max_iter=400, **_LR_LBFGS_PARALLEL)

In [None]:
# Score the test set with clf3 (the model fitted on the concatenated baseline +
# author-graph features) and write ./outputs/all_gather_submission.csv.
scores = infer_lr_classifier(clf3, X_test_3)
generate_submission('./outputs', scores, "all_gather")

In [None]:
# Score the test set with clf0 (enhanced-baseline features only) and write
# ./outputs/baseline_enhanced_submission.csv. Note that `scores` is overwritten.
scores = infer_lr_classifier(clf0, X_test_0)
generate_submission('./outputs', scores, "baseline_enhanced")

## Voting

We could use the voting techniques to aggregate models

In [None]:
class Voter:
    """Soft-voting ensemble over named estimators.

    Each estimator gets its own input, looked up by name, and the ensemble
    output is the (optionally weighted) sum of the estimators'
    ``predict_proba`` results.
    """

    def __init__(self, estimators: list, weight=None):
        """Store the estimators and normalise the weights.

        Args:
            estimators (list): ``(name, estimator)`` pairs; ``name`` is the key
                under which that estimator's input is passed to the other
                methods.
            weight (sequence of numbers, optional): one weight per estimator,
                in the same order as ``estimators``. Rescaled to sum to 1.
                When None, every estimator is weighted ``1 / len(estimators)``.
        """
        self.estimators_lookup = {name: idx for idx, (name, _) in enumerate(estimators)}
        self.estimators = [item for _, item in estimators]
        if weight is not None:
            # asarray lets plain lists/tuples be divided element-wise as well.
            weight = np.asarray(weight, dtype=float)
            self.weight = weight / weight.sum()
        else:
            self.weight = None

    def fit(self, x, y, *args, **kwargs):
        """Fit each named estimator on its own input against the shared labels.

        Args:
            x (dict): maps estimator name to that estimator's training input
            y: labels, passed to every estimator
            *args, **kwargs: forwarded to each estimator's ``fit``
        """
        for name, data in x.items():
            self.estimators[self.estimators_lookup[name]].fit(data, y, *args, **kwargs)
            print(f"fitting: {name}")

    def predict_proba(self, x):
        """Return the weighted sum of the estimators' predicted probabilities.

        Args:
            x (dict): maps estimator name to that estimator's input

        Returns:
            The combined ``predict_proba`` output, same shape as a single
            estimator's output.
        """
        summary = 0
        for name, data in x.items():
            # Weights belong to estimators, so index them by the estimator's
            # position rather than by the iteration order of `x`.
            idx = self.estimators_lookup[name]
            partial_result = self.estimators[idx].predict_proba(data)
            if self.weight is not None:
                summary += partial_result * self.weight[idx]
            else:
                summary += 1 / len(self.estimators) * partial_result

        return summary

    def decide(self, x, thresh=0.5):
        """Return hard decisions from the combined probabilities.

        Args:
            x (dict): maps estimator name to that estimator's input
            thresh (float, optional): a sample is labelled True when column 1
                of the combined probabilities is strictly greater than this
                value. Defaults to 0.5.

        Returns:
            Boolean array with one entry per sample.
        """
        return self.predict_proba(x)[:, 1] > thresh

In [None]:
# NOTE(review): the weights 100/87/88 are presumably hand-picked per-model scores -
# TODO confirm where they come from.
voter = Voter(estimators=[('lr0', clf0), ('lr1', clf1), ('lr2', clf2)],weight=np.array([100,87,88]))
dev_inputs = {'lr0': X_dev_0, 'lr1': X_dev_1, 'lr2': X_dev_2}
# Hard decisions of the ensemble on the dev split.
pred = voter.decide(dev_inputs)
# calculate_score_raw takes (labels, scores) in that order and computes the log loss
# on the scores, so pass the labels first and the ensemble's column-1 probabilities
# second rather than the thresholded decisions.
calculate_score_raw(Y_dev, voter.predict_proba(dev_inputs)[:, 1])

In [None]:
# Column 1 of the ensemble's combined probabilities on the test split, written to
# ./outputs/voting_submission.csv.
scores = voter.predict_proba({'lr0': X_test_0, 'lr1': X_test_1, 'lr2': X_test_2})[:,1]
generate_submission('./outputs', scores, "voting")