# Data Preparation

The code in this notebook demonstrates how to create training data of the form (Target ISIN, Better Recommendation ISIN, Worse Recommendation ISIN) from the outputs of the distance-based model.

This training data is then suitable for input to the model as detailed in BPRModel.ipynb

In [None]:
import csv
import json
import numpy as np
import pandas as pd
from tabulate import tabulate
from tqdm import tqdm_notebook as tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

from data import SingleDayDataLoader
from main import get_similar_bonds

In [None]:
# Path to the JSON file that maps each bond's ISIN to an integer index.
# These indices are the unique identifiers the model uses for each bond,
# e.g. {"US00037BAB80": 0, "US00037BAC63": 1, ...}
isin_to_index_mapping_file = "isin_to_index2.json"

In [None]:
# Output file for the training data generated by the code in this notebook.
# Each row holds (TARGET BOND ISIN, BETTER RECOMMENDATION ISIN, WORSE RECOMMENDATION ISIN)
training_data_csv = "train/all_rankings.csv"

In [None]:
# Performance shortcut: the CSV is loaded once, its ISINs are converted to
# their numeric indices, and the result is saved as a PyTorch tensor here.
training_data_tensor = "train/all_rankings.pt"

In [None]:
# Attributes that we can reasonably expect "similar" bonds to share
group_attributes = [
    "BCLASS3",
    "Country",
    "Class - Detail - Code",
]

# Features that the distance-based model will train on
features = [
    # spreads and yields
    "Bid Spread", "Cur Yld", "G Spd",
    "Years to Mat",
    "OAS", "OAD",
    "Amt Out", "Cpn", "Excess Rtn", "Px Close",
    # key rate durations
    "KRD 6M", "KRD 2Y", "KRD 5Y", "KRD 10Y", "KRD 20Y", "KRD 30Y",
    "S&P Rating Num",
    "Accrued Int (%)",
    "Yield to Mat",
]

In [None]:
# Create the data loader (imported from the local `data` module) with no arguments.
dataloader = SingleDayDataLoader()

In [None]:
# Preview the first rows of the loaded data (presumably a pandas DataFrame - confirm).
dataloader.data.head()

In [None]:
# ISIN of the bond used for the single-bond example in this notebook
EXAMPLE_ISIN = 'US00037BAB80'

In [None]:
# Look up the bonds similar to the example bond, passing the feature list
# and the cohort (group) attributes defined earlier in the notebook.
get_similar_bonds(
    EXAMPLE_ISIN,
    features=features,
    cohort_attributes=group_attributes,
)

In [None]:
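# TODO: Create the isin_to_index.json mapping file
# NOTE(review): `isin_to_index_mapping_file` above points to 'isin_to_index2.json' - confirm which filename is intended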

# Generate distance2.csv  --> all_rankings.csv

The above code just demonstrates how to construct some rankings data for one bond. This process could be:
- Repeated for all bonds
- Run with several different types of distance-based models (e.g. with no group_attributes, or trained on yield and OAS)

In [None]:
class CsvFileDataset(Dataset):
    """
    Dataset of ranking rows loaded from a CSV file on disk.

    Every cell of the CSV is looked up in the ISIN-to-index mapping and all rows
    are stored together in a single ``torch.long`` tensor (``self.data``).
    Parsing the CSV is slow, so run it once and then persist the result with
    ``save_as_tensor``.

    Args:
        csv_file_path: Path to the CSV file. The first line is treated as a
            header and skipped; blank lines are ignored.
        isin_index_mapping_file: Path to a JSON file mapping ISIN to index.

    Raises:
        KeyError: If the CSV contains an ISIN that is missing from the mapping.
    """
    def __init__(self, csv_file_path, isin_index_mapping_file):
        # Context manager so the mapping file handle is closed deterministically.
        with open(isin_index_mapping_file) as mapping_file:
            self.isin_to_index = json.load(mapping_file)

        # newline='' is the csv module's recommended way to open files for csv.reader.
        with open(csv_file_path, newline='') as data_file:
            reader = csv.reader(data_file)
            # Skip the header row; the default avoids StopIteration on an empty file.
            next(reader, None)
            rows = [
                [self.isin_to_index[isin] for isin in row]
                for row in reader
                if row  # csv.reader yields [] for blank lines; skip them to keep rows rectangular
            ]

        # Explicit dtype keeps the tensor integer-typed even when there are no rows.
        self.data = torch.tensor(rows, dtype=torch.long)

    def __len__(self):
        """Return the number of rows held in ``self.data``."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the row of bond indices at position ``idx``."""
        return self.data[idx]

    def save_as_tensor(self, path):
        """Serialize ``self.data`` to ``path`` with ``torch.save``."""
        torch.save(self.data, path)

In [None]:
# Build the dataset from the training CSV and the configured ISIN-to-index mapping file.
dataset = CsvFileDataset(training_data_csv, isin_to_index_mapping_file)

In [None]:
# Persist the converted indices so later runs can load the tensor directly
dataset.save_as_tensor(path=training_data_tensor)