# Summary

Idea: We have seen decent signals from the 0th and last index of our embeddings; what if we used the average of our embeddings?

In [1]:
"""
Mount Drive onto this notebook. Note that there is a specific
file structure. In particular, we have
-- MyDrive
-- -- EVO
-- -- -- vcfs
-- -- -- CRyPTIC_reuse_table_20231208.csv
-- -- -- h37rv_genebank_flatfile.gbff

(This will need to be updated!)
"""

import os

import pandas as pd
import numpy as np
import sklearn as sk

from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')

evo_general_dir = '/content/drive/MyDrive/EVO/'
samples_dir = 'vcfs/'

emb_dir = evo_general_dir + 'rif_embeddings_v1/'
current_dir = emb_dir + 'embeds_1.0_singles_partial/'

h37rv_genome_file = 'GCF_000195955.2_ASM19595v2_genomic.fna'
cryptic_general_file = 'CRyPTIC_reuse_table_20231208.csv'

unique_ids = np.load(emb_dir + 'unique_ids.npy')
reuse_df = pd.read_csv(evo_general_dir + cryptic_general_file)

df = pd.read_csv(emb_dir + 'rif_labels.csv')[['UNIQUEID', 'RIF_BINARY_PHENOTYPE']]
embs_df = df.dropna()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Utilities

In [2]:
from sklearn.metrics import confusion_matrix
from os.path import exists


def get_site(unique_id):
  return unique_id.split('.')[1]


def embedding_generator(dir, unique_ids, ind=0, avg=False):
  """
    Generator for obtaining embeddings from a directory.
  """
  for id in unique_ids:
    site = 'site_' + get_site(id) + '/'
    sample_dir = dir + site + id + '.npy'

    if avg:
      yield np.mean(np.load(sample_dir), axis=0)
    else:
      yield np.load(sample_dir)[ind]


def compute_confusion(model, data_dir, test_df, save_file_path, pheno_col):
  """
    Compute confusion matrix for a given model.
  """
  preds = []
  for i, embed in enumerate(embedding_generator(
                                      data_dir,
                                      test_df['UNIQUEID'],
                                      avg=True
                                      )):
    preds.append(model.predict(embed.reshape(1, -1)))

  matrix = confusion_matrix(test_df[pheno_col], preds)
  np.save(save_file_path, matrix)

  return matrix

# Good ol' Logistic Regression

In [3]:
test = []
for i, row in embs_df.iterrows():
  if exists(current_dir + 'site_' + get_site(row['UNIQUEID']) + '/' + row['UNIQUEID'] + '.npy'):
    test.append(row)
test = pd.DataFrame(test)
embs_df = test

In [4]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(embs_df, test_size=0.2)
test_df, val_df = train_test_split(test_df, test_size=0.5)

In [None]:
import gc

pheno_col = 'RIF_BINARY_PHENOTYPE'
prepend = 'avg_'

weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(embs_df[pheno_col]),
                               y=embs_df[pheno_col])
weights = dict(zip(np.unique(embs_df[pheno_col]), weights))
lr = SGDClassifier(loss='log_loss', class_weight=weights)

confusions = []
errors = []
for k in range(10):
  for i, embed in tqdm(enumerate(
                        embedding_generator(
                            current_dir,
                            train_df['UNIQUEID'],
                            avg=True
                            )
                        )):
    if len(embed) != 4096:
      errors.append((i, embed))
      continue

    lr.partial_fit(embed.reshape(1, -1),
                        [train_df[pheno_col].iloc[i]],
                        classes=np.unique(embs_df[pheno_col]))
    del embed
    gc.collect()

  matrix = compute_confusion(lr,
                             current_dir,
                             val_df,
                             current_dir + prepend + 'lr_confusion_' + str(k) + '.npy',
                             pheno_col)
  confusions.append(matrix)
  print(matrix)

np.save(emb_dir + prepend + 'lr_model.npy', lr)
np.save(emb_dir + prepend+ 'lr_confusions.npy', confusions)

676it [08:35,  1.31it/s]


ValueError: cannot reshape array of size 12272 into shape (500,4096)