# Summary

In [11]:
"""
Mount Drive onto this notebook. Note that there is a specific
file structure. In particular, we have
-- MyDrive
-- -- EVO
-- -- -- vcfs
-- -- -- CRyPTIC_reuse_table_20231208.csv
-- -- -- h37rv_genebank_flatfile.gbff

(This will need to be updated!)
"""

import os

import pandas as pd
import numpy as np
import sklearn as sk

from tqdm import tqdm
from google.colab import drive
# Mount Google Drive so that the /content/drive paths below resolve.
drive.mount('/content/drive')

# Project root on Drive; every other location is derived from it.
# All directory strings carry a trailing slash because later code
# builds file paths by plain string concatenation.
evo_general_dir = '/content/drive/MyDrive/EVO/'
samples_dir = 'vcfs/'

# Embedding directories, all nested under emb_dir.
emb_dir = f'{evo_general_dir}emb_embeddings_v1/'
emb_left_dir = f'{emb_dir}embeds_1.0_left/'
emb_right_dir = f'{emb_dir}embeds_1.0_right/'
emb_full_dir = f'{emb_dir}embeds_1.0_full/'

h37rv_genome_file = 'GCF_000195955.2_ASM19595v2_genomic.fna'
cryptic_general_file = 'CRyPTIC_reuse_table_20231208.csv'

unique_ids = np.load(f'{emb_dir}unique_ids.npy')
reuse_df = pd.read_csv(f'{evo_general_dir}{cryptic_general_file}')

# Keep only the sample id and the binary phenotype column for `med`.
med = 'EMB'
df = pd.read_csv(f'{emb_dir}embs_df.csv')[['UNIQUEID', f'{med}_BINARY_PHENOTYPE']]

# Restrict to ids listed in unique_ids, drop rows with a missing value,
# and renumber the index from 0.
embs_df = (
    df.loc[df['UNIQUEID'].isin(unique_ids)]
      .dropna()
      .reset_index(drop=True)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Utilities

In [13]:
from sklearn.metrics import confusion_matrix


def get_site(unique_id):
  """
    Return the second dot-separated field of `unique_id`.

    Callers use this value as the site component of a sample's directory
    name. Raises IndexError when `unique_id` contains no '.'.
  """
  # Only the second field is needed, so stop splitting after it.
  fields = unique_id.split('.', 2)
  return fields[1]


def compute_confusion(model, data_dir, test_df, save_file_path, pheno_col):
  """
    Evaluate `model` on the samples in `test_df` and return the confusion
    matrix, which is also saved to `save_file_path` with np.save.

    model          -- fitted estimator exposing `predict`
    data_dir       -- directory passed to `embedding_generator`
    test_df        -- frame with a 'UNIQUEID' column and a `pheno_col` column
    save_file_path -- destination for the saved matrix
    pheno_col      -- name of the column holding the true labels
  """
  embeddings = embedding_generator(data_dir, test_df['UNIQUEID'], ind=-1)

  # Predict one sample at a time; each embedding is reshaped to a single row.
  preds = [model.predict(vector.reshape(1, -1)) for vector in embeddings]

  matrix = confusion_matrix(test_df[pheno_col], preds)
  np.save(save_file_path, matrix)
  return matrix


def embedding_generator(dir, unique_ids, ind=-1):
  """
    Generator for obtaining embeddings from a directory.

    For every id in `unique_ids`, loads
    `<dir>/site_<site>/<id>.npy` (where <site> comes from `get_site`) and
    yields element `[0][ind]` of the stored array.

    dir        -- root directory of the embeddings; a trailing slash is
                  accepted but no longer required
    unique_ids -- iterable of sample id strings
    ind        -- index selected along the second axis of the stored array;
                  the default of -1 selects the last entry

    Raises whatever np.load raises if a sample file is missing.
  """
  for unique_id in unique_ids:
    site_dir = 'site_' + get_site(unique_id)
    sample_path = os.path.join(dir, site_dir, unique_id + '.npy')
    # Honour `ind`; previously the index was hard-coded to -1 and the
    # argument was silently ignored.
    yield np.load(sample_path)[0][ind]



# Good ol' Logistic Regression

In [14]:
# Extract one vector per sample from the left embeddings and store each
# as its own .npy file, named after the sample id.
current_dir = emb_left_dir
out_dir = emb_dir + 'embeds_1.0_left_index_0/'
# NOTE(review): the directory name says "index_0" but ind=-1 is requested
# below -- confirm which index is intended.
os.makedirs(out_dir, exist_ok=True)

sample_ids = embs_df['UNIQUEID'].tolist()

# Pair each id with its embedding directly instead of looking the id up
# again with .loc[i], which only works while the frame's index is 0..n-1.
for uid, embed in tqdm(
    zip(sample_ids,
        embedding_generator(current_dir, sample_ids, ind=-1)),
    total=len(sample_ids)):
  np.save(out_dir + uid + '.npy', embed)

12100it [2:07:07,  1.59it/s]


In [10]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

# 80/10/10 train/test/val split. Seeded so that the confusion matrices
# written to disk below refer to the same samples on every run.
train_df, test_df = train_test_split(embs_df, test_size=0.2, random_state=0)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=0)

import gc

pheno_col = 'EMB_BINARY_PHENOTYPE'

# Label set and balanced class weights, computed once up front.
classes = np.unique(embs_df[pheno_col])
weights = compute_class_weight(class_weight='balanced',
                               classes=classes,
                               y=embs_df[pheno_col])
weights = dict(zip(classes, weights))
lr = SGDClassifier(loss='log_loss', class_weight=weights)

emb_type = 'left'
current_dir = emb_left_dir
confusions = []
errors = []

# Create the output directory before training so a missing folder does not
# surface only after a full (hours-long) epoch.
results_dir = emb_dir + 'results/'
os.makedirs(results_dir, exist_ok=True)

train_labels = train_df[pheno_col].to_numpy()

for k in range(10):
  for i, embed in tqdm(enumerate(
                        embedding_generator(
                            current_dir,
                            train_df['UNIQUEID'],
                            ind=-1
                            )
                        ), total=len(train_df)):
    # Skip (and record) vectors that are not 4096 long.
    if len(embed) != 4096:
      errors.append((i, embed))
      continue

    # One SGD step per sample; `classes` must be passed to partial_fit.
    lr.partial_fit(embed.reshape(1, -1),
                   [train_labels[i]],
                   classes=classes)

  # Collect once per epoch rather than once per sample.
  gc.collect()

  # Validation confusion matrix for this epoch. The file prefix follows
  # emb_type (it was previously hard-coded to 'right').
  matrix = compute_confusion(lr,
                             current_dir,
                             val_df,
                             results_dir + emb_type + '_lr_confusion_' + str(k) + '.npy',
                             pheno_col)
  confusions.append(matrix)
  print(matrix)

# np.save pickles the estimator as an object array; loading it back
# requires np.load(..., allow_pickle=True).
np.save(emb_dir + emb_type + '_lr_model.npy', lr)
np.save(emb_dir + emb_type + '_lr_confusions.npy', confusions)

9680it [2:03:23,  1.31it/s]


[[  0   0 150]
 [  0   0 213]
 [  0   0 847]]


2452it [30:28,  1.34it/s]


KeyboardInterrupt: 

## Right embeddings

In [95]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

# Draw the subsample from the filtered rows only: `df` is the raw frame, so
# sampling it directly would bring back ids that are not in `unique_ids`
# and rows with a missing phenotype.
eligible_df = df[df.UNIQUEID.isin(unique_ids)].dropna()
embs_df = (eligible_df
           .sample(n=min(6000, len(eligible_df)), random_state=0)
           .reset_index(drop=True))

# 80/10/10 train/test/val split, seeded for reproducibility.
train_df, test_df = train_test_split(embs_df, test_size=0.2, random_state=0)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=0)

In [108]:
current_dir = emb_right_dir
pheno_col = 'EMB_BINARY_PHENOTYPE'
batch_size = 10

# Label set and balanced class weights, computed once up front.
classes = np.unique(embs_df[pheno_col])
weights = compute_class_weight(class_weight='balanced',
                               classes=classes,
                               y=embs_df[pheno_col])
weights = dict(zip(classes, weights))
# NOTE(review): loss='hinge' fits a linear SVM rather than logistic
# regression, despite the `_lr` naming -- confirm this is intended.
right_lr = SGDClassifier(loss='hinge', class_weight=weights)

train_ids = train_df['UNIQUEID']
train_labels = train_df[pheno_col].to_numpy()

confusions = []
for k in range(10):
  # embedding_generator yields one vector per id and takes no batch_size,
  # so mini-batches of `batch_size` samples are assembled here.
  for start in tqdm(range(0, len(train_df), batch_size)):
    stop = start + batch_size
    # np.stack raises if the vectors in a batch differ in shape.
    X = np.stack(list(embedding_generator(current_dir,
                                          train_ids.iloc[start:stop],
                                          ind=-1)))
    Y = train_labels[start:stop]

    right_lr.partial_fit(X,
                         Y,
                         classes=classes)

  # compute_confusion requires the label column as its fifth argument.
  matrix = compute_confusion(right_lr,
                              current_dir,
                              val_df,
                              current_dir + 'right_lr_confusion_' + str(k) + '.npy',
                              pheno_col)
  confusions.append(matrix)
  print(matrix)

# np.save pickles the estimator as an object array; loading it back
# requires np.load(..., allow_pickle=True).
np.save(emb_dir + 'right_lr_model.npy', right_lr)

7it [00:31,  4.48s/it]


KeyboardInterrupt: 

## Left embeddings

In [3]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

# 80/10/10 train/test/val split of embs_df (which must already be defined
# by the setup cell). Seeded so the split is the same on every run.
train_df, test_df = train_test_split(embs_df, test_size=0.2, random_state=0)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=0)

NameError: name 'embs_df' is not defined

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

pheno_col = 'EMB_BINARY_PHENOTYPE'

# Label set and balanced class weights, computed once up front.
classes = np.unique(embs_df[pheno_col])
weights = compute_class_weight(class_weight='balanced',
                               classes=classes,
                               y=embs_df[pheno_col])
weights = dict(zip(classes, weights))
# NOTE(review): loss='hinge' fits a linear SVM rather than logistic
# regression, despite the `_lr` naming -- confirm this is intended.
left_lr = SGDClassifier(loss='hinge', class_weight=weights)

train_labels = train_df[pheno_col].to_numpy()

for k in range(10):
  # One SGD step per training sample.
  for i, embed in tqdm(enumerate(embedding_generator(emb_left_dir, train_df['UNIQUEID'])),
                       total=len(train_df)):
    left_lr.partial_fit(embed.reshape(1, -1),
                        [train_labels[i]],
                        classes=classes)

  # Validation confusion matrix for this epoch, via the shared helper,
  # which predicts per sample, builds the matrix and saves it.
  matrix = compute_confusion(left_lr,
                             emb_left_dir,
                             val_df,
                             emb_dir + 'left_lr_confusion_' + str(k) + '.npy',
                             pheno_col)

# np.save pickles the estimator as an object array; loading it back
# requires np.load(..., allow_pickle=True).
np.save(emb_dir + 'left_lr_model.npy', left_lr)

In [8]:
from sklearn.metrics import confusion_matrix

# Predict the held-out test samples one at a time from the left embeddings.
test_embeddings = embedding_generator(emb_left_dir, test_df['UNIQUEID'])
preds = [left_lr.predict(vector.reshape(1, -1))
         for vector in tqdm(test_embeddings)]

# Confusion matrix of true vs. predicted labels, saved next to the embeddings.
matrix = confusion_matrix(test_df['EMB_BINARY_PHENOTYPE'], preds)
np.save(emb_dir + 'left_lr_confusion.npy', matrix)

2420it [27:11,  1.48it/s]
