Use this file as a template to copy and build notebooks

In [None]:
#@title Mount drive and load libraries
%%capture
import os
from google.colab import drive

drive.mount('/content/drive/')
path = '/content/drive/MyDrive/msc_project'
os.chdir(path)

!pip install umap-learn[plot]
!wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh
!chmod +x Miniconda3-py39_23.3.1-0-Linux-x86_64.sh
!bash ./Miniconda3-py39_23.3.1-0-Linux-x86_64.sh -b -f -p /usr/local
!conda install -c conda-forge -c bioconda mmseqs2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment
import umap

In [None]:
def cluster_purity(y_true, y_pred):
    """Score predicted clusters against reference labels after optimal label matching.

    Cluster ids produced by a clustering algorithm are arbitrary, so the
    predicted ids are first matched one-to-one to the reference labels such
    that the number of agreeing samples is maximised (linear assignment on the
    confusion matrix). The score is the fraction of samples that fall on the
    matched (reference label, predicted cluster) pairs.

    Because the matching is one-to-one, a predicted cluster is credited to at
    most one reference label, and vice versa.

    Args:
        y_true: Reference label of every sample (array-like, one entry per sample).
        y_pred: Predicted cluster id of every sample, same length as ``y_true``.

    Returns:
        Fraction of samples on matched pairs, between 0 and 1.
    """
    confomat = confusion_matrix(y_true, y_pred)
    # Solve the label-switching problem on *this call's* matrix. The previous
    # version passed the notebook-level `confusion_matrix_` here, which is
    # undefined on a fresh kernel and stale otherwise.
    row_ind, col_ind = linear_sum_assignment(confomat, maximize=True)
    return confomat[row_ind, col_ind].sum() / np.sum(confomat)

In [None]:
# Load the clustered scFv table from Drive; the first CSV column becomes the index.
df = pd.read_csv(
    '/content/drive/MyDrive/msc_project/all_paired/230722_cluster_tensor_scfv_10000.csv',
    index_col=0,
    low_memory=False,
)

Use UMAP to assess the effect of increasing the number of embedding components and of increasing the number of k-means clusters

In [None]:
# Reference labels: the `cluster` column of the table
# (presumably the MMseqs2 cluster assignment -- confirm against the CSV).
mmcluster = df['cluster']

# Baseline 10-component UMAP embedding, seeded for repeatability.
# Every column from the third onward is used as input
# (assumed to be the feature columns -- confirm against the CSV).
umap_embedding = umap.UMAP(random_state=42, n_components=10)
redux = umap_embedding.fit_transform(df.iloc[:, 2:])

In [None]:
# Number of distinct reference cluster labels.
n_cluster = len({label for label in mmcluster})

In [None]:
# Sweep the UMAP embedding dimensionality while keeping k-means fixed at one
# cluster per reference cluster, and record the matched-label score each time.
# Note: after this cell `redux` holds the last (50-component) embedding.
components = [2, 5, 10, 20, 30, 40, 50]
cluster_purities = []
for component in components:
    umap_embedding = umap.UMAP(random_state=42, n_components=component)
    redux = umap_embedding.fit_transform(df.iloc[:, 2:])
    kmeans = KMeans(n_clusters=n_cluster, n_init=10, random_state=42)
    # fit_predict fits the model and hands back its labels_ in one call.
    labels = kmeans.fit_predict(redux)
    purity = cluster_purity(y_pred=labels, y_true=mmcluster)
    cluster_purities.append(purity)

print(cluster_purities)

In [None]:
# Sweep the number of k-means clusters on a single, fixed 10-component UMAP
# embedding, and record the matched-label score for each cluster count.
cluster_purities = []
n_clusters = [20, 50, 100, 200, 300, 400, 500]

# The embedding does not depend on the cluster count, so it is computed once.
umap_embedding = umap.UMAP(random_state=42, n_components=10)
redux = umap_embedding.fit_transform(df.iloc[:, 2:])

for cluster in n_clusters:
    kmeans = KMeans(n_clusters=cluster, n_init=10, random_state=42)
    # fit_predict fits the model and hands back its labels_ in one call.
    labels = kmeans.fit_predict(redux)
    purity = cluster_purity(y_pred=labels, y_true=mmcluster)
    cluster_purities.append(purity)

print(cluster_purities)

[0.2664, 0.2859, 0.2913, 0.2583, 0.2406, 0.2346, 0.231]


In [None]:
# Evaluate k-means with two thirds as many clusters as there are reference clusters.
# `redux` is whatever embedding the most recently run cell left behind
# (in notebook order: the 10-component UMAP embedding).
# n_init is pinned to 10 to match every other KMeans call in this notebook;
# left unset, the result depends on the installed scikit-learn's default.
kmeans = KMeans(n_clusters=round(n_cluster / 1.5), random_state=42, n_init=10)
kmeans.fit(redux)
labels = kmeans.labels_
confusion_matrix_ = confusion_matrix(mmcluster, labels)

# Match predicted cluster ids one-to-one to reference labels so that the number of
# agreeing samples is maximal (negated because linear_sum_assignment minimises cost).
row_ind, col_ind = linear_sum_assignment(-confusion_matrix_)
purity = confusion_matrix_[row_ind, col_ind].sum() / np.sum(confusion_matrix_)

print(purity)

0.2264


Assess k-means by varying the MMseqs2 clustering threshold (`--min-seq-id`). TODO: this analysis has not been carried out yet.

In [None]:
os.chdir('/tmp')
!mkdir /tmp/new_tmp
!cp /content/drive/MyDrive/msc_project/all_paired/230716_scfv_10000.fasta /tmp

!sudo mmseqs easy-cluster 230716_scfv_10000.fasta clusterRes new_tmp --min-seq-id 0.8 -c 0.8 --cov-mode 1


In [None]:
%%bash
# Re-cluster the sequences at several MMseqs2 sequence-identity thresholds.
# The loop is shell syntax, so the cell must run under the %%bash cell magic;
# inside bash the commands are written without the IPython `!` prefix.
set -euo pipefail

# Run where the FASTA file was copied for the single-threshold run.
cd /tmp
mkdir -p new_tmp

# List of values for the --min-seq-id parameter
min_seq_id_values=("0.8" "0.9" "0.7")

# Loop through each value for --min-seq-id
for min_seq_id in "${min_seq_id_values[@]}"; do
    # One output prefix per threshold, so a later run does not overwrite an earlier one.
    sudo mmseqs easy-cluster 230716_scfv_10000.fasta "clusterRes_${min_seq_id}" new_tmp \
        --min-seq-id "$min_seq_id" -c 0.8 --cov-mode 1

    # TODO: extract the cluster assignments and pass them to the Python analysis.
    # Placeholder command, not implemented yet:
    # python your_python_script.py input_data_for_python_script
done
