# Data extraction and preparation

## Overview
1.   Clone repo and import dependencies
2.   Pull paired sequence data from OPIG
3.   Merge the data into a single file and remove sequences with warnings
4.   Convert the dataframe to a FASTA file containing one scFv per antibody (heavy chain + linker + light chain)
5.   Cluster the scFv sequences with MMseqs2


In [1]:
# @title Clone repo for running project and install dependencies { display-mode: "form" }
token = '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' #token specifically for reading and running code
username = ''
repo = 'msc-project-source-code-files-22-23-mbalmf01'
!git clone https://{token}@github.com/Birkbeck/msc-project-source-code-files-22-23-mbalmf01
!pip install biopython
!mkdir /content/all_paired
!mkdir /content/all_paired/opig_data

import sys, pandas as pd, os, time
sys.path.append('/content/msc-project-source-code-files-22-23-mbalmf01/scripts')
from data_preparation import opig_all_paired, install_miniconda, install_mmseqs2
from useful_functions import df_to_fasta, fasta_to_df

install_miniconda()
install_mmseqs2()

Cloning into 'msc-project-source-code-files-22-23-mbalmf01'...
remote: Enumerating objects: 262, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 262 (delta 8), reused 24 (delta 4), pack-reused 218[K
Receiving objects: 100% (262/262), 153.17 MiB | 41.74 MiB/s, done.
Resolving deltas: 100% (115/115), done.
Filtering content: 100% (7/7), 3.29 GiB | 125.83 MiB/s, done.
Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [2]:
#installation required at this point for unknown reasons
%%capture
!pip3 install pandas

In [3]:
#@title Pull data from OPIG and compile for processing
# opig_all_paired is imported from the repo's data_preparation module in the
# first cell. NOTE(review): presumably it downloads the paired OPIG files into
# /content/all_paired/opig_data - confirm against scripts/data_preparation.py.
opig_all_paired()
# Run the same module as a standalone script in a subprocess.
# NOTE(review): the logged output below (truncation filtering, writing the
# antibodies to file) appears to come from this step - TODO confirm.
!python /content/msc-project-source-code-files-22-23-mbalmf01/scripts/data_preparation.py

Running...
Dataframe contains 1487957 antibodies before data cleaning
77268 antibodies were removed due to truncations
writing 1410689 antibodies to file...


In [4]:
#@title Convert dataframe to a fasta file containing an ScFv (heavy & light chain)
# Date stamp in YYMMDD form, used as the prefix of the prepared CSV file.
# '%y%m%d' yields the same string as slicing '%D' (mm/dd/yy) but does not
# rely on a platform-specific directive or on character positions.
today = time.strftime('%y%m%d')

data_dir = '/content/all_paired'
csv_suffix = '_human_paired_seqs.csv'
csv_path = f'{data_dir}/{today}{csv_suffix}'

if not os.path.exists(csv_path):
    # The data preparation step may have run on a different (UTC) day than
    # this cell. Fall back to the most recent stamped file; YYMMDD prefixes
    # sort chronologically as plain strings.
    stamped_csvs = sorted(name for name in os.listdir(data_dir) if name.endswith(csv_suffix))
    if not stamped_csvs:
        raise FileNotFoundError(
            f'No *{csv_suffix} file found in {data_dir}; run the data preparation cell first'
        )
    csv_path = f'{data_dir}/{stamped_csvs[-1]}'
    print(f'No file stamped {today}; using most recent file {csv_path}')

df = pd.read_csv(csv_path, low_memory=False, index_col=0)

# Peptide linker inserted between the heavy and light chain sequences.
linker = 'SGGSTITSYNVYYTKLSSSGT'

# A scalar string broadcasts across the Series, so there is no need to build
# a list with one linker copy per row.
df['scfv'] = df['sequence_alignment_aa_heavy'] + linker + df['sequence_alignment_aa_light']

df_sub = df[['seq_id', 'scfv']]

# Write one FASTA record per antibody from the seq_id and scfv columns.
df_to_fasta(df_sub, ['seq_id', 'scfv'], '/content/all_paired/paired_human.fasta')

In [None]:
#@title Run MMSeqs2 on ScFv data
# NOTE(review): this adds a Python 3.9 site-packages directory to the kernel's
# import path, while the pip log in the first cell shows a cp310 wheel -
# confirm this path is still needed.
sys.path.append('/usr/local/lib/python3.9/site-packages/')
# Linear-time clustering of the scFv FASTA. Positional arguments are the input
# FASTA, the output prefix (relative to the current working directory) and a
# temporary directory. Options: minimum sequence identity 0.664, coverage
# threshold 0.8, and coverage mode 1 (coverage measured on the target).
!sudo mmseqs easy-linclust /content/all_paired/paired_human.fasta clusterRes /content/all_paired --min-seq-id 0.664 -c 0.8 --cov-mode 1

In [None]:
#@title Optional: Read in fasta data for further processing...
# Load the representative-sequence FASTA and report how many records
# (one per cluster) it contains.
rep_seq_fasta = '/content/clusterRes_rep_seq.fasta'
df = fasta_to_df(rep_seq_fasta)
n_clusters = df.shape[0]
print(f'{n_clusters} clusters generated with MMSeqs2')

103152 clusters generated with MMSeqs2


In [None]:
token = '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'
username = 'mbalmf01'
repo = 'msc-project-source-code-files-22-23-mbalmf01'

!git config --global user.name 'mbalmf01'
!git config --global user.email 'mbalmf01@student.bbk.ac.uk'

In [None]:
# Make the cloned repository the kernel's working directory so that the git
# commands in the following cells operate on it.
os.chdir(r'/content/msc-project-source-code-files-22-23-mbalmf01')

In [None]:
# Review the working-tree changes before staging and committing them.
!git status

On branch driveless
Your branch is up to date with 'origin/driveless'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mdeleted:    230816_aggpred_scores.parquet[m
	[31mmodified:   scripts/__pycache__/useful_functions.cpython-310.pyc[m
	[31mmodified:   scripts/data_preparation.py[m
	[31mdeleted:    scripts/wget_commands.txt[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mdata_files/[m
	[31mscripts/__pycache__/data_preparation.cpython-310.pyc[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# Stage every change in the working tree, including untracked files.
# NOTE(review): this also stages scripts/__pycache__/*.pyc bytecode files
# (see the commit log below); consider ignoring them via .gitignore.
!git add --all
# NOTE(review): `-a` is redundant here because everything was staged above.
!git commit -a -m 'created data_files & transferred data. Finished data prep'
# Publish the commit to the 'driveless' branch on the remote.
!git push origin driveless

[driveless c8a8e9a] created data_files & transferred data. Finished data prep
 5 files changed, 17 insertions(+)
 rename 230816_aggpred_scores.parquet => data_files/230816_aggpred_scores.parquet (100%)
 rename {scripts => data_files}/wget_commands.txt (100%)
 create mode 100644 scripts/__pycache__/data_preparation.cpython-310.pyc
Enumerating objects: 12, done.
Counting objects: 100% (12/12), done.
Delta compression using up to 8 threads
Compressing objects: 100% (8/8), done.
Writing objects: 100% (8/8), 8.24 KiB | 8.24 MiB/s, done.
Total 8 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/Birkbeck/msc-project-source-code-files-22-23-mbalmf01
   6e68c66..c8a8e9a  driveless -> driveless


In [None]:
from google.colab import runtime


def disconnect_and_delete_runtime():
    """Release the current Colab runtime by calling ``runtime.unassign()``."""
    runtime.unassign()


# Executed as the final cell: once all preceding work is done, give up the
# runtime. In a notebook kernel ``__name__`` is "__main__", so the call runs.
if __name__ == "__main__":
    disconnect_and_delete_runtime()