<a href="https://colab.research.google.com/github/lphohmann/BINP37_Research_project/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Mounting the drive and importing modules

In [1]:
# Mounting Google Drive is currently disabled; uncomment the two lines below when the
# drive is not mounted yet (NOTE(review): the %cd commands below presumably rely on it).
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)
# Change to the working directory that contains your data folder. The path is entered one
# component at a time because the directory was not found when the whole path was given as one string.
%cd drive/
%cd MyDrive/
%cd DL_project/ 

/content/drive
/content/drive/MyDrive
/content/drive/MyDrive/DL_project


In [2]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Step 2: Create a pandas dataframe from the fasta file with the protein sequences and their associated functional annotations (in the form of K numbers)

In [3]:
# defining functions to load the protein sequences into panda dataframe 
# create dictionary from input fasta file where K number is key and the associated seqs the values
def knum_dict(file):
    """Group the protein sequences of a single-line FASTA file by K number.

    Every record is expected to span exactly two lines: a header line that
    starts with '>' and whose second whitespace-separated field is the
    K number, followed by one line holding the complete sequence. Blank
    lines between records (or at the end of the file) are ignored.

    Parameters
    ----------
    file : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Maps each K number to the list of its sequences (surrounding
        whitespace stripped, upper-cased) in file order. When a malformed
        record is met, a message is printed, reading stops, and the records
        collected up to that point are returned.
    """
    wrong_format_msg = "The file is in the wrong format, a single line fasta file is required"
    k_dic = {}
    with open(file) as input_file:
        for line in input_file:
            if not line.strip():
                # blank lines carry no record; skipping them keeps a stray empty
                # line from discarding every record that follows it
                continue
            if not line.startswith('>'):
                # sequence data without a preceding header, e.g. a multi-line fasta file
                print(wrong_format_msg)
                break
            fields = line.split()
            # default of None: a header on the very last line must not raise StopIteration
            seq_line = next(input_file, None)
            if len(fields) < 2:
                print("The file is in the wrong format, header without a K number: " + line.strip())
                break
            if seq_line is None or not seq_line.strip() or seq_line.startswith('>'):
                # the header has no sequence line; never store the next header as a sequence
                print(wrong_format_msg)
                break
            k_dic.setdefault(fields[1], []).append(seq_line.strip().upper())
    return k_dic

# create the pandas dataframe based on that dictionary
def dict_to_df(dict):
    """Flatten a {K number: [sequences]} mapping into a two-column DataFrame.

    Parameters
    ----------
    dict : dict
        Maps each K number to a list of sequences. (The parameter name shadows
        the built-in ``dict``; it is kept so existing keyword calls still work.)

    Returns
    -------
    pandas.DataFrame
        One row per sequence with the columns 'Knum' and 'Seq' and a fresh
        0..n-1 index. Rows follow the key order of the mapping and, within a
        key, the order of its sequence list. An empty mapping gives an empty
        DataFrame with the same two columns.
    """
    knums = []
    seqs = []
    # build both columns once instead of concatenating one DataFrame per key,
    # which copied all previously collected rows on every iteration
    for knum, seq_list in dict.items():
        seq_list = list(seq_list)
        knums.extend([knum] * len(seq_list))
        seqs.extend(seq_list)
    return pd.DataFrame({'Knum': knums, 'Seq': seqs})

In [4]:
# running code
# path of the fasta file holding the protein sequences (point this at your own input file)
input_file = 'data/CH4_database_protein.faa'

# parse the fasta file into {K number: [sequences]}, then flatten it to one row per sequence
k_dic = knum_dict(input_file)
k_df = dict_to_df(k_dic)

# keep only the K numbers that have MORE than `min_seq_cutoff` associated sequences
# NOTE(review): the comparison is strict, so a K number with exactly 500 sequences is
# dropped as well -- confirm whether ">=" was intended
min_seq_cutoff = 500

def _has_enough_seqs(knum_group):
    """Return True when the group of one K number exceeds the sequence cutoff."""
    return len(knum_group) > min_seq_cutoff

filt_k_df = k_df.groupby("Knum").filter(_has_enough_seqs)

In [5]:
# looking into parameters about the data which are needed to decide on parameters for the transform functions when loading the inputs for the model
# get average seq length (computed on the filtered dataframe)
avg_seqlen = filt_k_df['Seq'].apply(len).mean()
print("average sequence length:",avg_seqlen) # ~260 in the recorded run
# the unique characters in the sequences; used for later encoding the sequences
# Every parsed sequence is scanned: sampling only the first two sequences per K number
# could miss characters and failed for K numbers that have a single sequence.
distinct_aa = set()
for seq_list in k_dic.values():
    for seq in seq_list:
        distinct_aa.update(seq)
# The recorded run (which only sampled two sequences per K number) showed the 20 amino acid
# letters plus '*'.
# NOTE(review): '*' is presumably a stop/terminator symbol -- confirm how the encoding treats it.
print(distinct_aa)

average sequence length: 260.72583836055605
{'Y', 'G', 'A', 'F', 'H', 'E', 'K', 'N', 'L', 'D', 'M', 'P', 'V', 'W', 'Q', 'S', '*', 'C', 'I', 'R', 'T'}


# Step 3: Stratified random split (by K number) of the dataframe into a trainval set and a test set


In [6]:
# hold out 10% of the filtered sequences as the test set, stratified on the K number;
# the remaining trainval set is the source of the later training and validation sets
trainval, test = train_test_split(
    filt_k_df,
    test_size=0.1,
    random_state=42,
    stratify=filt_k_df[['Knum']],
)

# write both splits as csv files into the data directory
for split_df, csv_path in ((trainval, 'data/trainval.csv'), (test, 'data/test.csv')):
    split_df.to_csv(csv_path)

# The 80/20 split of trainval into training and validation set was moved to the
# DataBlock step that loads the data for the model; the former code is kept for reference:
#training, validation = train_test_split(trainval, test_size=0.2, random_state=42, stratify=trainval[['Knum']])
# save these sets in .csv files
#training.to_csv('data/training.csv')
#validation.to_csv('data/validation.csv')