<a href="https://colab.research.google.com/github/lphohmann/BINP37_DL_protein_classification/blob/main/datapreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the project files are reachable from this Colab runtime.
# force_remount=True re-mounts even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Change into the project folder; all relative paths used below (e.g. 'data/...')
# are resolved against this directory.
# NOTE(review): the path is relative -- presumably assumes the current working
# directory is /content (the mount point's parent); confirm.
%cd drive/MyDrive/DL_project/
# Print the working directory as a sanity check.
!pwd

In [None]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# This cell creates the test set and a combined "trainval" set from which the
# training and validation sets are produced later.
# Input file in FASTA format; the path is relative to the current working directory.
input_file = 'data/CH4_database_protein.faa'

'''
Step 1: load the fasta file with the protein sequences and K numbers into a pandas dataframe
'''

def knum_dict(file):
    """Group the protein sequences of a FASTA file by K number.

    Each record starts with a header line beginning with '>'; the second
    whitespace-separated field of that header is used as the K number.
    All following lines up to the next header are joined into one
    upper-cased sequence, so both one-line and wrapped (multi-line)
    records are accepted. Blank lines are ignored and records without
    any sequence data are skipped.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        A dict mapping each K number to the list of its sequences, in
        file order.

    Raises:
        IndexError: If a header line has no second field.
    """
    k_dic = {}
    header = None  # K number of the record currently being read
    seq_parts = []  # sequence lines collected for that record
    with open(file) as fasta:
        for raw_line in fasta:
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank lines between or after records
            if line.startswith('>'):
                # A new header closes the previous record.
                if header is not None and seq_parts:
                    k_dic.setdefault(header, []).append(''.join(seq_parts))
                header = line.split()[1]
                seq_parts = []
            elif header is None:
                # Sequence data before any header: not a FASTA file.
                print("This should never appear, if yes the file is in the wrong format")
                break
            else:
                seq_parts.append(line.upper())
    # The last record has no following header to close it.
    if header is not None and seq_parts:
        k_dic.setdefault(header, []).append(''.join(seq_parts))
    return k_dic

def dict_to_df(dict):
    """Flatten a {K number: [sequences]} mapping into a two-column DataFrame.

    Args:
        dict: Mapping from K number to a list of sequences. (The parameter
            name shadows the builtin; it is kept so existing callers that
            pass it by keyword keep working.)

    Returns:
        A DataFrame with columns 'Knum' and 'Seq' holding one row per
        sequence, ordered by key and then by position in the list, with a
        fresh 0..n-1 index. An empty mapping yields an empty DataFrame
        with the same two columns.
    """
    knums = []
    seqs = []
    # Build both columns in one pass instead of concatenating one small
    # DataFrame per key, which copies all previous rows on every step.
    for knum, seq_list in dict.items():
        knums.extend([knum] * len(seq_list))
        seqs.extend(seq_list)
    return pd.DataFrame({'Knum': knums, 'Seq': seqs})

# Parse the FASTA file and flatten it into a (Knum, Seq) dataframe.
k_dic = knum_dict(input_file)
k_df = dict_to_df(k_dic)

# Filter out K numbers with fewer than `min_seq_cutoff` associated sequences.
# The comparison is inclusive so that a K number with exactly `min_seq_cutoff`
# sequences is kept, matching "minimum" in the variable name.
min_seq_cutoff = 500
filt_k_df = k_df.groupby("Knum").filter(lambda group: len(group) >= min_seq_cutoff)

'''
Step 2: Stratified random split (stratified on the Knum column) of the pandas dataframe into a combined training/validation set and a test set.
'''

# Split the filtered data into a combined training/validation set ("trainval", 90%)
# and a held-out test set (10%). The split is stratified on the Knum column and
# uses a fixed random_state so it is reproducible.
trainval, test = train_test_split(filt_k_df, test_size=0.1, random_state=42, stratify=filt_k_df[['Knum']])

# Save both sets as csv files in the current working directory.
# NOTE(review): to_csv is called without index=False, so the dataframe index is
# presumably written as an extra first column -- confirm the loader expects that.
trainval.to_csv('trainval.csv')
test.to_csv('test.csv')

# The training/validation split (80/20) is no longer done here; it was moved to
# the DataBlock step that loads the data for the model. The previous code is
# kept below for reference.
#training, validation = train_test_split(trainval, test_size=0.2, random_state=42, stratify=trainval[['Knum']])
# save these dfs in .csv files
#training.to_csv('training.csv')
#validation.to_csv('validation.csv')

In [None]:
# Check some stats about the data to set some parameters for the transform functions.
# Sequence lengths of the filtered dataframe, computed once and reused below.
seq_lengths = filt_k_df['Seq'].map(len)
# the average sequence length
avg_seqlen = seq_lengths.mean()
print("average sequence length:",avg_seqlen) # 260
# the max sequence length
print("max sequence length:",seq_lengths.max()) #2818
# The unique characters in the sequences. Every sequence of every K number in
# k_dic is scanned (not only the first two per K number), so rare residue codes
# are not missed and K numbers with a single sequence do not raise an IndexError.
# k_dic is the unfiltered dictionary, so this is a superset of the characters
# that occur in filt_k_df.
distinct_aa = set()
for seq_list in k_dic.values():
    for seq in seq_list:
        distinct_aa.update(seq)
# Output recorded when only the first two sequences per K number were sampled:
# {'K', 'D', 'N', 'E', 'R', 'A', 'T', 'L', 'I', 'Q', 'C', 'F', 'G', 'W', 'M', 'S', 'H', 'P', 'V', 'Y'}
print(distinct_aa)