In [None]:
import os
import subprocess

In [None]:
# ---------------------------------------------------------------------------
# Encoding hyperparameters
# ---------------------------------------------------------------------------
kmer = 5    # k-mer config
tep = 0     # Tsallis Entropic Parameter
th = 10     # threshold
sw = 5      # sliding-window
ws = 1      # window-step

# ---------------------------------------------------------------------------
# Parameters for model tuning and dataset locations
# ---------------------------------------------------------------------------
n_fold = 10

# Where the raw dataset is read from.
input_data_folder = "Data"
input_dataset = "Psi_Site_Chen"

# Where the encoded dataset is written. The dataset name embeds every encoding
# hyperparameter, in the order kmer, tep, th, sw, ws.
output_data_folder = "Data"
output_dataset = "Psi_Site_Chen_MathFeature_Latest_{}_{}_{}_{}_{}".format(
    kmer, tep, th, sw, ws
)

In [None]:
##################################################################################
##### Checking the directory
##################################################################################

dataset_setting_path = os.path.join(input_data_folder, input_dataset)

# os.walk() yields nothing for a missing folder, which would make next() fail
# with a bare StopIteration; fail with an explicit message instead.
if not os.path.isdir(dataset_setting_path):
    raise FileNotFoundError(
        "Input dataset folder not found: {}".format(dataset_setting_path)
    )

# First os.walk() entry: (folder, sub-folder names, file names) of the dataset root.
dataset_varieties = next(os.walk(dataset_setting_path))
print(dataset_varieties)

In [None]:
# Print one (folder, file name) pair for every file below the input dataset root.
for folder, _subfolders, filenames in os.walk(dataset_setting_path):
    for filename in filenames:
        print((folder, filename))

## Encodings

### Preprocessing Step

In [None]:
##################################################################################
##### To eliminate any noise from the sequences (e.g., other letters as: N, K, …)
##################################################################################

# Root of the preprocessed copy of the dataset; later cells walk this folder.
out_path_datasets = os.path.join(output_data_folder, output_dataset)

for root, dirs, files in os.walk(dataset_setting_path):

    # Mirror the input folder layout under the output root. Working from the
    # relative path (rather than a string replace of the dataset name) keeps the
    # mapping correct even when that name also occurs elsewhere in the path.
    relative_root = os.path.relpath(root, dataset_setting_path)
    write_file_path = os.path.normpath(os.path.join(out_path_datasets, relative_root))

    for file in files:

        read_file_full_path = os.path.join(root, file)
        write_file_full_path = os.path.join(write_file_path, file)

        # Created inside the file loop so folders without files are not mirrored.
        os.makedirs(write_file_path, exist_ok=True)

        # Argument list, no shell: paths containing spaces or shell
        # metacharacters reach the script unchanged.
        completed = subprocess.run([
            "python",
            "MathFeature_Latest/preprocessing/preprocessing.py",
            "-i", read_file_full_path,
            "-o", write_file_full_path,
        ])

        # Report failures instead of silently continuing with a missing output.
        if completed.returncode != 0:
            print("Preprocessing failed (exit code {}) for {}".format(
                completed.returncode, read_file_full_path))

#### Preprocessed file list

In [None]:
# Every ".txt" file below the preprocessed output root, as (folder, file name) pairs.
preprocessed_files_list = [
    (folder, filename)
    for folder, _subfolders, filenames in os.walk(out_path_datasets)
    for filename in filenames
    if os.path.splitext(filename)[-1] == ".txt"
]

In [None]:
# preprocessed_files_list = [('Generated\\{}_setting1\\Datasets\\Setting1\\Drosophila',
#   'nucleosomes_vs_linkers_melanogaster.txt'),
#  ('Generated\\{}_setting1\\Datasets\\Setting1\\Elegans',
#   'nucleosomes_vs_linkers_elegans.txt'),
#  ('Generated\\{}_setting1\\Datasets\\Setting1\\Homo_Sapiens',
#   'nucleosomes_vs_linkers_sapiens.txt'),
#  ('Generated\\{}_setting1\\Datasets\\Setting1\\Yeast',
#   'nucleosomes_vs_linkers_yeast.txt')]

In [None]:
# Display the collected (folder, file name) pairs that the encoding cells iterate over.
preprocessed_files_list

### Numerical Mapping

In [None]:
for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)

    !python MathFeature_Latest/methods/MappingClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NM-binary.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 1
    !python MathFeature_Latest/methods/MappingClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NM-zcurve.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 2
    !python MathFeature_Latest/methods/MappingClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NM-real.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 3
    !python MathFeature_Latest/methods/MappingClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NM-integer.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 4
    !python MathFeature_Latest/methods/MappingClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NM-eiip.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 5
    !python MathFeature_Latest/methods/MappingClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NM-complex.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 6
    !python MathFeature_Latest/methods/MappingClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NM-atomic.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 7


### Numerical Mapping - Fourier Transform

In [None]:
for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)

    !python MathFeature_Latest/methods/FourierClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NMFT-binary.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 1
    !python MathFeature_Latest/methods/FourierClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NMFT-zcurve.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 2
    !python MathFeature_Latest/methods/FourierClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NMFT-real.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 3
    !python MathFeature_Latest/methods/FourierClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NMFT-integer.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 4
    !python MathFeature_Latest/methods/FourierClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NMFT-eiip.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 5
    !python MathFeature_Latest/methods/FourierClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NMFT-complex.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 6
    !python MathFeature_Latest/methods/FourierClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NMFT-atomic.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 7


### Chaos Game Representation

In [None]:
## some options use kmer

for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/ChaosGameTheory_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Chaos-classic.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 1
#     !python MathFeature_Latest/methods/ChaosGameTheory_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Chaos-frequency.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 2 -f {kmer}
    !python MathFeature_Latest/methods/ChaosGameTheory_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Chaos-classic-signal.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 3
#     !python MathFeature_Latest/methods/ChaosGameTheory_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Chaos-frequency-signal.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 4 -f {kmer}


### Entropy - Shannon and Tsallis

#### Shannon

In [None]:
## uses kmer

for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/EntropyClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Entropy-shannon.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -e Shannon -k {kmer}


#### Tsallis

In [None]:
## uses kmer, and tsallis entropic parameter

for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/TsallisEntropy_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Entropy-tsallis.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -q {tep} -k {kmer}


### Complex Networks

#### Version 1

In [None]:
## uses pattern (kmer) and threshold

for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/ComplexNetworksClass_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Complex-Network.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -k {kmer} -t {th}


#### Version 2 - no threshold

In [None]:
## uses pattern (kmer)

for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/ComplexNetworksClass-v2_Mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Complex-Network-v2.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -k {kmer}


### Customizable k-mer, NAC, DNC, TNC

In [None]:
## uses kmer, sliding window, and window step

for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/ExtractionTechniques_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_NAC.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -seq 2 -t NAC
    !python MathFeature_Latest/methods/ExtractionTechniques_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_DNC.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -seq 2 -t DNC
    !python MathFeature_Latest/methods/ExtractionTechniques_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_TNC.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -seq 2 -t TNC
    !python MathFeature_Latest/methods/ExtractionTechniques_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_kmer.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -seq 2 -t kmer -k {kmer}
    !python MathFeature_Latest/methods/ExtractionTechniques_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_rckmer.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -seq 2 -t rckmer -k {kmer}
    !python MathFeature_Latest/methods/ExtractionTechniques_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_kstep.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -seq 2 -t kstep -k {kmer} -w {sw} -s {ws}


### Accumulated Nucleotide Frequency (ANF)

In [None]:
for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/AccumulatedNucleotideFrequency_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_ANF.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 1
    !python MathFeature_Latest/methods/AccumulatedNucleotideFrequency_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_ANFF.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -r 2
    
#     command = "python MathFeature_Latest/methods/AccumulatedNucleotideFrequency_mod.py -i {} -l {} -o {} -r {}".format(input_file_full_path, current_dataset_variety, "_ANF.".join(input_file_full_path.split(".")).replace(".txt", ".csv"), 1)
#     os.system(command)
    
#     command = "python MathFeature_Latest/methods/AccumulatedNucleotideFrequency_mod.py -i {} -l {} -o {} -r {}".format(input_file_full_path, current_dataset_variety, "_ANFF.".join(input_file_full_path.split(".")).replace(".txt", ".csv"), 2)
#     os.system(command)

### Open Reading Frame (ORF) descriptor

In [None]:
for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/CodingClass_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_ORF.".join(input_file_full_path.split(".")).replace(".txt", ".csv")}


### Fickett score

In [None]:
for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/FickettScore_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_Fickett.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -seq 1


### Pseudo k-tuple nucleotide composition - PseKNC

In [None]:
for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/PseKNC_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_PseKNC1-DN.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -x MathFeature_Latest/files/propNames-RNA-k2.txt -xp MathFeature_Latest/files/propValues-RNA-k2.txt -seq 2 -t 1 -k 2 -j 1 -w 1.0 -s 2
    !python MathFeature_Latest/methods/PseKNC_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_PseKNC2-DN.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -x MathFeature_Latest/files/propNames-RNA-k2.txt -xp MathFeature_Latest/files/propValues-RNA-k2.txt -seq 2 -t 2 -k 2 -j 1 -w 1.0 -s 2
    
    !python MathFeature_Latest/methods/PseKNC_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_PseKNC1-TN.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -x MathFeature_Latest/files/propNames-RNA-k2.txt -xp MathFeature_Latest/files/propValues-RNA-k2.txt -seq 2 -t 1 -k 3 -j 1 -w 1.0 -s 2
    !python MathFeature_Latest/methods/PseKNC_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_PseKNC2-TN.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -x MathFeature_Latest/files/propNames-RNA-k2.txt -xp MathFeature_Latest/files/propValues-RNA-k2.txt -seq 2 -t 2 -k 3 -j 1 -w 1.0 -s 2


### Xmer k-Spaced Ymer Composition Frequency - kGap

In [None]:
for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/Kgap_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_kgap.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -k 1 -bef 1 -aft 1 -seq 2


In [None]:
for (root, file) in preprocessed_files_list:
    
    current_dataset_variety = root.split("\\")[len(root.split("\\"))-1]
    input_file_full_path = os.path.join(root, file)
    
    !python MathFeature_Latest/methods/Kgap_mod.py -i {input_file_full_path} -l {current_dataset_variety} -o {"_kgap2.".join(input_file_full_path.split(".")).replace(".txt", ".csv")} -k 2 -bef 2 -aft 2 -seq 2


### Join ALL embeddings

In [None]:
import pandas as pd

In [None]:
# Dataset varieties to join. Each entry is compared against the variety parsed
# from a CSV file name (everything before the last "_" of the stem).
parent_file_types = ["HS_990", "MM_944", "SS_628"]

In [None]:
# CSV file names (exact match) to leave out of the join.
reject_list_file = []
# Encoding tags (the last "_"-separated part of a CSV stem) to leave out of the join.
reject_list_encoding = ['NM-complex', 'Complex-Network']

In [None]:
# The joined files are written to a sibling "<output_dataset>_ALL" folder, so the
# set of per-encoding CSVs is the same for every parent: walk the tree only once.
encoded_csv_files = [
    (root, file)
    for root, dirs, files in os.walk(out_path_datasets)
    for file in files
    if (os.path.splitext(file)[-1] == '.csv') and (file not in reject_list_file)
]

for file_parent in parent_file_types:

    # Accumulates the join of all accepted encodings for this parent.
    df_all = None

    for root, file in encoded_csv_files:

        # The last "_"-separated token of the file stem is the encoding tag;
        # everything before it is the dataset variety.
        name_parts = file.split(".")[0].split("_")
        current_dataset_variety = "_".join(name_parts[:-1])
        encoding_type = name_parts[-1]

        if (current_dataset_variety != file_parent) or (encoding_type in reject_list_encoding):
            continue

        print('Parent:', file_parent)
        print('File:', file)
        print('Vareity:',current_dataset_variety)
        print('Encoding:', encoding_type)
        print()

        ##################################################################################
        ##### read the current file
        ##################################################################################

        input_file_full_path = os.path.join(root, file)

        ## check if input file has header
        # "with" closes the handle even if reading the first line fails.
        with open(input_file_full_path, "r") as file_obj:
            first_line = file_obj.readline()

        file_has_header = None
        if first_line.split(",")[0] == "nameseq" or first_line.replace("\n", "").split(",")[-1] == "label":
            file_has_header = 0

        sequences_df = pd.read_csv(input_file_full_path, header = file_has_header)

        # fixing column headers: prefix every column with the encoding tag so the
        # columns of different encodings cannot collide in the join.
        # NOTE(review): assumes the first column is the sequence name and the
        # last column is the label in every CSV — confirm for all encoders.
        col_names = [encoding_type + '_'+ str(val) for val in list(sequences_df.columns)]
        col_names[0] = 'nameseq'
        col_names[-1] = 'label'
        sequences_df.columns = col_names

        # The label is dropped from every encoding, so the joined file holds
        # only 'nameseq' plus feature columns.
        sequences_df = sequences_df.drop('label', axis = 1)

        # joining all data
        if df_all is None:
            df_all = sequences_df
        else:
            rows_before = len(df_all)
            df_all = pd.merge(
                df_all,
                sequences_df,
                how="inner",
                on='nameseq',
                validate='1:1',
            )
            # An inner join silently discards sequences missing from either side.
            if len(df_all) != rows_before:
                print('Warning: joining {} dropped {} row(s) not present in every encoding'.format(
                    file, rows_before - len(df_all)))
                print()

    out_path = os.path.join(output_data_folder, output_dataset+'_ALL')
    out_file = file_parent+'_ALL.csv'

    os.makedirs(out_path, exist_ok=True)

    # Nothing is written when no CSV matched this parent.
    if df_all is not None:
        df_all.to_csv(os.path.join(out_path, out_file), index=False)

