# This script reads FASTA sequences and aligns them to assess their similarity

In [7]:
#import modules
import pandas as pd
import numpy as np
import Bio
from Bio import Seq
from Bio import SeqIO
import torch
import matplotlib.pyplot as plt
import sys
from torch.utils.data import TensorDataset, DataLoader

In [23]:
#.seq file input code
def load_seq(filepath):
    """Read a whitespace-delimited .seq file into a sequence -> name dict.

    Each non-blank line is expected to carry a sequence as its first field and
    a name as its last field. When the same sequence occurs on several lines,
    the name from its first occurrence is kept and later ones are ignored.

    Args:
        filepath: path of the .seq file to read.

    Returns:
        dict mapping every distinct sequence string to its name string.
        Blank lines contribute nothing; an empty file yields an empty dict.
    """
    seqname_pair = {}
    with open(filepath, 'r') as f:  #the with block closes the file on exit
        for line in f:  #stream line by line instead of loading the whole file
            #split on any run of whitespace so tab-delimited or indented
            #lines are parsed the same way as single-space-delimited ones
            fields = line.split()
            if not fields:
                continue  #blank line: would otherwise be stored as '' -> ''
            #setdefault keeps the first name seen for a repeated sequence
            seqname_pair.setdefault(fields[0], fields[-1])
    return seqname_pair


In [24]:
#test the .seq file input code
#'example.seq' is a relative path, so it is resolved against the current working directory
#the call is the last expression of the cell, so the returned dict is echoed as the cell output
load_seq('example.seq')

{'SEYAVARQLEHAKRVPDPVIARAALRAAEINAESRGDPELRELVKRVAEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_0_af2pred',
 'SEYAVRAQLEHAKRVPDPVIRRAALRAAEINARSRGDPELRELVKRLLEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_1_af2pred',
 'SEYAVRRQLEHAKRVEDPVIREAALRAVEINARSRGDPELRELLKRVLEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_2_af2pred',
 'SEYAVRRQLEHAKRVPDPVIRRAALRAAAINAESSGNPELRELVKRLLEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_3_af2pred',
 'SEYAVARQLEHAKRVPDPVIVRAALRAVEINLESRGDPELRRLVKRVLEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_4_af2pred',
 'SPYAVRAQLEHAKRVPDPVIRRAALRAAEINARSPEFPELRRLVKRLLEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_5_af2pred',
 'SPYAVRRQLEHAERVPDPVIRRAALRAAEINARSRGDPELRELVKRVLEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_6_af2pred',
 'SEYAILRQLEHAKRVEDPVIRRAALRAAEINAESSGNPELRRLVKRALEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_7_af2pred',
 'SEYAVRRQLEHAERVPDPVIRRAALRAAEINARSSGDPELRELVKEVLEEL': 'HHH_b2_01365_000000198_0001_af2pred_0001_8_af2pred',
 'SEYAVRRQ

In [21]:
#define a new load seq function for fasta files
def load_seq_fasta(filepath):
    """Parse a FASTA file into a sequence -> record id dict.

    Args:
        filepath: path of the FASTA file to read.

    Returns:
        dict mapping str(record.seq) to str(record.id) for every record that
        SeqIO.parse yields. If the same sequence appears under several ids,
        the id of the last such record is the one kept.
    """
    with open(filepath) as fasta_file:  #the with block closes the handle cleanly
        #parse from the already-open handle so the file is opened only once
        return {
            str(seq_record.seq): str(seq_record.id)
            for seq_record in SeqIO.parse(fasta_file, 'fasta')
        }


In [25]:
#test the new fasta loading function
#loads the uniprot thermophilus, mesophilus and psychrophilus fasta files; each name
#below is bound to the dict returned by load_seq_fasta for that file
#NOTE(review): these are absolute paths under /Users/adamchazin-gray - presumably the
#author's machine; adjust them before running anywhere else
thermo_seq_dict = load_seq_fasta('/Users/adamchazin-gray/Documents/data_class/data_sets/thermal_proteins/uniprot-thermophilus.fasta')
meso_seq_dict = load_seq_fasta('/Users/adamchazin-gray/Documents/data_class/data_sets/thermal_proteins/uniprot-mesophilus.fasta')
psychro_seq_dict = load_seq_fasta('/Users/adamchazin-gray/Documents/data_class/data_sets/thermal_proteins/uniprot-psychrophilus.fasta')




In [None]:
#example code for reading fasta files

#define the fasta_to_classified_df function; which inputs fasta seqs and classifies them in a df
def fasta_to_classified_df(fasta_path, protein_class='', sample=False):
    """Load a FASTA file into a DataFrame labelled with a protein class.

    Args:
        fasta_path: path of the FASTA file to read.
        protein_class: value written to the 'class' column of every row.
        sample: boolean flag; when true, a random 20% of the rows is returned
            instead of the full table (i.e. for Thermo proteins).

    Returns:
        pandas DataFrame with columns 'protein' (record id), 'sequence' and
        'class'. Records sharing an id collapse to one row holding the last
        sequence parsed for that id. A FASTA file with no records gives an
        empty DataFrame that still has the three columns.

    Side effects:
        Prints the number of rows in the returned DataFrame.
    """
    with open(fasta_path) as fasta_file:  #the with block closes the handle cleanly
        #parse from the already-open handle so the file is opened only once
        seq_dict = {
            str(seq_record.id): str(seq_record.seq)
            for seq_record in SeqIO.parse(fasta_file, 'fasta')
        }
    #naming the columns in the constructor keeps this working for an empty
    #file, where assigning two column names afterwards would raise ValueError
    df_seqs = pd.DataFrame(list(seq_dict.items()), columns=['protein', 'sequence'])
    df_seqs['class'] = protein_class    #define the class of each imported fasta
    if sample:
        df_seqs = df_seqs.sample(frac=0.20)    #keep a random 1/5 of the rows
    print(len(df_seqs.index))
    return df_seqs