In [1]:
import os
import glob
import pandas as pd
from openpyxl import Workbook

# Step 1: read the TXT files and write them to Excel files
def txt_to_excel(folder_path):
    """Convert each input ``.txt`` file in *folder_path* into an ``.xlsx`` file.

    Every line of a text file is treated as one sequence. The resulting
    workbook has a ``Sequence`` column (the line with surrounding whitespace
    removed) and a ``Length`` column, with rows sorted by ascending length.
    The workbook is saved in *folder_path* under the text file's base name.

    Files whose name ends in ``_1000.txt`` are skipped: that suffix is what
    ``filter_sequences_to_txt`` gives to the files it writes into this same
    folder, so converting them again on a second run would make the pipeline
    feed on its own output.

    Args:
        folder_path: Directory that is searched (non-recursively) for
            ``*.txt`` files and that receives the ``.xlsx`` files.
    """
    for file in glob.glob(os.path.join(folder_path, '*.txt')):
        # Do not re-ingest the filtered files generated by step 2.
        if file.endswith('_1000.txt'):
            continue

        with open(file, 'r') as f:
            sequences = f.readlines()

        # Build the DataFrame of protein sequences.
        df = pd.DataFrame(sequences, columns=['Sequence'])
        df['Sequence'] = df['Sequence'].str.strip()  # drop trailing newlines
        df['Length'] = df['Sequence'].apply(len)
        df.sort_values(by='Length', inplace=True)

        # Create the Excel file next to the source file and write to it.
        base_name = os.path.splitext(os.path.basename(file))[0]
        excel_path = os.path.join(folder_path, base_name + '.xlsx')
        with pd.ExcelWriter(excel_path) as writer:
            df.to_excel(writer, index=False)
            
# Step 2: filter the amino-acid sequences by length and save new TXT files
def filter_sequences_to_txt(folder_path):
    """Write the sequences of length 1..1000 from each workbook to a text file.

    Every ``.xlsx`` file in *folder_path* is expected to have ``Sequence``
    and ``Length`` columns. Rows whose ``Length`` is greater than 0 and at
    most 1000 are written, one sequence per line, to
    ``<workbook base name>_1000.txt`` in the same folder.

    Workbooks whose name ends in ``_prediction.xlsx`` are skipped: they are
    written by ``sequences_txt_to_excel_prediction`` into this same folder
    and contain no ``Length`` column, so reading them here would raise
    ``KeyError`` when the pipeline is run a second time.

    Args:
        folder_path: Directory that is searched (non-recursively) for
            ``*.xlsx`` files and that receives the ``_1000.txt`` files.
    """
    for file in glob.glob(os.path.join(folder_path, '*.xlsx')):
        # Do not treat the step 3 output workbooks as input.
        if file.endswith('_prediction.xlsx'):
            continue

        # keep_default_na=False: pandas would otherwise parse cell text such
        # as "NA" or "NULL" as missing values, turning a sequence spelled
        # that way into a float NaN that cannot be written back as text.
        df = pd.read_excel(file, keep_default_na=False)

        # Keep only sequences whose length is between 1 and 1000.
        in_range = (df['Length'] > 0) & (df['Length'] <= 1000)
        filtered_df = df[in_range]

        base_name = os.path.splitext(os.path.basename(file))[0]
        txt_path = os.path.join(folder_path, base_name + '_1000.txt')

        # Save to the TXT file; str() guards against non-text cell values.
        with open(txt_path, 'w') as f:
            f.writelines(
                "%s\n" % str(sequence).strip()
                for sequence in filtered_df['Sequence']
            )

# Step 3: export the filtered TXT files to new Excel files
def sequences_txt_to_excel_prediction(folder_path):
    """Copy each ``*_1000.txt`` file into a one-column ``Predictions`` sheet.

    For every file in *folder_path* matching ``*_1000.txt``, a new workbook
    is created whose active sheet is titled ``Predictions`` and holds one
    sequence per row in the first column (no header row). The workbook is
    saved in *folder_path* as ``<text file base name>_prediction.xlsx``.

    The file is read line by line rather than through ``pd.read_csv``, which
    raises ``EmptyDataError`` on an empty file and parses lines such as
    ``NA`` as missing values. An empty text file therefore yields a workbook
    with an empty sheet. Blank lines are skipped.

    Args:
        folder_path: Directory that is searched (non-recursively) for
            ``*_1000.txt`` files and that receives the workbooks.
    """
    for file in glob.glob(os.path.join(folder_path, '*_1000.txt')):
        with open(file, 'r') as f:
            sequences = [line.strip() for line in f]

        wb = Workbook()
        ws = wb.active
        ws.title = 'Predictions'

        for seq in sequences:
            if seq:  # ignore blank lines
                ws.append([seq])

        base_name = os.path.splitext(os.path.basename(file))[0]
        wb.save(os.path.join(folder_path, base_name + '_prediction.xlsx'))

# Run the pipeline steps defined above, in order
folder_path = r"folder_file_path"  # set your folder path here
for pipeline_step in (
    txt_to_excel,
    filter_sequences_to_txt,
    sequences_txt_to_excel_prediction,
):
    pipeline_step(folder_path)

print("done")

done
