In [3]:
import pandas as pd
from collections import Counter
import math


In [17]:
# Load the input table from a path relative to the notebook's working
# directory. Later cells read its 'DNA' column and add entropy features.
# NOTE(review): the preview printed further down shows an 'Unnamed: 0' column,
# which usually means the CSV was saved together with its index - confirm
# whether that column is wanted in the output.
df = pd.read_csv("../dataset/MG1655GC+CAI+Acimo.csv")

In [4]:
def calculate_shannon_entropy(block, k):
    """Return the Shannon entropy, in bits, of the overlapping k-mers in ``block``.

    Every window ``block[i:i+k]`` is counted, the counts are converted to
    relative frequencies ``p`` and ``-sum(p * log2(p))`` is returned.

    Args:
        block: Sequence to analyse; the notebook passes DNA strings.
        k: Window (k-mer) length.

    Returns:
        The entropy as a float. The result is ``0.0`` when ``block`` is
        shorter than ``k`` (no window fits) and when only one distinct
        k-mer occurs.
    """
    # Count every overlapping subsequence of length k.
    kmer_counts = Counter(block[i:i + k] for i in range(len(block) - k + 1))
    total_kmers = sum(kmer_counts.values())

    # No complete window fits: return a float 0.0 explicitly instead of the
    # integer 0 that negating an empty sum would give.
    if total_kmers == 0:
        return 0.0

    # Probability of each k-mer is its count divided by the number of windows.
    entropy = -sum(
        (count / total_kmers) * math.log2(count / total_kmers)
        for count in kmer_counts.values()
    )

    # With a single distinct k-mer the sum is 0.0 and its negation is -0.0;
    # adding 0.0 turns that into +0.0 and leaves every other value unchanged.
    return entropy + 0.0

In [5]:
def calculate_gibbs_entropy(sequence, k):
    """Return ``-sum(p * log2(p))`` over the overlapping k-mers of ``sequence``.

    ``p`` is the relative frequency of each distinct window
    ``sequence[i:i+k]``.

    NOTE(review): this is the same formula, with the same base-2 logarithm,
    as ``calculate_shannon_entropy`` in this notebook, so both functions
    return equal values for equal input. Gibbs entropy is conventionally
    written with the natural logarithm - confirm whether a distinct measure
    was intended here.

    Args:
        sequence: Sequence to analyse; the notebook passes DNA strings.
        k: Window (k-mer) length.

    Returns:
        The entropy as a float. The result is ``0.0`` when ``sequence`` is
        shorter than ``k`` (no window fits) and when only one distinct
        k-mer occurs.
    """
    kmer_counts = Counter(sequence[i:i + k] for i in range(len(sequence) - k + 1))
    total_kmers = sum(kmer_counts.values())

    # Nothing to count: return a float 0.0 rather than the integer 0 that
    # negating an empty sum would produce.
    if not total_kmers:
        return 0.0

    probabilities = [count / total_kmers for count in kmer_counts.values()]
    gibbs_entropy = -sum(prob * math.log2(prob) for prob in probabilities)

    # A lone k-mer gives a sum of 0.0 whose negation is -0.0; adding 0.0
    # normalizes the sign without changing any other result.
    return gibbs_entropy + 0.0

In [18]:
# Raw k-mer entropies of every DNA sequence for k = 2 and k = 3. The window
# length is forwarded to each helper through Series.apply's keyword arguments.
df['nSE2'] = df['DNA'].apply(calculate_shannon_entropy, k=2)
df['nSE3'] = df['DNA'].apply(calculate_shannon_entropy, k=3)
df['nGE2'] = df['DNA'].apply(calculate_gibbs_entropy, k=2)
df['nGE3'] = df['DNA'].apply(calculate_gibbs_entropy, k=3)

In [16]:
def min_max_normalize(column):
    """Rescale ``column`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        column: Object providing ``min()``, ``max()`` and element-wise
            arithmetic; the notebook passes pandas Series.

    Returns:
        ``(column - min) / (max - min)``. When all values are equal the
        range is zero; it is then treated as 1, so the result is all zeros
        instead of the NaN values a 0/0 division would produce.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val

    # Constant column: avoid 0/0 by dividing by 1. Only a scalar, non-missing
    # range is checked, so inputs whose min()/max() are not scalars keep the
    # original behaviour.
    if (
        pd.api.types.is_scalar(value_range)
        and pd.notna(value_range)
        and value_range == 0
    ):
        value_range = 1

    return (column - min_val) / value_range

In [19]:
# Replace each raw entropy column with its min-max scaled version.
# DataFrame.apply hands the columns to min_max_normalize one at a time.
df[['nSE2', 'nSE3', 'nGE2', 'nGE3']] = (
    df[['nSE2', 'nSE3', 'nGE2', 'nGE3']].apply(min_max_normalize)
)

In [20]:
# Preview the first rows to inspect the four entropy columns added above.
df.head()

Unnamed: 0,locus tag,essential,DNA,protein sequence,GC_Content,CAI,A,R,N,D,...,P,S,T,W,Y,V,nSE2,nSE3,nGE2,nGE3
0,190..255\t0,0,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,MKRISTTITTTITITTGNGAG*,0.515152,0.693366,0.045455,0.045455,0.045455,0.0,...,0.0,0.045455,0.363636,0.0,0.0,0.0,0.708253,0.271873,0.708253,0.271873
1,337..2799\t0,0,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,0.530654,0.572235,0.11084,0.056029,0.046285,0.053593,...,0.035323,0.062119,0.041413,0.004872,0.024361,0.084044,0.970596,0.969793,0.970596,0.969793
2,2801..3733\t0,0,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,0.562701,0.583953,0.118971,0.054662,0.038585,0.041801,...,0.051447,0.054662,0.028939,0.012862,0.025723,0.073955,0.956975,0.954594,0.956975,0.954594
3,3734..5020\t0,0,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,0.528361,0.606598,0.109557,0.039627,0.044289,0.055944,...,0.053613,0.044289,0.053613,0.006993,0.020979,0.062937,0.960312,0.95549,0.960312,0.95549
4,5234..5530\t0,0,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,0.538721,0.524738,0.070707,0.070707,0.020202,0.050505,...,0.090909,0.030303,0.010101,0.060606,0.030303,0.060606,0.959403,0.901987,0.959403,0.901987


In [21]:
# Write the frame, with the entropy columns, to output.csv in the current
# working directory; index=False leaves the row index out of the file.
df.to_csv("output.csv",index = False)