In [1]:
import pandas as pd
from collections import Counter
import math


In [11]:
# Load the input table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../../dataset/NCTCGC+CAI+Acimo.csv")

In [4]:
def calculate_shannon_entropy(block, k):
    """Return the Shannon entropy, in bits, of the k-mer distribution of ``block``.

    Every overlapping window of length ``k`` in ``block`` is counted, the
    counts are converted to relative frequencies ``p`` and
    ``-sum(p * log2(p))`` is returned.

    Args:
        block: Sequence to analyse. It must support ``len()`` and slicing, and
            its slices must be hashable (e.g. a ``str``).
        k: Window (k-mer) length.

    Returns:
        float: The entropy in bits. ``0.0`` is returned when ``block`` yields
        no k-mer at all (it is shorter than ``k``) or only one distinct k-mer.
    """
    # Count every overlapping sub-sequence of length k without building the
    # intermediate list of all windows.
    kmer_counts = Counter(block[i:i + k] for i in range(len(block) - k + 1))
    total_kmers = sum(kmer_counts.values())

    # Relative frequency of each distinct k-mer.
    probabilities = [count / total_kmers for count in kmer_counts.values()]

    # Summing the already-negated terms from a float start value keeps the
    # result a non-negative float. Negating the finished sum instead would give
    # -0.0 for a single distinct k-mer and the int 0 when there are no k-mers.
    return sum((-prob * math.log2(prob) for prob in probabilities), 0.0)

In [5]:
def calculate_gibbs_entropy(sequence, k):
    """Return ``-sum(p * log2(p))`` over the k-mer frequencies of ``sequence``.

    NOTE(review): as written this is the same formula (base-2 logarithm, no
    scaling constant) that ``calculate_shannon_entropy`` in this notebook
    uses, so both functions give equal values for the same input. Confirm
    whether a different definition was intended for the "Gibbs" variant.

    Args:
        sequence: Sequence to analyse. It must support ``len()`` and slicing,
            and its slices must be hashable (e.g. a ``str``).
        k: Window (k-mer) length.

    Returns:
        float: The entropy in bits. ``0.0`` is returned when ``sequence`` is
        shorter than ``k`` or contains only one distinct k-mer.
    """
    kmer_counts = Counter(sequence[i:i+k] for i in range(len(sequence) - k + 1))
    total_kmers = sum(kmer_counts.values())

    # Accumulate by subtraction from a float zero. Negating a finished sum
    # would return -0.0 for a single distinct k-mer and the int 0 when the
    # sequence yields no k-mers.
    gibbs_entropy = 0.0
    for count in kmer_counts.values():
        prob = count / total_kmers
        gibbs_entropy -= prob * math.log2(prob)
    return gibbs_entropy

In [12]:
# Add k-mer entropy features (k = 2 and k = 3) computed from the DNA column.
# Series.apply forwards the extra keyword argument to the entropy function.
df['nSE2'] = df['DNA'].apply(calculate_shannon_entropy, k=2)
df['nSE3'] = df['DNA'].apply(calculate_shannon_entropy, k=3)
df['nGE2'] = df['DNA'].apply(calculate_gibbs_entropy, k=2)
df['nGE3'] = df['DNA'].apply(calculate_gibbs_entropy, k=3)

In [7]:
def min_max_normalize(column):
    """Rescale ``column`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        column: Object exposing ``min()`` / ``max()`` and element-wise
            arithmetic; the notebook passes pandas Series.

    Returns:
        ``(column - min) / (max - min)``, computed element-wise. A constant
        column (zero range) is returned as all zeros instead of all NaN.
        Missing values stay missing.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val

    # A zero range would turn every entry into 0/0 = NaN. Substitute a unit
    # range so constant data maps to 0.0.
    if getattr(value_range, "ndim", 0) == 0:
        if value_range == 0:
            value_range = 1
    else:
        # Non-scalar range (e.g. a DataFrame was passed): patch only the
        # entries whose range is zero.
        value_range = value_range.where(value_range != 0, 1)

    return (column - min_val) / value_range

In [13]:
# Rescale each entropy feature to the [0, 1] range, overwriting the raw values.
df['nSE2'] = df['nSE2'].pipe(min_max_normalize)
df['nSE3'] = df['nSE3'].pipe(min_max_normalize)
df['nGE2'] = df['nGE2'].pipe(min_max_normalize)
df['nGE3'] = df['nGE3'].pipe(min_max_normalize)

In [9]:
# Preview the first five rows, including the new entropy columns.
df.head(n=5)

Unnamed: 0,locus tag,essential,DNA,protein sequence,CAI,GC_Content,A,R,N,D,...,P,S,T,W,Y,V,nSE2,nSE3,nGE2,nGE3
0,SA0001,1,ATGTCGGAAAAAGAAATTTGGGAAAAAGTGCTTGAAATTGCTCAAG...,MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSSI...,0.601193,0.326725,0.07064,0.041943,0.057395,0.050773,...,0.04415,0.050773,0.066225,0.006623,0.037528,0.046358,0.825606,0.853573,0.825606,0.853573
1,SA0002,0,ATGATGGAATTCACTATTAAAAGAGATTATTTTATTACACAATTAA...,MMEFTIKRDYFITQLNDTLKAISPRTTLPILTGIKIDAKEHEVILT...,0.620382,0.328042,0.04244,0.03183,0.058355,0.076923,...,0.04244,0.076923,0.071618,0.002653,0.018568,0.076923,0.84699,0.866865,0.84699,0.866865
2,SA0003,0,GTGATTATTTTGGTTCAAGAAGTTGTAGTAGAAGGAGACATTAATT...,MIILVQEVVVEGDINLGQFLKTEGIIESGGQAKWFLQDVEVLINGV...,0.627939,0.353659,0.024691,0.049383,0.024691,0.061728,...,0.024691,0.024691,0.024691,0.012346,0.0,0.08642,0.834126,0.77451,0.834126,0.77451
3,SA0004,1,ATGAAGTTAAATACACTCCAATTAGAAAATTATCGTAACTATGATG...,MKLNTLQLENYRNYDEVTLKCHPDVNILIGENAQGKTNLLESIYTL...,0.562197,0.336927,0.056757,0.045946,0.064865,0.059459,...,0.021622,0.054054,0.056757,0.0,0.040541,0.043243,0.859611,0.887616,0.859611,0.887616
4,SA0005,1,ATGGTGACTGCATTGTCAGATGTAAACAACACGGATAATTATGGTG...,MVTALSDVNNTDNYGAGQIQVLEGLEAVRKRPGMYIGSTSERGLHH...,0.6234,0.369509,0.068323,0.060559,0.049689,0.062112,...,0.026398,0.046584,0.059006,0.006211,0.045031,0.071429,0.905492,0.933875,0.905492,0.933875


In [14]:
# Write the augmented table to the working directory without the index column.
df.to_csv(path_or_buf="NCTCoutput.csv", index=False)