## Preparation

In [1]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
#from lightgbm.sklearn import LGBMClassifier
import xgboost as xgb
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,roc_auc_score,auc,precision_recall_curve,average_precision_score,accuracy_score,f1_score
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings("ignore")

def clf_select(name, pds=5):
    """Build an unfitted binary classifier identified by a short name.

    Parameters
    ----------
    name : str
        One of 'DT', 'SVM', 'RF', 'LR', 'KNN', 'NN', 'LGBoost', 'XGBoost'
        (fixed hyper-parameters), 'DT_cv', 'SVM_cv', 'RF_cv', 'LR_cv',
        'KNN_cv', 'LGBoost_cv', 'XGBoost_cv' (wrapped in a ``GridSearchCV``
        scored with ``f1_macro``), or 'ENSEMBLE_hard' / 'ENSEMBLE_soft'
        (``VotingClassifier``).
    pds : default 5
        Passed unchanged as the ``cv`` argument of every ``GridSearchCV``.

    Returns
    -------
    The freshly constructed (unfitted) estimator.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported names.
    """

    def grid_search(estimator, param_grid):
        # All tuned variants share the same search settings.
        return GridSearchCV(estimator, param_grid, cv=pds, n_jobs=5, scoring='f1_macro')

    def random_forest():
        return RandomForestClassifier(criterion='gini', max_depth=None,
                                      min_samples_split=2, n_estimators=1000)

    def decision_tree():
        return DecisionTreeClassifier(max_depth=100, min_samples_leaf=5, criterion='gini')

    def xgboost_fixed():
        return xgb.XGBClassifier(
            learning_rate=0.5, n_estimators=500, max_depth=50, min_child_weight=3,
            gamma=1,  # coefficient on the number of leaf nodes in the penalty term
            subsample=0.7,  # each tree is built on a random 70% of the samples
            objective='binary:logistic',  # loss function
            nthread=5,
        )

    def xgboost_tuned():
        return grid_search(
            xgb.XGBClassifier(learning_rate=0.5, min_child_weight=3, gamma=3, subsample=0.7,
                              objective='binary:logistic',
                              scale_pos_weight=1, nthread=5),
            {'max_depth': [10, 50, 100, 200, 500], 'n_estimators': [50, 100, 200, 500]},
        )

    def ensemble_hard():
        return VotingClassifier(
            estimators=[('RF', random_forest()),
                        ('SVM', SVC(kernel='linear', probability=True, C=0.8)),
                        ('LR', LogisticRegression(penalty='l2', solver='liblinear')),
                        ('DT', decision_tree())],
            voting='hard',
        )

    def ensemble_soft():
        # Only RF and LR take part in the soft vote; the SVM and DT members
        # were disabled (commented out) in the original code.
        return VotingClassifier(
            estimators=[('RF', random_forest()),
                        ('LR', LogisticRegression(penalty='l2', solver='liblinear'))],
            voting='soft',
        )

    # Zero-argument factories: only the requested estimator is constructed.
    # NOTE: the 'LGBoost*' entries need ``LGBMClassifier`` in scope; its import
    # is commented out at the top of this file, so they raise NameError until
    # it is enabled.
    builders = {
        'DT': decision_tree,
        'DT_cv': lambda: grid_search(DecisionTreeClassifier(),
                                     {'max_depth': [50, 100, 200, 500, 1000]}),
        'SVM': lambda: SVC(kernel='linear', probability=True, C=1),
        'SVM_cv': lambda: grid_search(SVC(kernel='rbf', probability=True),
                                      {'C': [0.01, 0.1, 1, 10, 100]}),
        'RF': random_forest,
        'RF_cv': lambda: grid_search(RandomForestClassifier(),
                                     {'n_estimators': [10, 50, 100, 200, 500],
                                      'max_depth': [10, 50, 100, 200, 500]}),
        'LR': lambda: LogisticRegression(penalty='l2', solver='liblinear', C=1),
        'LR_cv': lambda: grid_search(LogisticRegression(penalty='l2', solver='liblinear'),
                                     {'C': [0.001, 0.1, 1, 10, 100]}),
        'KNN': lambda: KNeighborsClassifier(n_neighbors=10, weights='distance', leaf_size=10),
        'KNN_cv': lambda: grid_search(KNeighborsClassifier(weights='distance'),
                                      {'n_neighbors': [5, 10, 20, 50]}),
        # (100,) is a one-element tuple: a single hidden layer of 100 units.
        'NN': lambda: MLPClassifier(hidden_layer_sizes=(100,), max_iter=200),
        'LGBoost': lambda: LGBMClassifier(num_leaves=5, n_estimators=100),
        'LGBoost_cv': lambda: grid_search(LGBMClassifier(learning_rate=0.1),
                                          {'max_depth': [5, 10, 50, 100, 500, 1000],
                                           'n_estimators': [100, 500, 1000],
                                           'num_leaves': [20, 30, 50, 100]}),
        'XGBoost': xgboost_fixed,
        'XGBoost_cv': xgboost_tuned,
        'ENSEMBLE_hard': ensemble_hard,
        'ENSEMBLE_soft': ensemble_soft,
    }

    if name not in builders:
        # Previously an unknown name fell through to ``return clf`` and
        # surfaced as an opaque UnboundLocalError.
        raise ValueError('Unknown classifier name {!r}; expected one of {}'.format(
            name, sorted(builders)))
    return builders[name]()

def clf_select_multi(name, pds=5):
    """Build an unfitted multi-class classifier identified by a short name.

    Parameters
    ----------
    name : str
        One of 'DT', 'SVM', 'RF', 'LR', 'KNN', 'NN', 'LGBoost', 'XGBoost'
        (fixed hyper-parameters) or 'DT_cv', 'SVM_cv', 'RF_cv', 'LR_cv',
        'KNN_cv', 'LGBoost_cv', 'XGBoost_cv' (wrapped in a ``GridSearchCV``
        scored with ``f1_macro``).
    pds : default 5
        Passed unchanged as the ``cv`` argument of every ``GridSearchCV``.

    Returns
    -------
    The freshly constructed (unfitted) estimator.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported names.
    """

    def grid_search(estimator, param_grid):
        # All tuned variants share the same search settings.
        return GridSearchCV(estimator, param_grid, cv=pds, n_jobs=5, scoring='f1_macro')

    def xgboost_fixed():
        return xgb.XGBClassifier(
            learning_rate=0.5, n_estimators=500, max_depth=50, min_child_weight=3,
            gamma=1,  # coefficient on the number of leaf nodes in the penalty term
            subsample=0.7,  # each tree is built on a random 70% of the samples
            objective='multi:softprob',  # loss function
            nthread=5,
        )

    def xgboost_tuned():
        # The objective is 'multi:softprob' to agree with the fixed 'XGBoost'
        # variant; this branch used to declare 'binary:logistic' even though
        # the factory is for multi-class problems.
        return grid_search(
            xgb.XGBClassifier(learning_rate=0.5, min_child_weight=3, gamma=3, subsample=0.7,
                              objective='multi:softprob',
                              scale_pos_weight=1, nthread=5),
            {'max_depth': [10, 50, 100, 200, 500], 'n_estimators': [50, 100, 200, 500]},
        )

    # Zero-argument factories: only the requested estimator is constructed.
    # NOTE: the 'LGBoost*' entries need ``LGBMClassifier`` in scope; its import
    # is commented out at the top of this file, so they raise NameError until
    # it is enabled.
    builders = {
        'DT': lambda: DecisionTreeClassifier(max_depth=100, min_samples_leaf=5,
                                             criterion='gini'),
        'DT_cv': lambda: grid_search(DecisionTreeClassifier(),
                                     {'max_depth': [50, 100, 200, 500, 1000]}),
        'SVM': lambda: SVC(kernel='rbf', probability=True, C=1),
        'SVM_cv': lambda: grid_search(SVC(kernel='rbf', probability=True),
                                      {'C': [0.01, 0.1, 1, 10, 100]}),
        'RF': lambda: RandomForestClassifier(n_estimators=100, max_depth=100),
        'RF_cv': lambda: grid_search(RandomForestClassifier(),
                                     {'n_estimators': [10, 50, 100, 200, 500],
                                      'max_depth': [10, 50, 100, 200, 500]}),
        'LR': lambda: LogisticRegression(penalty='l2', solver='liblinear', C=1),
        'LR_cv': lambda: grid_search(LogisticRegression(penalty='l2', solver='liblinear'),
                                     {'C': [0.001, 0.1, 1, 10, 100]}),
        'KNN': lambda: KNeighborsClassifier(n_neighbors=10),
        'KNN_cv': lambda: grid_search(KNeighborsClassifier(weights='distance'),
                                      {'n_neighbors': [5, 10, 20, 50]}),
        # (100,) is a one-element tuple: a single hidden layer of 100 units.
        'NN': lambda: MLPClassifier(hidden_layer_sizes=(100,), max_iter=200),
        'LGBoost': lambda: LGBMClassifier(num_leaves=5, n_estimators=100),
        'LGBoost_cv': lambda: grid_search(LGBMClassifier(learning_rate=0.1),
                                          {'max_depth': [5, 10, 50, 100, 500, 1000],
                                           'n_estimators': [100, 500, 1000],
                                           'num_leaves': [20, 30, 50, 100]}),
        'XGBoost': xgboost_fixed,
        'XGBoost_cv': xgboost_tuned,
    }

    if name not in builders:
        # Previously an unknown name fell through to ``return clf`` and
        # surfaced as an opaque UnboundLocalError.
        raise ValueError('Unknown classifier name {!r}; expected one of {}'.format(
            name, sorted(builders)))
    return builders[name]()

def plot_AUROC(Y_test, Y_prob, F, model_name='RandomForest', out_dir='./figure/cellline/'):
    """Plot, save and show the ROC curve of a binary classifier.

    Prints the area under the ROC curve, draws the curve together with the
    diagonal chance line, saves the figure as ``<out_dir>/<F>_AUROC.png``
    (600 dpi) and then shows and closes it.

    Parameters
    ----------
    Y_test : array-like
        True binary labels, forwarded to ``roc_curve``.
    Y_prob : array-like
        Scores / probabilities of the positive class, forwarded to ``roc_curve``.
    F : str
        Tag used in the plot title and in the output file name.
    model_name : str, default 'RandomForest'
        Name shown in the legend entry of the curve.
    out_dir : str, default './figure/cellline/'
        Directory the figure is written to; created if it does not exist.
    """
    fpr, tpr, _ = roc_curve(Y_test, Y_prob)
    roc_auc = auc(fpr, tpr)
    print('AUROC:', roc_auc)

    # ROC curve plus the diagonal expected from random guessing.
    plt.figure(figsize=(4, 4))
    plt.plot(fpr, tpr, '-', color='blue',
             label='{} AUC = {:.4f}'.format(model_name, roc_auc), lw=2)
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Chance')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title('ROC curve of ' + F)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc='best', fontsize='small')
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists.
    out_path = os.path.join(out_dir, F + '_AUROC.png')
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    plt.savefig(out_path, dpi=600)
    plt.show()
    plt.close()

def plot_AUPRC(Y_test, Y_prob, F, model_name='RandomForest', out_dir='./figure/cellline/'):
    """Plot, save and show the precision-recall curve of a binary classifier.

    Prints the trapezoidal area under the precision-recall curve, draws the
    curve together with a chance-level line, saves the figure as
    ``<out_dir>/<F>_AUPRC.png`` (600 dpi) and then shows and closes it.

    Parameters
    ----------
    Y_test : array-like
        True binary labels, forwarded to ``precision_recall_curve``; the
        positive class is assumed to be labelled 1.
    Y_prob : array-like
        Scores / probabilities of the positive class.
    F : str
        Tag used in the plot title and in the output file name.
    model_name : str, default 'RandomForest'
        Name shown in the legend entry of the curve.
    out_dir : str, default './figure/cellline/'
        Directory the figure is written to; created if it does not exist.
    """
    precision, recall, _ = precision_recall_curve(Y_test, Y_prob)
    aupr = auc(recall, precision)
    print('AUPRC:', aupr)

    # A classifier guessing at random has precision equal to the fraction of
    # positive samples at every recall, so the chance level is a horizontal
    # line at the prevalence (not the (0,1)-(1,0) diagonal drawn previously).
    prevalence = float(np.mean(np.asarray(Y_test) == 1))

    # Precision-recall curve plus the chance level.
    plt.figure(figsize=(4, 4))
    plt.plot(recall, precision, '-', color='blue',
             label='{} AUPRC = {:.4f}'.format(model_name, aupr), lw=2)
    plt.plot([0, 1], [prevalence, prevalence], '--', color=(0.6, 0.6, 0.6),
             label='Random Chance')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title('AUPRC curve of ' + F)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='best', fontsize='small')
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists.
    out_path = os.path.join(out_dir, F + '_AUPRC.png')
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    plt.savefig(out_path, dpi=600)
    plt.show()
    plt.close()
    
def find_metrics_best_for_shuffle(label, prob, cut_spe=0.95):
    """Return the best sensitivity reachable at a minimum specificity.

    The ROC curve of ``prob`` against ``label`` is computed and the largest
    sensitivity among the points with specificity >= ``cut_spe`` is returned.
    If the only such point has zero sensitivity, the value is instead read at
    ``cut_spe`` from the straight line through the first two ROC points.

    Returns
    -------
    tuple
        ``(Sensitivity_best, Sensitivity, Specificity)`` where the last two
        are the full per-threshold arrays (TPR and 1 - FPR).
    """
    false_positive_rate, Sensitivity, _ = roc_curve(label, prob)
    Specificity = 1 - false_positive_rate
    eligible = Sensitivity[Specificity >= cut_spe]

    # Both conditions are evaluated eagerly, as in an element-wise `&`.
    single_point = len(eligible) == 1
    zero_sensitivity = eligible[0] == 0
    if not (single_point and zero_sensitivity):
        return np.max(eligible), Sensitivity, Specificity

    # Line through ROC points 0 and 1, evaluated at specificity == cut_spe.
    slope = (Sensitivity[1] - Sensitivity[0]) / (Specificity[1] - Specificity[0])
    Sensitivity_best = slope * cut_spe + Sensitivity[1] - slope * Specificity[1]
    return Sensitivity_best, Sensitivity, Specificity


def plot_roc_multi(prob, label):
    """Compute multi-class evaluation metrics (despite the name, nothing is plotted).

    Parameters
    ----------
    prob : array with an ``argmax(axis=1)`` method
        Per-class scores; the predicted label of a row is the index of its
        largest column (presumably labels are 0..k-1 in column order — confirm
        against the caller).
    label : array-like
        True class labels.

    Returns
    -------
    tuple
        ``(accuracy, auc_weighted_ovr, auc_weighted_ovo, auc_macro_ovr,
        auc_macro_ovo, f1_weighted, f1_macro)``.
    """
    predicted = prob.argmax(axis=1)

    # ROC-AUC for every (averaging, multi-class scheme) combination.
    auc_by_setting = {
        (averaging, scheme): roc_auc_score(label, prob, average=averaging, multi_class=scheme)
        for averaging in ('macro', 'weighted')
        for scheme in ('ovr', 'ovo')
    }
    f1_by_average = {
        averaging: f1_score(label, predicted, average=averaging)
        for averaging in ('macro', 'weighted')
    }

    return (
        accuracy_score(label, predicted),
        auc_by_setting[('weighted', 'ovr')],
        auc_by_setting[('weighted', 'ovo')],
        auc_by_setting[('macro', 'ovr')],
        auc_by_setting[('macro', 'ovo')],
        f1_by_average['weighted'],
        f1_by_average['macro'],
    )

## Calculation

### loading data

In [8]:
import random 
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import re

method_dict = {}

# Per-dinucleotide terms plus three correction terms ('init', 'endAU', 'sym')
# that Calculate_DGH sums into a free-energy total.
# NOTE(review): the values look like RNA nearest-neighbour parameters in
# kcal/mol — confirm the source and units.
DeltaG = {
    'AA': -0.93, 'UU': -0.93,
    'AU': -1.10,
    'UA': -1.33,
    'CU': -2.08, 'AG': -2.08,
    'CA': -2.11, 'UG': -2.11,
    'GU': -2.24, 'AC': -2.24,
    'GA': -2.35, 'UC': -2.35,
    'CG': -2.36,
    'GG': -3.26, 'CC': -3.26,
    'GC': -3.42,
    'init': 4.09,
    'endAU': 0.45,
    'sym': 0.43,
}

# Same keys as DeltaG; summed by Calculate_DGH into an enthalpy total.
DeltaH = {
    'AA': -6.82, 'UU': -6.82,
    'AU': -9.38,
    'UA': -7.69,
    'CU': -10.48, 'AG': -10.48,
    'CA': -10.44, 'UG': -10.44,
    'GU': -11.40, 'AC': -11.40,
    'GA': -12.44, 'UC': -12.44,
    'CG': -10.64,
    'GG': -13.39, 'CC': -13.39,
    'GC': -14.88,
    'init': 3.61,
    'endAU': 3.72,
    'sym': 0,
}

def antiRNA(RNA):
    """Return the reverse complement of ``RNA`` written with the DNA alphabet.

    A -> T, U/T -> A, C -> G and G -> C (case-insensitive input, upper-case
    output). Any other character is silently dropped.
    """
    complement = {
        'A': 'T', 'a': 'T',
        'U': 'A', 'u': 'A', 'T': 'A', 't': 'A',
        'C': 'G', 'c': 'G',
        'G': 'C', 'g': 'C',
    }
    paired = [complement[base] for base in RNA if base in complement]
    paired.reverse()
    return ''.join(paired)

def Calculate_DGH(seq):
    """Sum the DeltaG and DeltaH tables over ``seq``.

    Each total is, in this order: the table's 'init' term, the 'endAU' term
    once per A/U found at the first and last position, the 'sym' term when
    ``seq`` equals its own reverse complement, and the table entry of every
    pair of adjacent characters.

    Parameters
    ----------
    seq : str
        Non-empty sequence whose adjacent character pairs are keys of
        DeltaG / DeltaH (upper-case A/C/G/U).

    Returns
    -------
    tuple
        ``(DG_all, DH_all)``.
    """
    terminal_bases = seq[0] + seq[len(seq) - 1]
    au_end_count = terminal_bases.count('A') + terminal_bases.count('U')
    # antiRNA answers in the DNA alphabet, hence the T -> U conversion.
    is_self_complementary = antiRNA(seq).replace('T', 'U') == seq
    steps = [seq[k] + seq[k + 1] for k in range(len(seq) - 1)]

    def accumulate(table):
        total = 0
        total += table['init']
        total += au_end_count * table['endAU']
        total += table['sym'] if is_self_complementary else 0
        for step in steps:
            total += table[step]
        return total

    return accumulate(DeltaG), accumulate(DeltaH)

def Calculate_end_diff(siRNA):
    """Return the DeltaG difference between the two ends of ``siRNA``.

    The result is ``DeltaG[first two bases] - DeltaG[last two bases]`` plus
    0.45 times a counter, rounded to two decimals. For each end, the counter
    gains 1 if the dinucleotide is AC/AG/UC/UG and loses 1 if it is
    GA/GU/CA/CU.
    """
    raises_count = ('AC', 'AG', 'UC', 'UG')
    lowers_count = ('GA', 'GU', 'CA', 'CU')

    five_prime = siRNA[:2]    # 5' end
    three_prime = siRNA[-2:]  # 3' end

    # Booleans act as 0/1, so each end contributes +1, -1 or 0.
    count = sum((end in raises_count) - (end in lowers_count)
                for end in (five_prime, three_prime))

    difference = DeltaG[five_prime] - DeltaG[three_prime] + count * 0.45
    return float('{:.2f}'.format(difference))
    
# Load the three datasets; each is expected to provide a 'label' column.
Hu = pd.read_csv("./data/Hu.csv")
Taka = pd.read_csv("./data/Taka.csv")
new = pd.read_csv("./data/new.csv")

# For every dataset print "<rows at or above the cut-off> <total rows>".
# Hu uses the cut-off 0.7 / 1.341, the other two use 0.7.
for dataset, cutoff in ((Hu, 0.7 / 1.341), (Taka, 0.7), (new, 0.7)):
    print(sum(dataset['label'] >= cutoff), dataset.shape[0])


1174 2361
191 702
241 473


Calculate the 23 thermodynamic parameters (the cells below add 24 feature columns in total, including `ends`)

In [6]:
# Add the thermodynamic / sequence-composition feature columns to the Hu
# dataset and write the result to data/HuTD.csv.
# NOTE: HuTD is an alias of Hu (not a copy), so the columns appear on Hu too.
HuTD = Hu

# Feature columns in the order they are appended after 'ends'.
td_columns = ['DG_1', 'DH_1', 'U_1', 'G_1', 'DH_all', 'U_all', 'UU_1', 'G_all',
              'GG_1', 'GC_1', 'GG_all', 'DG_2', 'UA_all', 'U_2', 'C_1', 'CC_all',
              'DG_18', 'CC_1', 'GC_all', 'CG_1', 'DG_13', 'UU_all', 'A_19']

# Computed once for the whole column (it used to be recomputed on every row).
HuTD['ends'] = [Calculate_end_diff(seq) for seq in HuTD['siRNA']]

# As before, the per-row features are read from the first column of the frame
# (presumably the 19-nt siRNA sequence — confirm against the CSV layout).
td_rows = []
for i, seq in enumerate(HuTD.iloc[:, 0]):
    if i % 100 == 0:
        print(i)  # progress indicator
    steps = [seq[j] + seq[j + 1] for j in range(18)]  # the 18 adjacent pairs
    td_rows.append({
        'DG_1': DeltaG[seq[0:2]],
        'DH_1': DeltaH[seq[0:2]],
        'U_1': int(seq[0] == 'U'),
        'G_1': int(seq[0] == 'G'),
        'DH_all': Calculate_DGH(seq)[1],
        'U_all': seq.count('U') / 19,
        'UU_1': int(seq[0:2] == 'UU'),
        'G_all': seq.count('G') / 19,
        'GG_1': int(seq[0:2] == 'GG'),
        'GC_1': int(seq[0:2] == 'GC'),
        'GG_all': steps.count('GG') / 18,
        'DG_2': DeltaG[seq[1:3]],
        'UA_all': steps.count('UA') / 18,
        'U_2': int(seq[1] == 'U'),
        'C_1': int(seq[0] == 'C'),
        'CC_all': steps.count('CC') / 18,
        'DG_18': DeltaG[seq[17:19]],
        'CC_1': int(seq[0:2] == 'CC'),
        'GC_all': steps.count('GC') / 18,
        'CG_1': int(seq[0:2] == 'CG'),
        'DG_13': DeltaG[seq[12:14]],
        'UU_all': steps.count('UU') / 18,
        'A_19': int(seq[18] == 'A'),
    })

# Assign whole columns instead of chained ``df[col][i] = value`` writes, which
# raise SettingWithCopyWarning and do nothing under pandas Copy-on-Write.
# This also gives the columns numeric dtypes instead of object.
td_table = pd.DataFrame(td_rows, index=HuTD.index, columns=td_columns)
for column in td_columns:
    HuTD[column] = td_table[column]

HuTD.to_csv('data/HuTD.csv',index=False)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300


In [27]:
# Add the thermodynamic / sequence-composition feature columns to the Taka
# dataset and write the result to data/TakaTD.csv.
# NOTE: TakaTD is an alias of Taka (not a copy), so the columns appear on Taka too.
TakaTD = Taka

# Feature columns in the order they are appended after 'ends'.
td_columns = ['DG_1', 'DH_1', 'U_1', 'G_1', 'DH_all', 'U_all', 'UU_1', 'G_all',
              'GG_1', 'GC_1', 'GG_all', 'DG_2', 'UA_all', 'U_2', 'C_1', 'CC_all',
              'DG_18', 'CC_1', 'GC_all', 'CG_1', 'DG_13', 'UU_all', 'A_19']

# Computed once for the whole column (it used to be recomputed on every row).
TakaTD['ends'] = [Calculate_end_diff(seq) for seq in TakaTD['siRNA']]

# As before, the per-row features are read from the first column of the frame
# (presumably the 19-nt siRNA sequence — confirm against the CSV layout).
td_rows = []
for i, seq in enumerate(TakaTD.iloc[:, 0]):
    if i % 100 == 0:
        print(i)  # progress indicator
    steps = [seq[j] + seq[j + 1] for j in range(18)]  # the 18 adjacent pairs
    td_rows.append({
        'DG_1': DeltaG[seq[0:2]],
        'DH_1': DeltaH[seq[0:2]],
        'U_1': int(seq[0] == 'U'),
        'G_1': int(seq[0] == 'G'),
        'DH_all': Calculate_DGH(seq)[1],
        'U_all': seq.count('U') / 19,
        'UU_1': int(seq[0:2] == 'UU'),
        'G_all': seq.count('G') / 19,
        'GG_1': int(seq[0:2] == 'GG'),
        'GC_1': int(seq[0:2] == 'GC'),
        'GG_all': steps.count('GG') / 18,
        'DG_2': DeltaG[seq[1:3]],
        'UA_all': steps.count('UA') / 18,
        'U_2': int(seq[1] == 'U'),
        'C_1': int(seq[0] == 'C'),
        'CC_all': steps.count('CC') / 18,
        'DG_18': DeltaG[seq[17:19]],
        'CC_1': int(seq[0:2] == 'CC'),
        'GC_all': steps.count('GC') / 18,
        'CG_1': int(seq[0:2] == 'CG'),
        'DG_13': DeltaG[seq[12:14]],
        'UU_all': steps.count('UU') / 18,
        'A_19': int(seq[18] == 'A'),
    })

# Assign whole columns instead of chained ``df[col][i] = value`` writes, which
# raise SettingWithCopyWarning and do nothing under pandas Copy-on-Write.
# This also gives the columns numeric dtypes instead of object.
td_table = pd.DataFrame(td_rows, index=TakaTD.index, columns=td_columns)
for column in td_columns:
    TakaTD[column] = td_table[column]

TakaTD.to_csv('data/TakaTD.csv',index=False)

0
100
200
300
400
500
600
700


In [9]:

# Add the thermodynamic / sequence-composition feature columns to the `new`
# dataset and write the result to data/newTD.csv.
# NOTE: newTD is an alias of new (not a copy), so the columns appear on new too.
newTD = new

# Feature columns in the order they are appended after 'ends'.
td_columns = ['DG_1', 'DH_1', 'U_1', 'G_1', 'DH_all', 'U_all', 'UU_1', 'G_all',
              'GG_1', 'GC_1', 'GG_all', 'DG_2', 'UA_all', 'U_2', 'C_1', 'CC_all',
              'DG_18', 'CC_1', 'GC_all', 'CG_1', 'DG_13', 'UU_all', 'A_19']

# Computed once for the whole column (it used to be recomputed on every row).
newTD['ends'] = [Calculate_end_diff(seq) for seq in newTD['siRNA']]

# As before, the per-row features are read from the first column of the frame
# (presumably the 19-nt siRNA sequence — confirm against the CSV layout).
td_rows = []
for i, seq in enumerate(newTD.iloc[:, 0]):
    if i % 100 == 0:
        print(i)  # progress indicator
    steps = [seq[j] + seq[j + 1] for j in range(18)]  # the 18 adjacent pairs
    td_rows.append({
        'DG_1': DeltaG[seq[0:2]],
        'DH_1': DeltaH[seq[0:2]],
        'U_1': int(seq[0] == 'U'),
        'G_1': int(seq[0] == 'G'),
        'DH_all': Calculate_DGH(seq)[1],
        'U_all': seq.count('U') / 19,
        'UU_1': int(seq[0:2] == 'UU'),
        'G_all': seq.count('G') / 19,
        'GG_1': int(seq[0:2] == 'GG'),
        'GC_1': int(seq[0:2] == 'GC'),
        'GG_all': steps.count('GG') / 18,
        'DG_2': DeltaG[seq[1:3]],
        'UA_all': steps.count('UA') / 18,
        'U_2': int(seq[1] == 'U'),
        'C_1': int(seq[0] == 'C'),
        'CC_all': steps.count('CC') / 18,
        'DG_18': DeltaG[seq[17:19]],
        'CC_1': int(seq[0:2] == 'CC'),
        'GC_all': steps.count('GC') / 18,
        'CG_1': int(seq[0:2] == 'CG'),
        'DG_13': DeltaG[seq[12:14]],
        'UU_all': steps.count('UU') / 18,
        'A_19': int(seq[18] == 'A'),
    })

# Assign whole columns instead of chained ``df[col][i] = value`` writes, which
# raise SettingWithCopyWarning and do nothing under pandas Copy-on-Write.
# This also gives the columns numeric dtypes instead of object.
td_table = pd.DataFrame(td_rows, index=newTD.index, columns=td_columns)
for column in td_columns:
    newTD[column] = td_table[column]

newTD.to_csv('data/newTD.csv',index=False)

0
100
200
300
400
