In [1]:
import numpy as np
import pandas as pd
import math
import re
from scipy.stats import skew, kurtosis

In [2]:
# Load the fragment-pair table from data_3.csv, using its `id` column as the row index.
df = pd.read_csv('data_3.csv', index_col='id')

In [3]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0_level_0,fragment_one,start_one,end_one,resolution_one,chain_id_one,protein_id_one,fragment_two,start_two,end_two,resolution_two,chain_id_two,protein_id_two,rms,fragment_type,seq_type,tscore
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2661,METASNILE,1,3,2.8,A,104L.pdb,METASNILE,716,718,1.66,A,4Y04.pdb,2.35063,111,3,1.638366
2672,METASNILE,1,3,2.8,A,104L.pdb,METASNILE,5,7,2.71,A,4Y4M.pdb,1.54444,111,3,-1.81257
2679,METASNILE,716,718,1.66,A,4Y04.pdb,METASNILE,5,7,2.71,A,4Y4M.pdb,2.00858,111,3,0.174204
2911,ASNILEPHE,2,4,2.8,A,104L.pdb,ASNILEPHE,71,73,2.2,A,4Y3K.pdb,2.17766,111,3,63.868294
2936,ASNILEPHE,2,4,2.8,A,104L.pdb,ASNILEPHE,303,305,1.67,A,4Y4X.pdb,2.56818,111,3,81.222265


In [4]:
def calc_t_score(group, small_sample_max=30):
    """Return a copy of ``group`` annotated with statistics of its ``rms`` column.

    Intended to be used with ``DataFrame.groupby(...).apply``. The input frame
    is never modified: pandas does not support functions that mutate the
    object passed to ``apply``, so all new columns are added to a fresh frame.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain an ``rms`` column.
    small_sample_max : int, default 30
        Groups with at most this many non-null ``rms`` values have their
        deviations divided by ``std / sqrt(n)``; larger groups are divided
        by ``std`` alone.

    Returns
    -------
    pandas.DataFrame
        ``group`` plus the columns ``n`` (non-null count), ``mean``, ``std``
        (sample standard deviation), ``tscore`` (per row), and ``skew`` /
        ``kurtosis`` (Fisher definition) of the ``tscore`` values. An
        existing ``tscore`` column is overwritten in place.
    """
    rms = group['rms']
    mean = rms.mean()
    std = rms.std()
    count = rms.count()

    deviation = rms - mean
    if count <= small_sample_max:
        # NOTE(review): small groups are scaled by the standard error of the
        # mean (std / sqrt(n)) rather than by std -- confirm this is intended.
        tscore = deviation / (std / math.sqrt(count))
    else:
        tscore = deviation / std

    # Key order matches the column order produced by the step-by-step
    # assignments this replaces: n, mean, std, tscore, skew, kurtosis.
    return group.assign(**{
        'n': count,
        'mean': mean,
        'std': std,
        'tscore': tscore,
        'skew': skew(tscore),
        'kurtosis': kurtosis(tscore, fisher=True),
    })

# group_keys=False keeps the original `id` index on the result. With the
# pandas >= 2.0 default (group_keys=True) the group label would be prepended
# to the index, leaving `fragment_one` as both an index level and a column.
df = df.groupby('fragment_one', group_keys=False).apply(calc_t_score)

In [5]:
# Drop the columns listed below; all remaining columns are kept unchanged.
df_final = df.drop(
    labels=[
        'start_one',
        'end_one',
        'resolution_one',
        'chain_id_one',
        'protein_id_one',
        'fragment_two',
        'start_two',
        'end_two',
        'resolution_two',
        'chain_id_two',
        'protein_id_two',
        'seq_type',
    ],
    axis='columns',
)

In [6]:
# Inspect up to the first 50 rows of the trimmed table.
df_final.head(50)

Unnamed: 0_level_0,fragment_one,rms,fragment_type,tscore,n,mean,std,skew,kurtosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2661,METASNILE,2.35063,111,1.638366,3,1.967883,0.404633,-0.182902,-1.5
2672,METASNILE,1.54444,111,-1.81257,3,1.967883,0.404633,-0.182902,-1.5
2679,METASNILE,2.00858,111,0.174204,3,1.967883,0.404633,-0.182902,-1.5
2911,ASNILEPHE,2.17766,111,2.398624,709,0.740419,0.599194,-0.016801,-1.630972
2936,ASNILEPHE,2.56818,111,3.050366,709,0.740419,0.599194,-0.016801,-1.630972
3027,ASNILEPHE,2.56359,111,3.042706,709,0.740419,0.599194,-0.016801,-1.630972
3318,ASNILEPHE,1.28032,111,0.901046,709,0.740419,0.599194,-0.016801,-1.630972
3329,ASNILEPHE,1.27885,111,0.898593,709,0.740419,0.599194,-0.016801,-1.630972
3334,ASNILEPHE,0.061513,111,-1.133031,709,0.740419,0.599194,-0.016801,-1.630972
3343,ASNILEPHE,1.41376,111,1.123745,709,0.740419,0.599194,-0.016801,-1.630972


In [7]:
# One row per fragment. The summary columns are selected *before* averaging so
# that unrelated columns are neither aggregated needlessly nor able to raise a
# TypeError if one of them is non-numeric.
df_final_clean = (
    df_final
    .groupby(['fragment_one'])[['fragment_type', 'n', 'mean', 'std', 'skew', 'kurtosis']]
    .mean()
)

In [8]:
# Move the `fragment_one` group labels from the index back into a column and
# replace every NaN in the frame with 0.
# NOTE: this rebinds df_final_clean from itself; running the cell a second time
# would make reset_index() insert an extra `index` column.
df_final_clean = df_final_clean.reset_index().fillna(0)

In [9]:
# Preview the per-fragment summary table.
df_final_clean.head()

Unnamed: 0,fragment_one,fragment_type,n,mean,std,skew,kurtosis
0,ALAALAALA,3,325,0.864336,0.729351,0.40274,-1.599657
1,ALAALAARG,21,91,1.488131,0.497887,-0.580775,0.135429
2,ALAALAASN,21,21,0.661898,0.287421,-0.326125,-1.022283
3,ALAALAASP,21,120,1.274343,0.858643,0.112329,-1.472973
4,ALAALAGLN,21,6,0.845359,0.362645,-0.875807,-0.479346


In [10]:
# Three-letter amino-acid residue codes mapped to their one-letter codes
# (20 entries).
seq = dict(
    ALA='A', ARG='R', ASN='N', ASP='D',
    CYS='C', GLU='E', GLN='Q', GLY='G',
    HIS='H', ILE='I', LEU='L', LYS='K',
    MET='M', PHE='F', PRO='P', SER='S',
    THR='T', TRP='W', TYR='Y', VAL='V',
)

In [11]:
def find_replace_multi_ordered(string, mapping=None):
    """Replace every occurrence of a key of ``mapping`` in ``string`` by its value.

    All keys are matched in a single left-to-right pass, so text produced by
    one replacement is never re-examined. (Applying one substitution per key
    in sequence would, for example, turn 'THRARGPRO' into 'TRP' and then
    collapse that result into 'W'.)

    Parameters
    ----------
    string : str
        Text to translate.
    mapping : dict, optional
        Substring -> replacement table. Defaults to the module-level ``seq``.

    Returns
    -------
    str
        The translated text; parts that match no key are left unchanged.
    """
    if mapping is None:
        mapping = seq
    if not mapping:
        # An empty alternation would match the empty string everywhere.
        return string

    # Longest keys first, so a longer key wins over a shorter one that is its
    # prefix; keys of equal length keep the mapping's own order.
    ordered_keys = sorted(mapping, key=len, reverse=True)
    pattern = '|'.join(re.escape(key) for key in ordered_keys)
    return re.sub(pattern, lambda match: mapping[match.group(0)], string)

In [12]:
# Rewrite each fragment label through find_replace_multi_ordered (three-letter
# residue codes -> one-letter codes, per the `seq` table).
# NOTE: the column is overwritten in place, so re-running this cell feeds the
# already-converted labels through the function again (e.g. a converted label
# 'TRP' would itself be matched as a residue code).
df_final_clean['fragment_one'] = df_final_clean['fragment_one'].apply(find_replace_multi_ordered)

In [13]:
# Preview the summary table after the fragment labels have been converted.
df_final_clean.head()

Unnamed: 0,fragment_one,fragment_type,n,mean,std,skew,kurtosis
0,AAA,3,325,0.864336,0.729351,0.40274,-1.599657
1,AAR,21,91,1.488131,0.497887,-0.580775,0.135429
2,AAN,21,21,0.661898,0.287421,-0.326125,-1.022283
3,AAD,21,120,1.274343,0.858643,0.112329,-1.472973
4,AAQ,21,6,0.845359,0.362645,-0.875807,-0.479346
