In [3]:
import numpy as np
import pandas as pd
import math
import re
from scipy.stats import skew, kurtosis

In [2]:
# Load data_3.csv, using its 'id' column as the row index.
df = pd.read_csv('data_3.csv', index_col='id')

In [3]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0_level_0,fragment_one,start_one,end_one,resolution_one,chain_id_one,protein_id_one,fragment_two,start_two,end_two,resolution_two,chain_id_two,protein_id_two,rms,fragment_type,seq_type,tscore
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2661,METASNILE,1,3,2.8,A,104L.pdb,METASNILE,716,718,1.66,A,4Y04.pdb,2.35063,111,3,1.638366
2672,METASNILE,1,3,2.8,A,104L.pdb,METASNILE,5,7,2.71,A,4Y4M.pdb,1.54444,111,3,-1.81257
2679,METASNILE,716,718,1.66,A,4Y04.pdb,METASNILE,5,7,2.71,A,4Y4M.pdb,2.00858,111,3,0.174204
2911,ASNILEPHE,2,4,2.8,A,104L.pdb,ASNILEPHE,71,73,2.2,A,4Y3K.pdb,2.17766,111,3,63.868294
2936,ASNILEPHE,2,4,2.8,A,104L.pdb,ASNILEPHE,303,305,1.67,A,4Y4X.pdb,2.56818,111,3,81.222265


In [4]:
def calc_t_score(group, small_n=30):
    """Annotate one group of rows with ``rms`` summary statistics and a per-row score.

    Intended to be used as ``df.groupby(...).apply(calc_t_score)``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single group; must contain an ``'rms'`` column.
    small_n : int, default 30
        Groups with at most this many non-null ``rms`` values have their
        deviations divided by ``std / sqrt(n)``; larger groups are divided
        by ``std`` alone.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` (same index) with the columns ``n``, ``mean``,
        ``std``, ``tscore``, ``skew`` and ``kurtosis`` added, or overwritten
        if they already exist. The frame passed in is not modified.

    Notes
    -----
    ``std`` is the pandas sample standard deviation, so a group with a single
    row gets ``NaN`` for ``std`` and therefore for ``tscore``, ``skew`` and
    ``kurtosis``.
    """
    # Work on a copy: mutating the object handed over by groupby.apply is
    # unsupported by pandas and can leak columns into the caller's frame.
    scored = group.copy()

    rms = scored['rms']
    mean = rms.mean()
    std = rms.std()
    count = rms.count()

    scored['n'] = count
    scored['mean'] = mean
    scored['std'] = std

    # NOTE(review): small groups are scaled by the standard error while large
    # groups are scaled by the standard deviation, so scores are not on the
    # same scale across the threshold -- confirm this is intended.
    if count <= small_n:
        scale = std / math.sqrt(count)
    else:
        scale = std
    scored['tscore'] = (rms - mean) / scale

    # Shape statistics of the score distribution; fisher=True gives excess
    # kurtosis (0 for a normal distribution).
    scored['skew'] = skew(scored['tscore'])
    scored['kurtosis'] = kurtosis(scored['tscore'], fisher=True)
    return scored

# Score every 'fragment_one' group. group_keys=False keeps the original 'id'
# index on the result: from pandas 2.0 the default (group_keys=True) prepends
# 'fragment_one' as an extra index level even for like-indexed results, which
# makes a later groupby('fragment_one') ambiguous (column vs. index level).
df = df.groupby('fragment_one', group_keys=False).apply(calc_t_score)

In [5]:
# Discard the start/end/resolution/chain/protein columns of both fragments,
# plus 'fragment_two' and 'seq_type'; every other column is kept.
df_final = df.drop(
    [
        'start_one', 'end_one', 'resolution_one', 'chain_id_one', 'protein_id_one',
        'fragment_two',
        'start_two', 'end_two', 'resolution_two', 'chain_id_two', 'protein_id_two',
        'seq_type',
    ],
    axis='columns',
)

In [6]:
# Inspect the first 50 rows of the trimmed frame.
df_final.head(50)

Unnamed: 0_level_0,fragment_one,rms,fragment_type,tscore,n,mean,std,skew,kurtosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2661,METASNILE,2.35063,111,1.638366,3,1.967883,0.404633,-0.182902,-1.5
2672,METASNILE,1.54444,111,-1.81257,3,1.967883,0.404633,-0.182902,-1.5
2679,METASNILE,2.00858,111,0.174204,3,1.967883,0.404633,-0.182902,-1.5
2911,ASNILEPHE,2.17766,111,2.398624,709,0.740419,0.599194,-0.016801,-1.630972
2936,ASNILEPHE,2.56818,111,3.050366,709,0.740419,0.599194,-0.016801,-1.630972
3027,ASNILEPHE,2.56359,111,3.042706,709,0.740419,0.599194,-0.016801,-1.630972
3318,ASNILEPHE,1.28032,111,0.901046,709,0.740419,0.599194,-0.016801,-1.630972
3329,ASNILEPHE,1.27885,111,0.898593,709,0.740419,0.599194,-0.016801,-1.630972
3334,ASNILEPHE,0.061513,111,-1.133031,709,0.740419,0.599194,-0.016801,-1.630972
3343,ASNILEPHE,1.41376,111,1.123745,709,0.740419,0.599194,-0.016801,-1.630972


In [7]:
# Collapse to one row per fragment. The summary columns are selected *before*
# aggregating, so 'rms' and 'tscore' are not averaged only to be thrown away
# and a stray non-numeric column in df_final cannot break the mean().
df_final_clean = df_final.groupby(['fragment_one'])[
    ['fragment_type', 'n', 'mean', 'std', 'skew', 'kurtosis']
].mean()

In [8]:
# Move the 'fragment_one' group key from the index back to a regular column and
# replace NaN values with 0. The guard keeps the cell safe to re-run: calling
# reset_index() a second time on the already-flattened frame would insert a
# spurious 'index' column.
if 'fragment_one' not in df_final_clean.columns:
    df_final_clean = df_final_clean.reset_index()
df_final_clean = df_final_clean.fillna(0)

In [9]:
# Preview the first five rows of the per-fragment summary.
df_final_clean.head()

Unnamed: 0,fragment_one,fragment_type,n,mean,std,skew,kurtosis
0,ALAALAALA,3,325,0.864336,0.729351,0.40274,-1.599657
1,ALAALAARG,21,91,1.488131,0.497887,-0.580775,0.135429
2,ALAALAASN,21,21,0.661898,0.287421,-0.326125,-1.022283
3,ALAALAASP,21,120,1.274343,0.858643,0.112329,-1.472973
4,ALAALAGLN,21,6,0.845359,0.362645,-0.875807,-0.479346


In [10]:
# Three-letter amino-acid residue codes mapped to their one-letter symbols.
# Insertion order is significant to code that iterates over the keys, so the
# entries are listed in the same order as before.
seq = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
    'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
    'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
    'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
    'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
}

In [11]:
def find_replace_multi_ordered(string, mapping=None):
    """Replace every key of ``mapping`` found in ``string`` with its value, in one pass.

    Parameters
    ----------
    string : str
        Text to translate, e.g. concatenated three-letter residue codes
        such as ``'METASNILE'``.
    mapping : dict[str, str], optional
        Code -> replacement table. Defaults to the module-level ``seq``.

    Returns
    -------
    str
        ``string`` with each matched key replaced; text that matches no key
        is left unchanged.

    Notes
    -----
    All keys are combined into a single alternation and substituted in one
    left-to-right scan. Running one ``re.sub`` per key instead lets letters
    produced by earlier substitutions be matched again by a later key:
    ``'THRARGPRO'`` would become ``'TRP'`` and then ``'W'``.
    """
    if mapping is None:
        mapping = seq
    if not mapping:
        # An empty alternation would match the empty string everywhere.
        return string

    # Longest keys first, so a longer key wins over a shorter key that is its prefix.
    ordered_keys = sorted(mapping, key=len, reverse=True)
    pattern = '|'.join(re.escape(key) for key in ordered_keys)
    return re.sub(pattern, lambda match: mapping[match.group(0)], string)

In [12]:
# Convert each fragment from three-letter codes to its one-letter sequence.
# Run this once per freshly built frame: a one-letter result such as 'TRP'
# would itself be read as a three-letter code on a second pass.
df_final_clean['fragment_one'] = df_final_clean['fragment_one'].map(find_replace_multi_ordered)

In [13]:
# Preview the summary again, now with one-letter fragment sequences.
df_final_clean.head()

Unnamed: 0,fragment_one,fragment_type,n,mean,std,skew,kurtosis
0,AAA,3,325,0.864336,0.729351,0.40274,-1.599657
1,AAR,21,91,1.488131,0.497887,-0.580775,0.135429
2,AAN,21,21,0.661898,0.287421,-0.326125,-1.022283
3,AAD,21,120,1.274343,0.858643,0.112329,-1.472973
4,AAQ,21,6,0.845359,0.362645,-0.875807,-0.479346


In [1]:
# Load data_10-unfiltered.csv with the default integer row index.
data_10 = pd.read_csv('data_10-unfiltered.csv')

NameError: name 'pd' is not defined

In [2]:
# Load data_11-unfiltered.csv with the default integer row index.
data_11 = pd.read_csv('data_11-unfiltered.csv')

NameError: name 'pd' is not defined

In [6]:
# Preview the first five rows of data_10.
data_10.head()

Unnamed: 0,fragment_one,rms,fragment_type,n,mean,std,tscore,skew,kurtosis
0,SERTHRGLYSERALATHRTHRTHRPROILE,0.71753,241111,297,0.32857,0.215249,1.807026,0.727684,-1.014781
1,SERTHRGLYSERALATHRTHRTHRPROILE,0.094162,241111,297,0.32857,0.215249,-1.089011,0.727684,-1.014781
2,SERTHRGLYSERALATHRTHRTHRPROILE,0.189451,241111,297,0.32857,0.215249,-0.646318,0.727684,-1.014781
3,SERTHRGLYSERALATHRTHRTHRPROILE,0.212998,241111,297,0.32857,0.215249,-0.536923,0.727684,-1.014781
4,SERTHRGLYSERALATHRTHRTHRPROILE,0.250286,241111,297,0.32857,0.215249,-0.363691,0.727684,-1.014781


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_11.
data_11.describe()

Unnamed: 0,rms,fragment_type,n,mean,std,tscore,skew,kurtosis
count,193144.0,193144.0,193144.0,193144.0,188891.0,188891.0,188891.0,188891.0
mean,0.450817,114892800.0,620.590099,0.450817,0.283743,2.125335e-18,0.215062,-0.943549
std,0.336175,655404200.0,177.937137,0.163874,0.090526,1.009592,0.595342,1.149799
min,0.020548,2414.0,1.0,0.04392,0.003917,-4.657178,-1.316703,-1.792855
25%,0.138569,1231112.0,667.0,0.339147,0.21954,-0.882833,-0.267534,-1.573691
50%,0.418546,4111211.0,704.0,0.469782,0.289282,-0.09597898,-0.058532,-1.318551
75%,0.734106,21212110.0,704.0,0.552048,0.335261,0.835939,0.611706,-1.004022
max,6.82309,11111110000.0,704.0,6.82309,1.481713,3.916574,2.233849,5.89968


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_10.
data_10.describe()

Unnamed: 0,rms,fragment_type,n,mean,std,tscore,skew,kurtosis
count,197135.0,197135.0,197135.0,197135.0,192819.0,192819.0,192819.0,192819.0
mean,0.444251,30639800.0,625.25828,0.444251,0.286172,-1.1755230000000001e-17,0.256079,-0.918965
std,0.339683,140261200.0,174.977487,0.165949,0.0923,1.009695,0.602554,1.212494
min,0.020492,1324.0,1.0,0.040828,0.00279,-4.839693,-1.550729,-1.807239
25%,0.133148,312211.0,667.0,0.331679,0.220339,-0.8730092,-0.261627,-1.573106
50%,0.395707,2112211.0,704.0,0.423701,0.28009,-0.1376818,0.131496,-1.280752
75%,0.747821,11212110.0,704.0,0.543149,0.347688,0.8470857,0.658456,-0.989127
max,6.75517,1111111000.0,704.0,6.75517,1.558827,3.947294,2.322628,6.287606


In [13]:
# Rows whose 'skew' lies in [-0.5, 0.5] (both ends inclusive) and whose 'kurtosis' is at most -1.
data_10[data_10['skew'].between(-0.5, 0.5) & data_10['kurtosis'].le(-1)]

Unnamed: 0,fragment_one,rms,fragment_type,n,mean,std,tscore,skew,kurtosis
995,GLYSERALATHRTHRTHRPROILEASPSER,0.318243,1213111,704,0.355161,0.192081,-0.192200,0.429012,-1.048342
996,GLYSERALATHRTHRTHRPROILEASPSER,0.335745,1213111,704,0.355161,0.192081,-0.101083,0.429012,-1.048342
997,GLYSERALATHRTHRTHRPROILEASPSER,0.162411,1213111,704,0.355161,0.192081,-1.003481,0.429012,-1.048342
998,GLYSERALATHRTHRTHRPROILEASPSER,0.660220,1213111,704,0.355161,0.192081,1.588174,0.429012,-1.048342
999,GLYSERALATHRTHRTHRPROILEASPSER,0.340842,1213111,704,0.355161,0.192081,-0.074547,0.429012,-1.048342
...,...,...,...,...,...,...,...,...,...
194231,VALARGARGGLUHISALASERILEASPALA,0.645220,12112111,3,0.597063,0.146332,0.570012,-0.539109,-1.500000
194232,VALARGARGGLUHISALASERILEASPALA,0.432721,12112111,3,0.597063,0.146332,-1.945221,-0.539109,-1.500000
194233,ARGARGGLUHISALASERILEASPALAGLN,1.652690,21121111,3,1.252914,0.710644,0.974373,-0.706416,-1.500000
194234,ARGARGGLUHISALASERILEASPALAGLN,1.673630,21121111,3,1.252914,0.710644,1.025410,-0.706416,-1.500000
