## Cider analysis

## Get all of this data:

```
N          :	49   
f-         :	0.20408  
f+         :	0.16327
FCR        :	0.36735
NCPR       :	-0.04082
Kappa      :	0.15558
Omega      :	0.12680
Sigma      :	0.00454
Delta      :	0.04996
Max Delta  :	0.32115
Hydropathy :	3.47755

Phase Plot Region: 3
Phase Plot Annotation: Coils,Hairpins and Chimeras
```



In [67]:
import pandas as pd
import numpy as np

In [69]:
def get_sigma(f_plus, f_minus):
    '''gets sigma
    Sigma: ⟨σ⟩, or the average sigma value for the entire sequence, where sigma quantifies the charge asymmetry. 
Specifically, σ = (f+-f-)2/(f++f-) where f- and f+ refer to the fraction of negative and positive residues across the entire sequence. See Das & Pappu[2] for more details
    
    '''
    if f_plus == 0 and f_minus==0:
        sigma = np.nan
    else:
        sigma = (f_plus-f_minus)**2/(f_plus+f_minus)
    return sigma

In [4]:
def get_num_hydrophobic(seq):
    ''' counts the number of hydrophobic amino acids (L,V,I,M) in a seq
    '''
    
    I_count = seq.count('I')
    M_count = seq.count('M')
    L_count = seq.count('L')
    V_count = seq.count('V')
    count = I_count+M_count+L_count+V_count
    return count
    

In [40]:
def get_all_data_from_seq(seq):
    ''' This extracts all data needed from cider for a given sequence and returns it as a dictionary
    '''
    SeqOb = SequenceParameters(seq)
    data = {}
    data['N'] = SeqOb.get_length()
    data['seq'] = seq
    data['f-'] = SeqOb.get_fraction_negative()
    data['f+'] = SeqOb.get_fraction_positive()
    data['FCR']= SeqOb.get_FCR()
    data['NCPR'] = SeqOb.get_NCPR()
    data['kappa'] = SeqOb.get_kappa()
    data['Omega'] = SeqOb.get_Omega()
    data['sigma'] = get_sigma(data['f+'], data['f-'])
    data['delta'] = SeqOb.get_delta()
    data['delta max'] = SeqOb.get_deltaMax()
    data['hydropathy'] = SeqOb.get_mean_hydropathy()
    data['Phase plot region'] = int(SeqOb.get_phasePlotRegion())
    data['Hydrophobic count'] = int(get_num_hydrophobic(seq))
    data['Positive count'] = int(SeqOb.get_countPos())
    data['Negative count'] = int(SeqOb.get_countNeg())
    return data

In [17]:
# import the relevant code
from localcider.sequenceParameters import SequenceParameters

data = get_all_data_from_seq("DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVV")
print(data)
df = pd.Series(data).to_frame().T
print(df)

{'f-': 0.15, 'f+': 0.075, 'FCR': 0.225, 'NCPR': -0.075, 'kappa': 0.2112978461212795, 'Omega': 0.3107604958163276, 'sigma': 0.025, 'delta': 0.03080798059964728, 'delta max': 0.14580357142857148, 'hydropathy': 4.5575, 'Phase plot region': 1, 'Hydrophobic count': 11, 'Positive count': 3, 'Negative count': 6}
     f-     f+    FCR   NCPR     kappa    Omega  sigma     delta  delta max  \
0  0.15  0.075  0.225 -0.075  0.211298  0.31076  0.025  0.030808   0.145804   

   hydropathy  Phase plot region  Hydrophobic count  Positive count  \
0      4.5575                1.0               11.0             3.0   

   Negative count  
0             6.0  


In [18]:
df

Unnamed: 0,f-,f+,FCR,NCPR,kappa,Omega,sigma,delta,delta max,hydropathy,Phase plot region,Hydrophobic count,Positive count,Negative count
0,0.15,0.075,0.225,-0.075,0.211298,0.31076,0.025,0.030808,0.145804,4.5575,1.0,11.0,3.0,6.0


In [55]:
all_data = pd.read_csv("../filtered_table_final_100223.csv")

In [56]:
all_data.fillna("NA", inplace=True)

In [57]:
all_data.columns

Index(['uniprot_id', 'gene_name', 'refseq_id', 'firstAA_position_in_HELIDR',
       'lastAA_position_in_HELIDR', 'HELIDR_upstream_seq', 'HELIDR_seq',
       'HELIDR_downstream_seq', '2S5P_up', '2S5P_down', '2S5P1_up',
       '2S5P1_down', '2S5P1_helix', 'HEK293T_expressed',
       'NonTMD[3]_TMD[2]_SEC[1]', 'Non_TMD_classification', '4 compartments',
       'TG_CY', 'TG_SR_nonS'],
      dtype='object')

In [58]:
whole_seq = all_data['HELIDR_upstream_seq']+all_data['HELIDR_seq']+all_data['HELIDR_downstream_seq']

In [72]:
# %%time
new_data = []
for i in range(len(whole_seq)):
    if i%1000==0:
        print(f'At entry {i}/{len(whole_seq)}')
    new_data.append(get_all_data_from_seq(whole_seq[i]))

At entry 0/123587
At entry 1000/123587
At entry 2000/123587
At entry 3000/123587
At entry 4000/123587
At entry 5000/123587
At entry 6000/123587
At entry 7000/123587
At entry 8000/123587
At entry 9000/123587
At entry 10000/123587
At entry 11000/123587
At entry 12000/123587
At entry 13000/123587
At entry 14000/123587
At entry 15000/123587
At entry 16000/123587
At entry 17000/123587
At entry 18000/123587
At entry 19000/123587
At entry 20000/123587
At entry 21000/123587
At entry 22000/123587
At entry 23000/123587
At entry 24000/123587
At entry 25000/123587
At entry 26000/123587
At entry 27000/123587
At entry 28000/123587
At entry 29000/123587
At entry 30000/123587
At entry 31000/123587
At entry 32000/123587
At entry 33000/123587
At entry 34000/123587
At entry 35000/123587
At entry 36000/123587
At entry 37000/123587
At entry 38000/123587
At entry 39000/123587
At entry 40000/123587
At entry 41000/123587
At entry 42000/123587
At entry 43000/123587
At entry 44000/123587
At entry 45000/123587
A

In [73]:
new_columns = pd.DataFrame(new_data)  

In [74]:
result = pd.concat([all_data, new_columns], axis=1)

In [75]:
result.to_csv('filtered_table_final_with_cider.csv')

In [76]:
result

Unnamed: 0,uniprot_id,gene_name,refseq_id,firstAA_position_in_HELIDR,lastAA_position_in_HELIDR,HELIDR_upstream_seq,HELIDR_seq,HELIDR_downstream_seq,2S5P_up,2S5P_down,...,kappa,Omega,sigma,delta,delta max,hydropathy,Phase plot region,Hydrophobic count,Positive count,Negative count
0,O00305,CACNB4,NM_001320722,61,89,SADSYTSRPS,DSDVSLEEDREAIRQEREQQAAIQLERAK,SKPVAFAVKT,1,0,...,0.155582,0.126797,0.004535,0.049965,0.321149,3.477551,3,7,8,10
1,O00305,CACNB4,NM_001320722,233,249,VLVGPSLKGY,EVTDMMQKALFDFLKHR,FDGRISITRV,0,0,...,0.151172,0.042937,0.010811,0.029153,0.192847,4.554054,2,12,6,4
2,O00305,CACNB4,NM_001320722,274,280,SLAKRSVLNN,PSKRAII,ERSNTRSSLA,0,0,...,0.211645,0.072938,0.132275,0.028457,0.134458,3.874074,2,6,6,1
3,O00305,CACNB4,NM_001320722,289,302,IIERSNTRSS,LAEVQSEIERIFEL,ARSLQLVVLD,1,0,...,0.129670,0.147679,0.011765,0.027466,0.211815,4.552941,2,12,4,6
4,O00305,CACNB4,NM_001320722,339,348,PIIVHVKVSS,PKVLQRLIKS,RGKSQSKHLN,1,0,...,0.131060,0.071564,0.233333,0.017607,0.134342,4.123333,1,10,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123582,Q9H841,NIPAL2,NM_024759,275,297,FLNQATKLYN,TTTVVPVNHIFFTISAIIAGIIF,YQEFLGAPFL,0,0,...,1.000000,0.643995,0.000000,0.009514,0.009514,5.425581,1,13,1,1
123583,Q9H841,NIPAL2,NM_024759,306,328,IFYQEFLGAP,FLTVFIYLFGCFLSFLGVFLVTR,NREKEHLQQS,0,0,...,0.162459,0.602214,0.000000,0.011628,0.071574,5.102326,1,12,3,3
123584,P0DI81,TRAPPC2,XM_011545566,32,44,FLPAGKAESK,DDHRHLNQFIAHA,ALDLVDENMW,0,0,...,0.197775,0.299646,0.030303,0.035281,0.178391,3.993939,2,7,3,6
123585,P0DI81,TRAPPC2,XM_011545566,46,55,HLNQFIAHAA,LDLVDENMWL,SNNMYLKTVD,0,0,...,0.453409,0.683681,0.060000,0.034285,0.075617,4.446667,1,10,1,4


In [70]:
test = get_all_data_from_seq(whole_seq[1038])