In [282]:
from pymed import PubMed
import pandas as pd
import joblib as jl
import seaborn as sns
import numpy as np

pubmed = PubMed(tool="PlasmoSearch", email="pcarden@gmail.com")

In [283]:
ber = pd.read_csv('berghei.csv')
pig = pd.read_csv('piggyBac.csv')

com = pd.read_csv('GenesByTaxon_UserComments.csv')
nts = pd.read_csv('GenesByTaxon_Notes.csv')

nam = pd.read_csv('GenesByTaxon_Summary.csv').drop(columns=['source_id','Genomic Location (Gene)','Transcript Product Description'])

het = pd.read_csv('het_exp_1000.csv')
ids = pd.read_csv('PlasmoDB_IDS.csv')

pdb = pd.read_csv('pdb_ids.csv')
pdb = pdb.loc[pdb['% Identity']>95,:]

tra = pd.read_csv('toenhake_transcriptome.csv')
tra['Asexual_exp'] = tra.iloc[:,2:].sum(axis=1)

dis = pd.read_csv('d2d2_disordered_0prot_0pred.csv')
dis['Disordered_len'] = dis['End'] - dis['Start']
dis = dis.groupby(['Seqid']).sum().reset_index()

In [284]:
print('Heterologous expression:')
het['Gene ID'] = ''
for i,r in het.iterrows():
    if (ids['Previous ID(s)'].str.find(r['PlasmoDB_ID']) > 0).any():
        pids = (ids.loc[ ids['Previous ID(s)'].str.find(r['PlasmoDB_ID']) > 0,'Gene ID'].unique())
        for pid in pids:
            if r['PlasmoDB_ID'] in ids.loc[ ids['Gene ID']==pid, 'Previous ID(s)' ].values[0][14:].split(';'):
                het.iat[i,-1] = het.iat[i,-1] + pid +','
        
        if len(het.iat[i,-1].split(',')) > 2:
            print('Repeated: '+r['PlasmoDB_ID']+' - '+str(het.iat[i,-1]))
            
    else:
        print(r['PlasmoDB_ID'],r['Expression seen?'])

print('Disorder:')
dis['Gene ID'] = ''
for i,r in dis.iterrows():
    old_id = r['Seqid'][0:-2] if r['Seqid'][-2] == '.' and r['Seqid'][-8] != 'M' else r['Seqid'].replace(':mRNA','')
    if (ids['Previous ID(s)'].str.find(old_id) > 0).any():
        pids = (ids.loc[ ids['Previous ID(s)'].str.find(old_id) > 0,'Gene ID'].unique())
        for pid in pids:
            if old_id in ids.loc[ ids['Gene ID']==pid, 'Previous ID(s)' ].values[0][14:].split(';'):
                dis.iat[i,-1] = dis.iat[i,-1] + pid +','
        
        if len(dis.iat[i,-1].split(',')) > 2:
            print('Repeated: '+old_id+' - '+str(dis.iat[i,-1]))
    else:
        print(r['Seqid'])

Heterologous expression:
chr10.glm_427 True
none True
Repeated: PF10_0320 - PF3D7_1032800,PF3D7_1032900,
chr12.phat_407  True
none True
chr10.gen_380 False
none False
chr10.phat_385 False
chr10.phat_388 False
chr10.gen_223 False
Repeated: PF10_0313 - PF3D7_1031900,PF3D7_1032000,
chr10.gen_248 False
Repeated: PF10_0215 - PF3D7_1022100,PF3D7_1022200,
chr11.glm_2 False
chr11.gen_372 False
PFL1580w False
Repeated: PF10_0228 - PF3D7_1023500,PF3D7_1023600,
chr12.phat_22 False
chr12.phat_15 False
chr12.glm_450 False
PF11_0006 False
chr14.glm_572 False
Repeated: PF14_0213 - PF3D7_1422200,PF3D7_1422300,
chr14.gen_560 False
Repeated: PF14_0345 - PF3D7_1436400,PF3D7_1436500,
MAL13P1.143 False
Repeated: PFF0600w - PF3D7_0612300,PF3D7_0612400,
PF11_0004 False
Repeated: PF14_0183 - PF3D7_1418800,PF3D7_1418900,
Repeated: PFI1460c - PF3D7_0929700,PF3D7_0929800,
Disorder:
MAL13P1.65
MAL7P1.142
Repeated: MAL8P1.136 - PF3D7_0805800,PF3D7_0805900,
MAL8P1.208
MAL8P1.210
MAL8P1.310
MAL8P1.90
Repeated: MAL8P

In [285]:
print(dis.loc[dis['Gene ID'].str.find('PF3D7_1318600') > -1,:])
# dis['Gene ID'].str

         Seqid  Start    End  Disordered_len         Gene ID
1  MAL13P1.100  12977  13370             393  PF3D7_1318600,


In [286]:
com['Note'] = com['Headline']
inf = nts.append(com).reset_index()

inf['Info'] = inf['Note'].str.cat(inf['PubMed ID(s)'].astype(str), sep =": PMID ").astype(str)
inf['Info'] = inf.groupby(['Gene ID'], as_index=False)['Info'].transform(' | '.join)

ber['Class_ber'] = ber['Phenotype']
ber['RelativeGrowth_ber'] = ber['Relative growth rate']
pig['Class_pig'] = pig['Gene Identification'].replace('Mutable in CDS','Dispensable').replace('Non - Mutable in CDS','Essential')
pig['MutabilityIndexScore_pig'] = pig['MIS']

dat = nam\
    .join(ber.set_index('P. falciparum ID')['Class_ber'],on="Gene ID")\
    .join(pig.set_index('Gene_ID')['Class_pig'],on="Gene ID")\
    .join(ber.set_index('P. falciparum ID')['RelativeGrowth_ber'],on="Gene ID")\
    .join(pig.set_index('Gene_ID')['MutabilityIndexScore_pig'],on="Gene ID")\
    .join(inf.set_index('Gene ID')['Info'].drop_duplicates(),on="Gene ID")\
    .join(tra.set_index('Gene ID')['Asexual_exp'],on="Gene ID")

dat['Asexual_exp_pct'] = dat['Asexual_exp'].rank(pct=True) * 100

In [287]:
dat['Structure'] = (dat['Info'].str.find('tructure') > -1) & ((dat['Info'].str.find('rystal') > -1) | (dat['Info'].str.find('NMR') > -1))
dat['Recombinant'] = (dat['Info'].str.find('ecombinant') > -1) | (dat['Structure'] == True)

dat['References'] = ''
dat.loc[(dat['Recombinant']==True),'References'] = dat.loc[(dat['Recombinant']==True),'Info']

In [288]:
dat['Disordered_len'] = 0
dat['Disordered_len_pct'] = 0

for i,r in dat.iterrows():
    print(i)
    if (het.loc[het['Gene ID'].str.find(r['Gene ID']) > -1,'Expression seen?']).any():
        dat.at[i,'Recombinant'] = True
        dat.at[i,'References'] = r['References'] + 'PMID: 16644028 '
        
    if (r['Gene ID'] in pdb['Gene ID'].unique()):
        dat.at[i,'Recombinant'] = True
        dat.at[i,'Structure'] = True
        dat.at[i,'References'] = r['References'] + 'PDB: ' + ','.join(pdb.loc[pdb['Gene ID']==r['Gene ID'],'pdb_id'])
        
    if len(dis.loc[dis['Gene ID'].str.find(r['Gene ID']) > -1,'Disordered_len'].index) > 0:
        if len(dis.loc[dis['Gene ID'].str.find(r['Gene ID']) > -1,'Disordered_len'].index) > 1:
            print(dis.loc[dis['Gene ID'].str.find(r['Gene ID']) > -1,:])
            
        dat.at[i,'Disordered_len'] = dis.loc[dis['Gene ID'].str.find(r['Gene ID']) > -1,'Disordered_len'].mean()
        
    dat['Disordered_len_pct'] = dat['Disordered_len'].rank(pct=True) * 100

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
53
54
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
           Seqid  Start   End  Disordered_len         Gene ID
3048  PFA0345w.1   3399  3963             564  PF3D7_0107000,
3049  PFA0345w.2   2581  3093             512  PF3D7_0107000,
71
72
73
74
75
76
77
78
79
80
81
82
83
83
84
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
           Seqid  Start    End  Disordered_len                       Gene ID
3076  PFA0485w.1  18263  18846             583  PF3D7_0109850,PF3D7_0109950,
3077  PFA0485w.2  19225  19796             571  PF3D7_0109850,PF3D7_0109950,
100
           Seqid  Start    End  Disordered_len                       Gene ID
3076  PFA0485w.1  18263  18846             583  PF3D7_0109850,PF3D7_0109950,
3077  PFA0485w.2  19225  19796             571  PF3D7_0109850,PF3D7_0109950,
101
102
103
104
105
106
107
108
109
110
111
11

501
502
503
504
505
           Seqid  Start   End  Disordered_len         Gene ID
3456  PFC0441c.1   2569  2671             102  PF3D7_0310600,
3457  PFC0441c.2   2556  2662             106  PF3D7_0310600,
505
           Seqid  Start   End  Disordered_len         Gene ID
3456  PFC0441c.1   2569  2671             102  PF3D7_0310600,
3457  PFC0441c.2   2556  2662             106  PF3D7_0310600,
506
           Seqid  Start   End  Disordered_len         Gene ID
3456  PFC0441c.1   2569  2671             102  PF3D7_0310600,
3457  PFC0441c.2   2556  2662             106  PF3D7_0310600,
506
           Seqid  Start   End  Disordered_len         Gene ID
3456  PFC0441c.1   2569  2671             102  PF3D7_0310600,
3457  PFC0441c.2   2556  2662             106  PF3D7_0310600,
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
           Seqid  Start    End  Disordered_len         Gene ID
3486  PFC0571c.1  12572  13172       

1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
           Seqid   Start     End  Disordered_len         Gene ID
4124  PFE1465w.1  181966  187625            5659  PF3D7_0529400,
4125  PFE1465w.2  167672  173182            5510  PF3D7_0529400,
1211
           Seqid   Start     End  Disordered_len         Gene ID
4124  PFE1465w.1  181966  187625            5659  PF3D7_0529400,
4125  PFE1465w.2  167672  173182            5510  PF3D7_0529400,
1212
           Seqid   Start     End  Disordered_len         Gene ID
4124  PFE1465w.1  181966  187625            5659  PF3D7_0529400,
4125  PFE1465w.2  167672  173182            5510  PF3D7_0529400,
1212
           Seqid   Start     End  Disordered_len         Gene ID
4124  PFE1465w.1  181966  187625            5659  PF3D7_0529400,
4125  PFE1465w.2  167672  173182            5510  PF3D7_0529400,
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231


1882
1883
            Seqid  Start   End  Disordered_len         Gene ID
413  MAL7P1.160.1   2005  2449             444  PF3D7_0729600,
414  MAL7P1.160.2   1941  2353             412  PF3D7_0729600,
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1895
1896
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
            Seqid   Start     End  Disordered_len         Gene ID
572  MAL8P1.143.1  116017  119527            3510  PF3D7_0803600,
573  MAL8P1.143.2  113327  116838            3511  PF3D7_0803600,
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
           Seqid  Start   End  Disordered_len                       Gene ID
563   MAL8P1.136   3160  4013             

2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
           Seqid  Start   End  Disordered_len         Gene ID
4653  PFI0890c.1   7402  7771             369  PF3D7_0918200,
4654  PFI0890c.2   6922  7289             367  PF3D7_0918200,
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
           Seqid  Start   End  Disordered_len         Gene ID
4682  PFI1030c.1   2525  2652             127  PF3D7_0921000,
4683  PFI1030c.2   2546  2667             121  PF3D7_0921000,
2474
           Seqid  Start   End  Disordered_len         Gene ID
4682  PFI1030c.1   2525  2652             127  PF3D7_0921000,
4683  PFI1030c.2   2546  2667          

2840
2841
2842
2843
2844
2845
2846
2847
2848
2848
2849
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
           Seqid  Start    End  Disordered_len                       Gene ID
1191   PF10_0215  17725  18480             755  PF3D7_1022100,PF3D7_1022200,
1192  PF10_0215a   3956   4239             283                PF3D7_1022100,
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
          Seqid  Start    End  Disordered_len                       Gene ID
1205  PF10_0228    358    543             185  PF3D7_1023500,PF3D7_1023600,
1206  PF10_0229  16656  18347            1691                PF3D7_1023600,
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
           Seqid  Start    End  Disordered_len                       Gene ID
1222   PF10_0246  65736  67691            1955  PF3D7_1025200,PF3D7_1025300,
1223  PF10_0246a   6944   7537            

3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
            Seqid  Start   End  Disordered_len         Gene ID
1720  PF11_0377.1   7391  7705             314  PF3D7_1136500,
1721  PF11_0377.2   7391  7705             314  PF3D7_1136500,
1722  PF11_0377.3   7090  7381             291  PF3D7_1136500,
3439
            Seqid  Start   End  Disordered_len         Gene ID
1720  PF11_0377.1   7391  7705             314  PF3D7_1136500,
1721  PF11_0377.2   7391  7705             314  PF3D7_1136500,
1722  PF11_0377.3   7090  7381             291  PF3D7_1136500,
3440
            Seqid  Start   End  Disordered_len         Gene ID
1720  PF11_0377.1   7391  7705             314  PF3D7_1136500,
1721  PF11_0377.2   7391  7705             314  PF3D7_1136500,
1722  PF11_0377.3   7090  7381             291  PF3D7_1136500,
3440
            Seqid  Start   End  Disordered_len         Gene ID
1720  PF11_0377.1   7391  7705             314  PF3D7_1136500,
1721  PF11_0377.2   7391  7705             314  PF3D7

4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4357
4358
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388


4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
            Seqid  Start   End  Disordered_len         Gene ID
2288  PF14_0089.1   3365  3950             585  PF3D7_1409200,
2289  PF14_0089.2   3150  3738             588  PF3D7_1409200,
4970
            Seqid  Start   End  Disordered_len         Gene ID
2288  PF14_0089.1   3365  3950             585  PF3D7_1409200,
2289  PF14_0089.2   3150  3738             588  PF3D7_1409200,
4971
            Seqid  Start   End  Disordered_len         Gene ID
2288  PF14_0089.1   3365  3950             585  PF3D7_1409200,
2289  PF14_0089.2   3150  3738             588  PF3D7_1409200,
4971
            Seqid  Start   End  Disordered_len         Gene ID
2288  PF14_0089.1   3365  3950             585  PF3D7_1409200,
2289  PF14_0089.2   3150  3738             588  PF3D7_1409200,
4972
4973
4974
4975
4976
4977

5434
5435
5436
5437
5438
5439
            Seqid  Start   End  Disordered_len         Gene ID
2725  PF14_0526.1   1961  2057              96  PF3D7_1455200,
2726  PF14_0526.2   2102  2193              91  PF3D7_1455200,
5439
            Seqid  Start   End  Disordered_len         Gene ID
2725  PF14_0526.1   1961  2057              96  PF3D7_1455200,
2726  PF14_0526.2   2102  2193              91  PF3D7_1455200,
5440
            Seqid  Start   End  Disordered_len         Gene ID
2725  PF14_0526.1   1961  2057              96  PF3D7_1455200,
2726  PF14_0526.2   2102  2193              91  PF3D7_1455200,
5440
            Seqid  Start   End  Disordered_len         Gene ID
2725  PF14_0526.1   1961  2057              96  PF3D7_1455200,
2726  PF14_0526.2   2102  2193              91  PF3D7_1455200,
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480

In [303]:
eslo = dat.loc[( (dat['Class_ber']!='Dispensable') & (dat['Class_ber']!='Fast') ) & (dat['Class_pig']=='Essential') & (dat['Molecular Weight']<20000),]

In [305]:
eslo['Class_ber']
# (eslo.loc[(eslo['Recombinant']==True) & (eslo['Structure']==False),:])

4                     NaN
6                     NaN
50                    NaN
52                    NaN
64              Essential
              ...        
5625    Insufficient data
5629                  NaN
5638                  NaN
5648                  NaN
5666                  NaN
Name: Class_ber, Length: 499, dtype: object

In [306]:
dat.to_csv('full_proteome.csv',index=False)
eslo.to_csv('essentiaLowme.csv',index=False)

In [294]:
dat['Class_ber'].unique()

array([nan, 'Dispensable', 'Essential', 'Slow', 'Insufficient data',
       'Fast'], dtype=object)

In [None]:
res_it = pubmed.query("plasmodium (structure OR recombinant OR heterologous)", max_results=100000)
res = {}
tit = {}
for i,art in enumerate(res_it):
    print(i)
    res[art.pubmed_id] = art.abstract
    tit[art.pubmed_id] = art.title


In [8]:
res

{'32501284': 'Eukaryotic cell proliferation requires chromosome replication and precise segregation to ensure daughter cells have identical genomic copies. The genus ',
 '32499783\n21266965\n24719471\n31630194\n31801556\n20962255\n31182154\n32405064\n15664649\n24737801\n20843207\n20553604\n29444078\n17988945\n31980693\n23275094\n28973483\n12686607\n30298804\n28192523\n31579826\n8078519\n16123303\n30820557\n26216993\n26492873\n30373575\n30696449\n21414208\n21980386\n20174609\n17598897\n29912680\n24401111\n27809852\n31029229\n27837017\n16111789\n7719909\n17192270\n30131879\n15792998\n24855263\n26921176\n7591074\n30665398': 'In our aim to eliminate malaria, more sensitive tools to detect residual transmission are quickly becoming essential. Antimalarial antibody responses persist in the blood after a malaria infection and provide a wider window to detect exposure to infection compared to parasite detection metrics. Here, we aimed to select antibody responses associated with recent and cum

In [6]:
eslo.loc[(eslo['Product Description']!='conserved protein, unknown function') & (eslo['# TM Domains']>0),]
# list(eslo.loc[(eslo['# TM Domains']==0) & eslo['# TM Domains']==0,]['Gene ID'])#.loc[eslo['# TM Domains']>0,]

Unnamed: 0,Gene ID,Product Description,Gene Name or Symbol,Previous ID(s),Entrez Gene ID,UniProt ID,Protein Length,Molecular Weight,# TM Domains,SignalP Peptide,...,PFam Description,Computed GO Components,Class_ber,Class_pig,RelativeGrowth_ber,MutabilityIndexScore_pig,Info,Structure,Recombinant,References
261,PF3D7_0210000,secretory complex protein 61 gamma subunit,Sec61-gamma,Previous IDs: PF02_0094;PFB0450w,812690,O96183,81.0,9285.0,1.0,,...,SecE/Sec61-gamma subunits of protein transloca...,membrane,Essential,Essential,0.14,0.121,The gene is bioinformatically characterized.: ...,False,False,
466,PF3D7_0306700,"ER membrane protein complex subunit 5, putative",EMC5,Previous IDs: PFC0282w,814385,Q9NLB0,116.0,13267.0,2.0,"HMM: MINNVSVAITLVGLLALFKSGYTV, NN: MINNVSVAITL...",...,,membrane,Essential,Essential,0.15,0.449,,False,False,
1051,PF3D7_0513500,mitochondrial import inner membrane translocas...,PAM16,Previous IDs: MAL5P1.135;PFE0670w,812949,Q8I3X2,124.0,14606.0,1.0,"HMM: MLPFRPLSQFVFQFLIITSTALGK, NN: MLPFRPLSQFV...",...,,integral component of membrane;mitochondrial i...,Essential,Essential,0.3,0.342,,False,False,
1108,PF3D7_0519200,V-type proton ATPase 16 kDa proteolipid subunit,,Previous IDs: MAL5P1.193;PFE0965c,813008,Q8I3R7,165.0,17065.0,4.0,,...,ATP synthase subunit C,"proton-transporting V-type ATPase, V0 domain;p...",Essential,Essential,0.05,0.776,,False,False,
1163,PF3D7_0524700,"mitochondrial import receptor subunit TOM22, p...",TOM22,Previous IDs: MAL5P1.246;PFE1230c,813061,Q8I3L8,105.0,12017.0,1.0,,...,Mitochondrial import receptor subunit Tom22,mitochondrial outer membrane,Essential,Essential,0.05,0.466,,False,False,
1335,PF3D7_0608400,"conserved Plasmodium protein, unknown function",,Previous IDs: 2270.t00229;MAL6P1.87;PFF0415c,3885907,C6KST2,106.0,12708.0,1.0,,...,,,Essential,Essential,0.16,0.482,,False,False,
1371,PF3D7_0612000,"conserved Plasmodium protein, unknown function",,Previous IDs: 2270.t00014;MAL6P1.305;PFF0585c,3885928,C6KSW4,84.0,10244.0,1.0,,...,,integral component of membrane;membrane,Essential,Essential,0.17,0.196,,False,False,
1482,PF3D7_0622600,"cytochrome b-c1 complex subunit 9, putative",QCR9,Previous IDs: PFF1085a,9221868,C0H4H6,96.0,11696.0,1.0,,...,"Ubiquinol-cytochrome C reductase, UQCRX/QCR9 like",mitochondrial inner membrane;mitochondrial res...,Essential,Essential,0.19,0.127,,False,False,
2027,PF3D7_0810700,"conserved Plasmodium protein, unknown function",,Previous IDs: MAL8P1.107,2655374,C0H4T8,141.0,16803.0,1.0,,...,,,Essential,Essential,0.03,0.198,,False,False,
2139,PF3D7_0821800,"protein transport protein SEC61 subunit beta, ...",SEC61B,Previous IDs: MAL8P1.51,2655350,C0H4W6,80.0,8629.0,1.0,,...,Sec61beta family,Sec61 translocon complex,Essential,Essential,0.27,0.732,,False,False,


In [445]:
def srchLit(row):
    if not pd.isnull(row['Gene Name or Symbol']):
        gen = row['Gene Name or Symbol']
    else:
        gen = row['Gene ID']
        
    ref = ''
    for art in res:
        ab = res[art]
        if ab is not None and len(ab) > 0:
            if (ab.find(gen) > -1):
                if (ab.find('tructure') > -1) and (ab.find('rystal') > -1 or ab.find('NMR') > -1 or ab.find('cryo') > -1):
                    dat.loc[dat['Gene ID']==row['Gene ID'],'Structure'] = True
                    dat.loc[dat['Gene ID']==row['Gene ID'],'Recombinant'] = True
                    ref = ref + 'PMID: ' + art + ' - ' + tit[art]
                elif ( ab.find('ecombinant') > -1 ):
                    dat.loc[dat['Gene ID']==row['Gene ID'],'Structure'] = True
                    ref = ref + 'PMID: ' + art + ' - ' + tit[art]

    dat.loc[dat['Gene ID']==row['Gene ID'],'References'] = dat.loc[dat['Gene ID']==row['Gene ID'],'References'] + ref
    print('Done '+gen)
    
    return row
    
    
out = jl.Parallel(n_jobs=jl.cpu_count(), verbose=10) (
        jl.delayed( srchLit ) (row) for idx,row in dat.iterrows()
        )
    
eslo = dat.loc[(dat['Class_ber']=='Essential') & (dat['Class_pig']=='Essential') & (dat['Molecular Weight']<20000),]

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    2.0s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    3.1s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    4.7s
[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:    6.3s
[Parallel(n_jobs=16)]: Done  53 tasks      | elapsed:    8.2s
[Parallel(n_jobs=16)]: Done  66 tasks      | elapsed:   10.0s
[Parallel(n_jobs=16)]: Done  81 tasks      | elapsed:   12.3s
[Parallel(n_jobs=16)]: Done  96 tasks      | elapsed:   14.6s
[Parallel(n_jobs=16)]: Done 113 tasks      | elapsed:   17.3s
[Parallel(n_jobs=16)]: Done 130 tasks      | elapsed:   20.0s
[Parallel(n_jobs=16)]: Done 149 tasks      | elapsed:   22.9s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:   25.9s
[Parallel(n_jobs=16)]: Done 189 tasks      | elapsed:   29.1s
[Parallel(n_jobs=16)]: Done 210 tasks      | elapsed:   32.4s
[Parallel(n_jobs=16)]: Done 233 tasks      | elapsed:  

In [464]:
list(dat.loc[(dat['Class_ber']=='Essential') & (dat['Class_pig']=='Essential') & (dat['Molecular Weight']<20000) & (dat['Product Description']=='conserved protein, unknown function'),]['Gene ID'])

['PF3D7_0217400',
 'PF3D7_0305400',
 'PF3D7_0306000',
 'PF3D7_0312000',
 'PF3D7_0722700',
 'PF3D7_1003100',
 'PF3D7_1028800',
 'PF3D7_1121200',
 'PF3D7_1233100',
 'PF3D7_1319500',
 'PF3D7_1336600',
 'PF3D7_1366200',
 'PF3D7_1465400']

In [455]:
dat.loc[(dat['Recombinant']==True),]

Unnamed: 0,Gene ID,Product Description,Gene Name or Symbol,Previous ID(s),Entrez Gene ID,UniProt ID,Protein Length,Molecular Weight,# TM Domains,SignalP Peptide,...,PFam Description,Computed GO Components,Class_ber,Class_pig,RelativeGrowth_ber,MutabilityIndexScore_pig,Info,Structure,Recombinant,References
245,PF3D7_0208500,acyl carrier protein,ACP,Previous IDs: PF02_0081;PFB0385w,812677,Q7KWJ1,137.0,15809.0,0.0,"HMM: MKILLLCIIFLYYVNAF, NN: MKILLLCIIFLYYVNAF",...,Phosphopantetheine attachment site,,Essential,Essential,0.07,0.231,Crystal structures of reduced holo-ACP (PDB co...,True,True,Crystal structures of reduced holo-ACP (PDB co...
521,PF3D7_0312100,E3 ubiquitin-protein ligase,,Previous IDs: MAL3P4.11;PFC0510w,814436,O77347,836.0,98718.0,5.0,,...,Ring finger domain,,Dispensable,Essential,0.97,0.169,In vitro ubiquitylation activity has been obse...,False,True,In vitro ubiquitylation activity has been obse...
572,PF3D7_0316900,"E3 ubiquitin-protein ligase, putative",,Previous IDs: MAL3P6.8;PFC0740c,814483,O77387,600.0,71008.0,1.0,,...,"Zinc finger, C3HC4 type (RING finger)",,Essential,Dispensable,0.16,1.0,recombinant RING domain shows in vitro auto-ub...,False,True,recombinant RING domain shows in vitro auto-ub...
892,PF3D7_0423800,cysteine-rich protective antigen,CyRPA,Previous IDs: MAL4P1.221;PFD1130w,812432,Q8IFM8,362.0,42775.0,0.0,"HMM: MIIPFHKKFISFFQIVLVVLLLCRSI, NN: MIIPFHKKF...",...,,,,Essential,,0.248,Invasion inhibitory monoclonal antibody to CyR...,False,True,Invasion inhibitory monoclonal antibody to CyR...
1069,PF3D7_0515300,phosphatidylinositol 3-kinase,PI3K,Previous IDs: MAL5P1.153;PFE0765w,812967,Q8I3V5,2133.0,255918.0,0.0,,...,Phosphatidylinositol 3- and 4-kinase;Phosphoin...,,Essential,Essential,0.08,0.242,probable target for Tres Cantos antimalarial c...,False,True,probable target for Tres Cantos antimalarial c...
1378,PF3D7_0612700,6-cysteine protein P12,P12,Previous IDs: 2270.t00477;MAL6P1.299;PFF0615c;...,3885775,C6KSX0,347.0,39434.0,1.0,"HMM: MIKLSKKYCLGISFVLYILLSVCEGH, NN: MIKLSKKYC...",...,Sexual stage antigen s48/45 domain,,,Essential,,0.155,Pf12 identified in blood stages in PMID:162037...,True,True,Pf12 identified in blood stages in PMID:162037...
1496,PF3D7_0624000,hexokinase,HK,Previous IDs: 2270.t00126;MAL6P1.189;PFF1155w,3885912,C6KT76,493.0,55260.0,0.0,,...,Hexokinase;Hexokinase,,Essential,Essential,0.08,0.462,Recombinant PfHK is a tetramer: PMID nan | Val...,False,True,Recombinant PfHK is a tetramer: PMID nan | Val...
1763,PF3D7_0717500,calcium-dependent protein kinase 4,CDPK4,Previous IDs: PF07_0072,2655116,Q8IBS5,528.0,60779.0,0.0,,...,Protein kinase domain;EF-hand domain pair,,Slow,Dispensable,0.86,0.999,"Synergises in vitro with Pfmap-2 (PF11_0147), ...",True,True,"Synergises in vitro with Pfmap-2 (PF11_0147), ..."
2317,PF3D7_0905700,"autophagy-related protein 3, putative",ATG3,Previous IDs: PF3D7_0905700.1;PF3D7_0905700.2;...,813336,C0H519,319.0,37475.0,0.0,,...,"Autophagocytosis associated protein (Atg3), N-...",,,,,,also confirmed by cDNA amplification (comment ...,True,True,also confirmed by cDNA amplification (comment ...
2318,PF3D7_0905700,"autophagy-related protein 3, putative",ATG3,Previous IDs: PF3D7_0905700.1;PF3D7_0905700.2;...,813336,C0H519,313.0,36792.0,0.0,,...,"Autophagocytosis associated protein (Atg3), N-...",,,,,,also confirmed by cDNA amplification (comment ...,True,True,also confirmed by cDNA amplification (comment ...


In [463]:
eslo

Unnamed: 0,Gene ID,Product Description,Gene Name or Symbol,Previous ID(s),Entrez Gene ID,UniProt ID,Protein Length,Molecular Weight,# TM Domains,SignalP Peptide,...,PFam Description,Computed GO Components,Class_ber,Class_pig,RelativeGrowth_ber,MutabilityIndexScore_pig,Info,Structure,Recombinant,References
64,PF3D7_0106400,"pre-rRNA-processing protein TSR2, putative",,Previous IDs: MAL1P1.55;PFA0315w,,,156.0,18395.0,0.0,,...,Pre-rRNA-processing protein TSR2,,Essential,Essential,0.07,0.154,,False,False,
103,PF3D7_0110200,"FAD-linked sulfhydryl oxidase ERV1, putative",ERV1,Previous IDs: MAL1P2.29;PFA0500w,,,143.0,17160.0,0.0,,...,Erv1 / Alr family,,Essential,Essential,0.07,0.208,,False,False,
232,PF3D7_0207200,"iron-sulfur assembly protein, putative",IscA1,Previous IDs: PF02_0068;PFB0320c,812664,O96161,160.0,18402.0,0.0,,...,Iron-sulphur cluster biosynthesis,,Essential,Essential,0.14,0.195,,False,False,
245,PF3D7_0208500,acyl carrier protein,ACP,Previous IDs: PF02_0081;PFB0385w,812677,Q7KWJ1,137.0,15809.0,0.0,"HMM: MKILLLCIIFLYYVNAF, NN: MKILLLCIIFLYYVNAF",...,Phosphopantetheine attachment site,,Essential,Essential,0.07,0.231,Crystal structures of reduced holo-ACP (PDB co...,True,True,Crystal structures of reduced holo-ACP (PDB co...
261,PF3D7_0210000,secretory complex protein 61 gamma subunit,Sec61-gamma,Previous IDs: PF02_0094;PFB0450w,812690,O96183,81.0,9285.0,1.0,,...,SecE/Sec61-gamma subunits of protein transloca...,membrane,Essential,Essential,0.14,0.121,The gene is bioinformatically characterized.: ...,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5320,PF3D7_1443300,"U6 snRNA-associated Sm-like protein LSm5, puta...",LSM5,Previous IDs: PF14_0411,811993,Q8IL38,101.0,11370.0,0.0,,...,LSM domain,,Essential,Essential,0.11,0.121,,False,False,
5353,PF3D7_1446600,centrin-2,CEN2,Previous IDs: PF14_0443,812025,Q8IL07,168.0,19310.0,0.0,,...,EF-hand domain pair,,Essential,Essential,0.07,0.804,Characterization of Centrins: PMID nan,False,False,
5360,PF3D7_1447300,"mitochondrial ribosomal protein S14 precursor,...",,Previous IDs: PF14_0451,812033,Q8I6V1,112.0,13320.0,0.0,,...,,,Essential,Essential,0.10,0.194,Putative mitochondrial small ribosomal subunit...,False,False,
5432,PF3D7_1454500,"iron sulfur cluster assembly protein, putative",ISU,Previous IDs: PF14_0518,812100,Q8IKT4,162.0,18056.0,0.0,,...,NifU-like N terminal domain,,Essential,Essential,0.19,0.141,,False,False,


In [393]:
for article in results:

    # Print the type of object we've found (can be either PubMedBookArticle or PubMedArticle)
    print(type(article))

    # Print a JSON representation of the object
    print(article.title)

In [9]:
import gscholar

gscholar.query("kelch13")

['@article{spring2015dihydroartemisinin,\n  title={Dihydroartemisinin-piperaquine failure associated with a triple mutant including kelch13 C580Y in Cambodia: an observational cohort study},\n  author={Spring, Michele D and Lin, Jessica T and Manning, Jessica E and Vanachayangkul, Pattaraporn and Somethy, Sok and Bun, Rathvicheth and Se, Youry and Chann, Soklyda and Ittiverakul, Mali and Sia-ngam, Piyaporn and others},\n  journal={The Lancet Infectious Diseases},\n  volume={15},\n  number={6},\n  pages={683--691},\n  year={2015},\n  publisher={Elsevier}\n}\n']

In [22]:
r=gscholar.query("kelch13",allresults=True)

In [23]:
len(r)

0

In [458]:
dat.to_csv('full_proteome.csv')

In [41]:
het['Gene ID']

0      1122    PF3D7_0520600
Name: Gene ID, dtype: ob...
1      3746    PF3D7_1216200
Name: Gene ID, dtype: ob...
2      3626    PF3D7_1204300
Name: Gene ID, dtype: ob...
3      1538    PF3D7_0628000
Name: Gene ID, dtype: ob...
4      5102    PF3D7_1421800
Name: Gene ID, dtype: ob...
                             ...                        
995    3790    PF3D7_1220500
Name: Gene ID, dtype: ob...
996    3795    PF3D7_1221100
Name: Gene ID, dtype: ob...
997    3958    PF3D7_1237300
Name: Gene ID, dtype: ob...
998    4003    PF3D7_1241800
Name: Gene ID, dtype: ob...
999    4119    PF3D7_1253300
Name: Gene ID, dtype: ob...
Name: Gene ID, Length: 1000, dtype: object

In [34]:
r = het.iloc[0,17]

In [19]:
ids[ ids['Previous ID(s)'].str.find(r['PlasmoDB_ID']) > 0 ]['Gene ID']

1122    PF3D7_0520600
Name: Gene ID, dtype: object

In [20]:
i=0

In [21]:
het[i,:]['Gene ID'] = ids[ ids['Previous ID(s)'].str.find(r['PlasmoDB_ID']) > 0 ]['Gene ID']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
het.iloc[i,-1]

''

In [37]:
het.columns

Index(['SGPPname', 'FusionMW', 'pI', 'pItag', 'Expression seen?', 'TotalAmt',
       'length', 'SEG_longest', 'SEG_total', 'SEG_percent', 'Pfam_size',
       'disopred2 percent', 'disopred2 max length', 'disopred2 10orMore',
       'LongestQN', 'hydrophi_Woods_PNAS_1981 norm',
       'hydropa_Doolittle_JMB_1982 norm', 'PlasmoDB_ID', 'introns',
       'Annotation', 'Nucleotide Sequence', 'Protein Sequence', 'Atpercent',
       'EcoliHomology', 'Totalcodons', 'ATA', 'AGA', 'AAT', 'AGG', 'TTA',
       'TAT', 'ACA', 'TGT', 'AAA', 'ChewBack', 'Gene ID'],
      dtype='object')