# This script extracts all the information collected about a single gene.

In [14]:
import ast
import glob
import json
import os
import re

In [15]:
# Locations of the files this script reads and writes.

# Inputs (produced by the earlier steps of the pipeline).
json_path = './Autism_genepheno_results/Extraced_results'  # output of step 1
np_dir = './Autism_genepheno_results/Sum_all/n_p.txt'  # output of step 1
ng_dir = './Autism_genepheno_results/Sum_all/n_g.txt'  # output of step 1
In_Summary_dir = './Autism_genepheno_results/Sum_all/In_Summary.txt'  # output of step 1
NPMI_dir = './Autism_genepheno_results/NPMI_file/NPMI.json'  # output of step 2

# Output directory for the per-gene files written at the end of the script.
one_information_dir = './Autism_genepheno_results/one_gene_information/'

In [16]:
# Name of the gene whose information is extracted by this script.
gene_extract = 'SHANK3'

In [17]:
# List the step-1 JSON result files.
# os.path.join() uses the platform's separator (the previous hard-coded "\\"
# only matched on Windows), and sorting makes the processing order -- and
# therefore the sentence numbering derived from it -- reproducible.
jsons = sorted(glob.glob(os.path.join(json_path, "*.json")))

In [18]:
# Container for everything collected about the selected gene.
# The None / empty placeholders are filled in by the later steps of the script.
gene_extract_property_dict = {
    "Gene name": gene_extract,
    "Gene sfari class": None,
    "Related sentences": {},
    "Related phenotype NPMI": {},
    "Summary": {
        "Paper number": None,
        "Paper list": [],
        "Sentence number": None,
        "Normolized phenotype number": None,
    },
}

In [19]:
# Read the summary files stored under ./Sum_all.
def _parse_n_tot(summary_text):
    """Return the integer that follows the first ``'N_tot = '`` in *summary_text*.

    Raises:
        ValueError: if no ``N_tot = <integer>`` entry is present.
    """
    # Match the digits directly instead of slicing up to the next heading:
    # the result then does not depend on how many characters (spaces,
    # "\n", "\r\n", ...) separate the number from whatever follows it.
    match = re.search(r'N_tot = \s*([+-]?\d+)', summary_text)
    if match is None:
        raise ValueError("no 'N_tot = <integer>' entry found in the summary file")
    return int(match.group(1))


def read_file(np_path=None, ng_path=None, summary_path=None):
    """Load the phenotype dict, the gene dict and N_tot.

    Args:
        np_path: file holding a Python dict literal (phenotypes);
            defaults to the module-level ``np_dir``.
        ng_path: file holding a Python dict literal (genes);
            defaults to the module-level ``ng_dir``.
        summary_path: text file containing an ``N_tot = <integer>`` entry;
            defaults to the module-level ``In_Summary_dir``.

    Returns:
        A tuple ``(phenotype_dict, gene_dict, n_tot)``.

    Raises:
        ValueError: if the summary file has no ``N_tot = <integer>`` entry.
    """
    # Fall back to the script-level paths so existing ``read_file()`` calls
    # keep working unchanged.
    if np_path is None:
        np_path = np_dir
    if ng_path is None:
        ng_path = ng_dir
    if summary_path is None:
        summary_path = In_Summary_dir

    # read n_p (ast.literal_eval only evaluates literals, never arbitrary code)
    with open(np_path, 'r', encoding='utf-8-sig') as o:
        phenotype_dict = ast.literal_eval(o.read())

    # read n_g
    with open(ng_path, 'r', encoding='utf-8-sig') as o:
        gene_dict = ast.literal_eval(o.read())

    # read n_tot
    with open(summary_path, 'r', encoding='utf-8-sig') as o:
        n_tot = _parse_n_tot(o.read())

    return phenotype_dict, gene_dict, n_tot

# Load the two dicts and the N_tot value from the step-1 summary files.
phenotype_dict, gene_dict, n_tot = read_file()


In [20]:
# Turn the keys of gene_dict and phenotype_dict into plain lists.
def get_list(gene_dict,phenotype_dict):
    """Return ``(gene_list, phenotype_list)``.

    Each list holds the keys of the corresponding dict, in the dict's own
    iteration order.
    """
    gene_names = list(gene_dict.keys())
    phenotype_names = list(phenotype_dict.keys())
    return gene_names, phenotype_names

# Key lists of the two dicts loaded above.
# NOTE(review): neither list is referenced again in the visible part of this script.
gene_list,phenotype_list = get_list(gene_dict,phenotype_dict)

In [21]:
# Collect, from the step-1 result files, every sentence that mentions the gene.
# Each match is stored under 'Related sentences' as 'Sentence001', 'Sentence002', ...
# together with the PMCid and Title of the file it came from.
Sentence_number = 0
related_sentences = gene_extract_property_dict['Related sentences']
for json_file in jsons:
    # Load the whole document first so the file handle is released before
    # the sentences are processed.
    with open(json_file, 'r', encoding='utf-8-sig') as f:
        json_data = json.load(f)

    for sentence in json_data['Sentences'].values():
        # NOTE(review): if 'Gene' is a string rather than a list this is a
        # substring test, so other symbols containing the name would also
        # match -- confirm the step-1 schema.
        if gene_extract not in sentence['Gene']:
            continue

        Sentence_number += 1
        # Paper-level fields first; the sentence's own fields are merged on
        # top of them (and win on a key clash, as before).
        record = {'PMCid': json_data['PMCid'], 'Title': json_data['Title']}
        record.update(sentence)
        related_sentences['Sentence' + str(Sentence_number).zfill(3)] = record
       

In [22]:
# Pull the selected gene's NPMI records out of the step-2 results.
with open(NPMI_dir, 'r', encoding='utf-8-sig') as f:
    NPMI_data = json.load(f)

# Keep only this gene's records, ordered from highest to lowest NPMI.
extracted_gene_NPMI_list = sorted(
    (record for record in NPMI_data if record['gene'] == gene_extract),
    key=lambda record: -record["NPMI"],
)

# Store phenotype -> NPMI in that order.
npmi_by_phenotype = gene_extract_property_dict['Related phenotype NPMI']
for record in extracted_gene_NPMI_list:
    npmi_by_phenotype[record["phenotype"]] = record["NPMI"]

# The SFARI class is taken from the first (highest-NPMI) record, if any.
if extracted_gene_NPMI_list:
    gene_extract_property_dict["Gene sfari class"] = extracted_gene_NPMI_list[0]["gene_sfari_class"]

In [23]:
# Fill in the 'Summary' section for the extracted gene.
summary = gene_extract_property_dict['Summary']
related_sentences = gene_extract_property_dict['Related sentences']

summary['Sentence number'] = Sentence_number
summary['Normolized phenotype number'] = len(gene_extract_property_dict['Related phenotype NPMI'])

# De-duplicate with dict.fromkeys() rather than set(): it keeps first-seen
# order, so both lists come out in the same order on every run (the
# iteration order of a set of strings changes between interpreter runs).
summary['Paper list'] = list(dict.fromkeys(
    sentence['PMCid'] for sentence in related_sentences.values()))
summary['Paper name list'] = list(dict.fromkeys(
    sentence['Title'] for sentence in related_sentences.values()))
summary['Paper number'] = len(summary['Paper list'])

In [24]:
# Write everything collected about the gene to "<gene>_information.json".
# Create the output directory first so that a missing folder does not make
# the script fail at the very last step.
os.makedirs(one_information_dir, exist_ok=True)
with open(os.path.join(one_information_dir, gene_extract + "_information.json"), 'w', encoding='utf-8-sig') as f:
    json.dump(gene_extract_property_dict , f , sort_keys=True, indent=4, separators=(',', ': '))
                

In [25]:
# Write a plain-text summary to "<gene>_summary.txt".
with open(one_information_dir+gene_extract+"_summary.txt", 'w', encoding='utf-8-sig') as f:
    summary = gene_extract_property_dict['Summary']

    pieces = [
        "Gene name: %s \n" % gene_extract,
        "Paper number: %d \n" % summary['Paper number'],
        "Paper list: \n",
    ]
    # Paper ids share one line, each followed by a tab.
    pieces.extend(paper + '\t' for paper in summary['Paper list'])
    pieces.append('\n')

    # Paper titles are written one per line, then a blank line.
    pieces.append("Paper name list: \n")
    pieces.extend(paper + '\n' for paper in summary['Paper name list'])
    pieces.append('\n')

    pieces.append("Sentence number: %d \n" % summary['Sentence number'])
    pieces.append("Normolized phenotype number: %d \n" % summary['Normolized phenotype number'])

    f.write("".join(pieces))
    