In [1]:
import json
from collections import Counter

import pandas as pd
from thefuzz import fuzz
from openpyxl import load_workbook

In [2]:
# Build a gene-symbol -> Ensembl gene ID lookup from a single gene-counts
# file and save it as JSON.
dataset = "LUAD"
file_id = "0052ae83-7ae5-470a-a125-5cd94a9fa9e9"
file_name = "a6a6b9c6-9db7-42b3-a09f-770b7e126fbb.rna_seq.augmented_star_gene_counts.tsv"
sample_path = "/".join([".", "TCGA_GDC", dataset, "samples_info", file_id, file_name])

# The first line of the file is skipped on read and the first four data rows
# are dropped afterwards.
# NOTE(review): presumably a comment header and the STAR summary rows
# (N_unmapped, ...) -- confirm against the file.
df = pd.read_csv(sample_path, sep="\t", skiprows=1)
df = df.iloc[4:]

# Drop every row whose gene ID contains "_PAR_Y", then renumber the index.
is_par_y = df["gene_id"].str.contains("_PAR_Y")
df = df.loc[~is_par_y].reset_index(drop=True)

# When a symbol occurs more than once, the last gene ID seen wins.
tmp_dict = dict(zip(df["gene_name"], df["gene_id"]))

# Sort by symbol and keep only the part of the ID before the first "."
# (the version suffix).
gene_ensg_id_dict = {
    symbol: versioned_id.split(".")[0]
    for symbol, versioned_id in sorted(tmp_dict.items())
}

json_str = json.dumps(gene_ensg_id_dict, ensure_ascii=False, indent=2)
with open("./data/gene_ensg_id_dict.json", "w") as f:
    f.write(json_str)

In [3]:
# Inventory the gene symbols in the annotation file and write the sorted,
# de-duplicated list to disk, one symbol per line.
with open("./data/genes.2023-12-31.json", "r") as f:
    data = json.load(f)

print(f"Gene list length: {len(data)}")

# Show the 15 most frequent symbols so that duplicated symbols stand out.
all_symbols = [record["symbol"] for record in data]
print(Counter(all_symbols).most_common()[:15])

unique_symbols = set(all_symbols)
print(f"Gene set length: {len(unique_symbols)}")

genes = sorted(unique_symbols)

with open("./data/query_genes.txt", "w") as f:
    f.write("\n".join(genes))

Gene list length: 20908
[('Y_RNA', 3), ('SFTA3', 2), ('POLR2J3', 2), ('SMIM40', 2), ('PDE11A', 2), ('ACTL10', 2), ('MATR3', 2), ('KCNK3', 1), ('KRTAP5-9', 1), ('SHC4', 1), ('NELL2', 1), ('ACACB', 1), ('RFWD3', 1), ('CALML5', 1), ('PCDH7', 1)]
Gene set length: 20900


In [4]:
# Build the symbol -> Ensembl gene ID mapping restricted to the TCGA gene
# list and save it as JSON, preserving the order of the gene list.

# Read the gene list once. splitlines() plus the blank-line filter tolerates a
# trailing newline and CRLF line endings, which would otherwise yield an empty
# or "\r"-suffixed symbol and fail the lookup below.
with open("./data/TCGA_Genes.txt", "r") as f:
    tcga_genes = [line.strip() for line in f.read().splitlines() if line.strip()]
tcga_gene_set = set(tcga_genes)  # O(1) membership tests while filtering

with open("./data/genes.2023-12-31.json", "r") as f:
    data = json.load(f)

# If a symbol occurs more than once in the annotation, the last entry wins.
gene_to_ensg_id = {
    dic["symbol"]: dic["gene_id"] for dic in data if dic["symbol"] in tcga_gene_set
}

# Manually pinned Ensembl IDs; these override whatever the annotation gave for
# the same symbols.
tmp_dict = {"SFTA3": "ENSG00000229415", "POLR2J3": "ENSG00000168255", "SMIM40": "ENSG00000286920",
            "PDE11A": "ENSG00000128655", "ACTL10": "ENSG00000288649", "MATR3": "ENSG00000015479"
           }
gene_to_ensg_id.update(tmp_dict)

# Fail with the full list of unresolved symbols instead of a KeyError that
# names only the first one.
missing_genes = [gene for gene in tcga_genes if gene not in gene_to_ensg_id]
if missing_genes:
    raise KeyError(f"No Ensembl gene ID found for: {missing_genes}")

save_gene_to_ensg_id = {gene: gene_to_ensg_id[gene] for gene in tcga_genes}

json_str = json.dumps(save_gene_to_ensg_id, ensure_ascii=False, indent=2)
with open("./data/gene_ensg_id_dict.json", "w") as f:
    f.write(json_str)

In [5]:
# Exploratory pass over the gene summaries: print the ones that mention
# "provided" without any bracket marker, and flag any summary that fits none
# of the known patterns. Stops once more than 10 summaries have been printed.
with open("./data/genes_info.json", "r") as f:
    data = json.load(f)

gene_info_dict = {}
count_ = 0
bracket_markers = (".[", ". [", " [")
for gene_id, info in data.items():
    summary_text = info["summary_info"]

    # Summaries containing any bracket marker are not of interest here.
    if any(marker in summary_text for marker in bracket_markers):
        continue

    if "[provided" in summary_text or " provided " in summary_text:
        count_ += 1
        print(f"Gene ID: {gene_id}, Summary Info: {info['summary_info']}")
    elif summary_text == "NaN":
        continue
    else:
        # Printed for any summary that matches none of the patterns above.
        print("还有其他情况吗？")

    if count_ > 10:
        break

Gene ID: 43, Summary Info: Acetylcholinesterase hydrolyzes the neurotransmitter, acetylcholine at neuromuscular junctions and brain cholinergic synapses, and thus terminates signal transmission. It is also found on the red blood cell membranes, where it constitutes the Yt blood group antigen. Acetylcholinesterase exists in multiple molecular forms which possess similar catalytic properties, but differ in their oligomeric assembly and mode of cell attachment to the cell surface. It is encoded by the single ACHE gene, and the structural diversity in the gene products arises from alternative mRNA splicing, and post-translational associations of catalytic and structural subunits. The major form of acetylcholinesterase found in brain, muscle and other tissues is the hydrophilic species, which forms disulfide-linked oligomers with collagenous, or lipid-containing structural subunits. The other, alternatively spliced form, expressed primarily in the erythroid tissues, differs at the C-termina

In [6]:
def check_period(your_str):
    """Return ``your_str`` with a trailing period guaranteed.

    Args:
        your_str: The text to terminate.

    Returns:
        ``your_str`` unchanged if it already ends with "." or is empty
        (there is no sentence to terminate); otherwise ``your_str`` with "."
        appended.
    """
    # An empty string arises when a marker sits at the very start of a
    # summary; indexing [-1] would raise IndexError there.
    if not your_str:
        return your_str
    return your_str if your_str.endswith(".") else your_str + "."


# Worked example: cut a summary at the last " provided " marker and make sure
# the remaining text ends with a period.
summary = """This gene encodes an enzyme that plays a role in the recovery of retinal photoreceptors from photobleaching. This enzyme promotes the activity of retinal guanylyl cyclase-1 (GC1) at low calcium concentrations and inhibits GC1 at high calcium concentrations. Mutations in this gene can cause cone dystrophy 3 and code-rod dystrophy 14. provided by RefSeq, Jul 2020]"""
attribution_marker = " provided "
last_index = summary.rfind(attribution_marker)
trimmed = summary[:last_index]
summary = check_period(trimmed)
print(summary)

This gene encodes an enzyme that plays a role in the recovery of retinal photoreceptors from photobleaching. This enzyme promotes the activity of retinal guanylyl cyclase-1 (GC1) at low calcium concentrations and inhibits GC1 at high calcium concentrations. Mutations in this gene can cause cone dystrophy 3 and code-rod dystrophy 14.


In [7]:
# Compare each raw summary with its hand-trimmed version to see how
# fuzz.partial_ratio scores a correct trim.
example_pairs = [
    (
        """The protein encoded by this gene belongs to the pancreatic ribonuclease family, a subset of the ribonuclease A superfamily. The protein exhibits antimicrobial activity against pathogenic bacteria [provided by RefSeq, Oct 2014]""",
        """The protein encoded by this gene belongs to the pancreatic ribonuclease family, a subset of the ribonuclease A superfamily. The protein exhibits antimicrobial activity against pathogenic bacteria.""",
    ),
    (
        """This gene encodes a member of the glycosyltransferase family. The specific function of this protein has not been determined. Alternative splicing results in multiple transcript variants of this gene [provided by RefSeq, May 2013]""",
        """This gene encodes a member of the glycosyltransferase family. The specific function of this protein has not been determined. Alternative splicing results in multiple transcript variants of this gene.""",
    ),
]

for summary, new_summary in example_pairs:
    print(fuzz.partial_ratio(summary, new_summary))

100
100


In [8]:
# Reload the gene records from disk so the cleaning pass below starts from the
# file contents rather than from whatever `data` currently holds.
with open("./data/genes_info.json") as f:
    data = json.loads(f.read())

def check_period(your_str):
    """Return ``your_str`` with a trailing period guaranteed.

    Args:
        your_str: The text to terminate.

    Returns:
        ``your_str`` unchanged if it already ends with "." or is empty
        (there is no sentence to terminate); otherwise ``your_str`` with "."
        appended.
    """
    # An empty string arises when a marker sits at the very start of a
    # summary; indexing [-1] would raise IndexError there.
    if not your_str:
        return your_str
    return your_str if your_str.endswith(".") else your_str + "."


def check_correctness(gene_id, summary, new_summary):
    """Print a diagnostic report when a trimmed summary looks suspicious.

    Two independent checks are run, and each prints the same report when it
    fires (so the report can appear twice for one gene):

    1. the trimmed text has more than 8 fewer space-separated words than the
       original;
    2. ``fuzz.partial_ratio(summary, new_summary)`` is below 99.9.

    Args:
        gene_id: Identifier shown in the report.
        summary: The original summary text.
        new_summary: The summary after trimming.
    """
    reduce_words = len(summary.split(" ")) - len(new_summary.split(" "))
    rule = "-" * 88

    def _print_report():
        # One line per item, framed by a horizontal rule.
        for item in (
            rule,
            "FuzzyWuzzy：简单易用的字符串模糊匹配工具",
            f"Gene ID: {gene_id}",
            summary,
            new_summary,
            reduce_words,
            rule,
        ):
            print(item)

    if reduce_words > 8:
        _print_report()

    if fuzz.partial_ratio(summary, new_summary) < 99.9:
        _print_report()


# Strip the trailing attribution from every gene summary and save the cleaned
# records.
#
# Markers are tried in priority order. The first one found in a summary decides
# the cut, which is made at that marker's LAST occurrence.
ATTRIBUTION_MARKERS = (".[", ". [", " [", "[provided", " provided ")

gene_info_dict = {}
unmatched_gene_ids = []
for gene_id, info in data.items():
    summary = info["summary_info"]

    marker = next((m for m in ATTRIBUTION_MARKERS if m in summary), None)
    if marker is not None:
        new_summary = check_period(summary[:summary.rfind(marker)])
    elif summary == "NaN":
        # Placeholder for a missing summary: kept as is.
        new_summary = summary
    else:
        # No known pattern: the record is left out of the output, but it is
        # remembered so the omission is reported instead of being silent.
        unmatched_gene_ids.append(gene_id)
        continue

    check_correctness(gene_id, summary, new_summary)
    info["summary_info"] = new_summary
    gene_info_dict[gene_id] = info

if unmatched_gene_ids:
    print(
        f"Skipped {len(unmatched_gene_ids)} gene(s) whose summary matched no known pattern; "
        f"first IDs: {unmatched_gene_ids[:10]}"
    )

json_str = json.dumps(gene_info_dict, ensure_ascii=False, indent=2)
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly instead of relying on the platform default.
with open("genes_info_clean.json", "w", encoding="utf-8") as f:
    f.write(json_str)

----------------------------------------------------------------------------------------
FuzzyWuzzy：简单易用的字符串模糊匹配工具
Gene ID: 53827
This gene encodes a member of a family of small membrane proteins that share a 35-amino acid signature sequence domain, beginning with the sequence PFXYD and containing 7 invariant and 6 highly conserved amino acids. The approved human gene nomenclature for the family is FXYD-domain containing ion transport regulator. Mouse FXYD5 has been termed RIC (Related to Ion Channel). FXYD2, also known as the gamma subunit of the Na,K-ATPase, regulates the properties of that enzyme. FXYD1 (phospholemman), FXYD2 (gamma), FXYD3 (MAT-8), FXYD4 (CHIF), and FXYD5 (RIC) have been shown to induce channel activity in experimental expression systems. Transmembrane topology has been established for two family members (FXYD1 and FXYD2), with the N-terminus extracellular and the C-terminus on the cytoplasmic side of the membrane. This gene product, FXYD5, is a glycoprotein that f

In [9]:
# COSMIC - https://cancer.sanger.ac.uk/cosmic/download
# Expert Curation of Genes - https://cancer.sanger.ac.uk/cosmic/curation
#
# pandas loads only the cell values, so the same workbook is opened a second
# time with openpyxl to recover the hyperlink target attached to each cell of
# the first column. The targets are added as a "Hyperlinks" column.
# NOTE(review): assumes "Sheet1" is the sheet pandas reads by default -- confirm.
df = pd.read_excel("./data/Expert_Curation_of_Genes.xlsx")
wb = load_workbook("./data/Expert_Curation_of_Genes.xlsx")
sheet = wb["Sheet1"]

hyperlinks = []

# Worksheet rows are 1-based and pandas used the first row as the header, so
# DataFrame row i corresponds to worksheet row i + 2.
for row in range(2, len(df) + 2):
    link = sheet.cell(row, 1).hyperlink
    # A cell without a hyperlink has `hyperlink` set to None; record a missing
    # value rather than failing with AttributeError on `.target`.
    hyperlinks.append(link.target if link is not None else None)

df["Hyperlinks"] = hyperlinks
df

Unnamed: 0,Genes,Samples,Mutations,Papers,Hyperlinks
0,ABL1,40329,844,913,https://cancer.sanger.ac.uk/cosmic/gene/analys...
1,ACVR1,14329,124,183,https://cancer.sanger.ac.uk/cosmic/gene/analys...
2,ACVR1B,3787,58,166,https://cancer.sanger.ac.uk/cosmic/gene/analys...
3,ACVR2A,4905,332,234,https://cancer.sanger.ac.uk/cosmic/gene/analys...
4,AKT1,63618,1533,1238,https://cancer.sanger.ac.uk/cosmic/gene/analys...
...,...,...,...,...,...
267,VHL,38699,2984,903,https://cancer.sanger.ac.uk/cosmic/gene/analys...
268,WT1,39330,1493,784,https://cancer.sanger.ac.uk/cosmic/gene/analys...
269,XPO1,20593,351,338,https://cancer.sanger.ac.uk/cosmic/gene/analys...
270,ZFHX3,14801,725,346,https://cancer.sanger.ac.uk/cosmic/gene/analys...


In [10]:
# Save the table, including the recovered hyperlinks, to the working directory.
# The `encoding` argument of DataFrame.to_excel was deprecated in pandas 1.5
# and removed in 2.0, where passing it raises TypeError, so it is omitted here.
df.to_excel("Expert_Curation_of_Genes.xlsx", index=False)