In [1]:
import pandas as pd

In [2]:
# Build a one-row-per-cohort summary of the TCGA expression tables:
# gene count, cancer/normal split, total samples and imbalance ratio.
datasets = ["LUAD", "LUSC", "BLCA", "BRCA", "KIRC", "LIHC"]
dataframe_columns = ["Dataset", "#Genes", "#Samples", "All Samples", "Imbalance Ratio"]

statistical_information = []

for dataset in datasets:
    # The first CSV column is consumed as the row index, not as a data column.
    csv_file = f"./TCGA_GDC/dataset/{dataset}_TPM.csv"
    df = pd.read_csv(csv_file, index_col=0)

    # Label 1 is reported as "Cancer" and label 0 as "Normal" in the table;
    # a label that never occurs counts as 0.
    counts = df["label"].value_counts()
    count_0, count_1 = (counts.get(label_value, default=0) for label_value in (0, 1))

    # Ratio of label-1 to label-0 samples; infinite when there is no label-0 sample.
    if count_0 != 0:
        imbalance_ratio = count_1 / count_0
    else:
        imbalance_ratio = float("inf")

    # Every column except "label" is counted as a gene.
    samples_num, column_total = df.shape
    genes_num = column_total - 1

    row = [
        f"TCGA-{dataset}",
        genes_num,
        f"Cancer({count_1}) : Normal({count_0})",
        samples_num,
        imbalance_ratio,
    ]
    statistical_information.append(row)

# Number the rows from 1 instead of pandas' default 0-based index.
result_df = pd.DataFrame(
    statistical_information,
    columns=dataframe_columns,
    index=list(range(1, len(statistical_information) + 1)),
)
result_df

Unnamed: 0,Dataset,#Genes,#Samples,All Samples,Imbalance Ratio
1,TCGA-LUAD,17604,Cancer(513) : Normal(58),571,8.844828
2,TCGA-LUSC,17868,Cancer(496) : Normal(51),547,9.72549
3,TCGA-BLCA,17401,Cancer(403) : Normal(19),422,21.210526
4,TCGA-BRCA,17601,Cancer(1081) : Normal(99),1180,10.919192
5,TCGA-KIRC,17719,Cancer(529) : Normal(72),601,7.347222
6,TCGA-LIHC,16783,Cancer(369) : Normal(50),419,7.38


In [None]:
import json
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib as mpl
from matplotlib import ticker
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm


# Load the gene records and tally how many genes fall into each biotype.
# Each record is indexed with "biotype" (used as a Counter key) and
# "cytoband" (iterated over) -- assumes a list of dicts; TODO confirm schema.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open("./data/GDC_Genes_20908_2023-12-31.json", "r", encoding="utf-8") as f:
    data1 = json.load(f)

print(f"Dictionary list length: {len(data1)}")

biotypes = [entry["biotype"] for entry in data1]
# Flatten the per-gene cytoband sequences into a single list.
cytobands = [band for entry in data1 for band in entry["cytoband"]]

biotype_counts = Counter(biotypes)
biotype_counts_dict = dict(biotype_counts)

# most_common() returns (biotype, count) pairs in descending count order,
# with ties kept in first-seen order.
sorted_biotype_counts = biotype_counts.most_common()
gene_types = [k for k, _ in sorted_biotype_counts]
num = [v for _, v in sorted_biotype_counts]

# The pairs are already in descending order, so a second sort pass is not
# needed; the *_sorted names are kept as independent copies.
gene_types_sorted = list(gene_types)
counts_sorted = list(num)

# Start from matplotlib's default rcParams, then switch all text to the
# bundled TrueType font.
mpl.rcParams.update(mpl.rcParamsDefault)

_font_file = "./fonts/Times New Roman.ttf"
_bold_italic = {"fname": _font_file, "style": "italic", "weight": "bold"}

# Two sizes of the same face: 17 pt and 16 pt.
font_latex1 = fm.FontProperties(size=17, **_bold_italic)
font_latex2 = fm.FontProperties(size=16, **_bold_italic)

# Register the font file with matplotlib's font manager.
fm.fontManager.addfont(_font_file)

prop = font_latex1
plt.rcParams.update(
    {
        "font.family": "sans-serif",
        # Put the font's own family name into the sans-serif slot.
        "font.sans-serif": prop.get_name(),
        "axes.unicode_minus": False,
        "font.size": 16,
    }
)

# Horizontal bar chart of gene counts per biotype, saved as gene_types.png.
fig, ax = plt.subplots(figsize=(13, 9), dpi=600)

bars = ax.barh(gene_types_sorted, counts_sorted, color="#0052D9")

# Apply the 16 pt font and a black colour to every tick label on both axes.
labels = ax.get_xticklabels() + ax.get_yticklabels()
for tick_label in labels:
    tick_label.set_fontproperties(font_latex2)
for tick_label in labels:
    tick_label.set_color("#000000")

ax.tick_params(axis="both", direction="out", labelsize=16, length=4.6, width=1.2)

ax.set_xlabel("Number of Genes", fontproperties=font_latex1, labelpad=9)
ax.set_xlim(left=0, right=22500)
# The stop value sits just above 22500 so that 22500 itself becomes a tick.
ax.set_xticks(np.arange(0, 22500.0000000001, step=2500))

ax.xaxis.set_minor_locator(ticker.AutoMinorLocator())

# Same line width for all four sides of the frame.
lw = 1.33
for side in ("right", "left", "top", "bottom"):
    ax.spines[side].set_linewidth(lw)

# Flip the y-axis so the first category is drawn at the top.
ax.invert_yaxis()

# Write each bar's value just past its end, offset by 1% of the largest count.
label_offset = max(counts_sorted) * 0.01
for bar in bars:
    width = bar.get_width()
    bar_centre = bar.get_y() + bar.get_height() / 2
    ax.text(width + label_offset, bar_centre, f"{width}", va="center")

ax.grid(alpha=0.330, ls="--", which="major", color="#A9A9A9")

plt.tight_layout()
plt.savefig("gene_types.png", dpi=600, transparent=True, bbox_inches="tight")

# Print the biotype tallies (as an ordered dict) and their grand total.
sorted_biotype_counts_dict = dict(sorted_biotype_counts)
print(sorted_biotype_counts_dict)
total_count = sum(counts_sorted)
print(f"计数：{total_count}")

# Tally how often each cytoband occurs in the flattened cytoband list.
cytoband_counts = Counter(cytobands)

# Raw biotype Counter, then the ten most frequent cytobands.
print(biotype_counts)
top_cytobands = cytoband_counts.most_common(10)
print(top_cytobands)

# Count genes per (biotype, cytoband) pair for a fixed set of biotypes,
# export the counts, and draw them as a heatmap.
df = pd.DataFrame(data1)

biotypes_of_interest = [
    "snoRNA",
    "transcribed_unprocessed_pseudogene",
    "processed_pseudogene",
    "protein_coding",
    "lncRNA",
    "miRNA",
]
is_of_interest = df["biotype"].isin(biotypes_of_interest)
filtered_df = df[is_of_interest]

# One row per (gene, cytoband) pair.
exploded_df = filtered_df.explode("cytoband").reset_index(drop=True)

# Long-format counts, ordered by biotype and then by descending count.
distribution_df = (
    exploded_df.groupby(["biotype", "cytoband"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["biotype", "counts"], ascending=[True, False])
)
print(distribution_df.head())
distribution_df.to_csv("distribution.csv", index=False, encoding="utf-8")

# Wide format: cytobands as rows, biotypes as columns; missing pairs become 0.
pivot_df = distribution_df.pivot(
    index="cytoband", columns="biotype", values="counts"
)
pivot_df = pivot_df.fillna(0)

plt.figure(figsize=(14, 8))
heatmap_ax = sns.heatmap(
    pivot_df,
    cmap="YlGnBu",
    linewidths=0.5,
    cbar_kws={"label": "Count"},
)
heatmap_ax.set_title("Distribution of Specific Gene Types Across Chromosomal Bands")
heatmap_ax.set_ylabel("Chromosomal Band")
heatmap_ax.set_xlabel("Gene Type")
plt.xticks(rotation=45)
plt.tight_layout()

export_path = "gene_type_distribution_across_cytobands.csv"
pivot_df.to_csv(export_path, index=True, encoding="utf-8")