In [4]:
### Housekeeping ###
import os, requests, time

### Data ###
import numpy as np
import pandas as pd
import scipy.io

### Visualization ###
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

### Machine Learning ###
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

### Statistics ###
import scipy.stats as stats
import statsmodels.stats.multitest as smm

### RNA-Seq ###
from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats
from pydeseq2.utils import load_example_data

import gseapy as gp
from gseapy.plot import gseaplot



In [5]:
dir = os.path.join('..', 'data', 'E-GEOD-60052.csv')
df = pd.read_csv(dir, index_col=0)
df

Unnamed: 0,condition,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,ENSG00000281913,ENSG00000281914,ENSG00000281915,ENSG00000281916,ENSG00000281917,ENSG00000281918,ENSG00000281919,ENSG00000281920,ENSG00000281921,ENSG00000281922
SRR1797218,normal,99,51,33,67,39,128,597,20,171,...,0,0,0,0,0,0,0,0,0,0
SRR1797219,normal,33,2,52,141,110,419,454,57,198,...,0,0,0,0,0,0,0,0,0,0
SRR1797220,normal,14,26,11,45,28,197,473,17,90,...,0,0,0,0,0,6,0,0,0,0
SRR1797221,normal,14,13,0,23,52,76,65,14,30,...,0,0,0,0,0,0,0,1,0,0
SRR1797222,normal,48,0,12,183,53,314,134,60,299,...,0,0,0,0,0,6,0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR1797299,small cell lung carcinoma,85,58,147,117,127,0,222,196,221,...,0,0,0,0,0,12,0,3,0,0
SRR1797300,small cell lung carcinoma,42,85,18,18,41,241,756,9,195,...,0,0,0,0,0,0,0,0,0,0
SRR1797301,small cell lung carcinoma,49,4,121,167,224,47,466,57,444,...,0,0,0,0,0,32,0,0,0,0
SRR1797302,small cell lung carcinoma,109,0,66,27,100,139,583,70,139,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df = df.loc[:,~df.columns.duplicated()].copy()
df

Unnamed: 0,condition,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,ENSG00000281913,ENSG00000281914,ENSG00000281915,ENSG00000281916,ENSG00000281917,ENSG00000281918,ENSG00000281919,ENSG00000281920,ENSG00000281921,ENSG00000281922
SRR1797218,normal,99,51,33,67,39,128,597,20,171,...,0,0,0,0,0,0,0,0,0,0
SRR1797219,normal,33,2,52,141,110,419,454,57,198,...,0,0,0,0,0,0,0,0,0,0
SRR1797220,normal,14,26,11,45,28,197,473,17,90,...,0,0,0,0,0,6,0,0,0,0
SRR1797221,normal,14,13,0,23,52,76,65,14,30,...,0,0,0,0,0,0,0,1,0,0
SRR1797222,normal,48,0,12,183,53,314,134,60,299,...,0,0,0,0,0,6,0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR1797299,small cell lung carcinoma,85,58,147,117,127,0,222,196,221,...,0,0,0,0,0,12,0,3,0,0
SRR1797300,small cell lung carcinoma,42,85,18,18,41,241,756,9,195,...,0,0,0,0,0,0,0,0,0,0
SRR1797301,small cell lung carcinoma,49,4,121,167,224,47,466,57,444,...,0,0,0,0,0,32,0,0,0,0
SRR1797302,small cell lung carcinoma,109,0,66,27,100,139,583,70,139,...,0,0,0,0,0,0,0,0,0,0


In [7]:
meta_df = df['condition'].reset_index().rename(columns={'index':'sample'})
meta_df = meta_df.set_index('sample').rename_axis(None)
meta_df

Unnamed: 0,condition
SRR1797218,normal
SRR1797219,normal
SRR1797220,normal
SRR1797221,normal
SRR1797222,normal
...,...
SRR1797299,small cell lung carcinoma
SRR1797300,small cell lung carcinoma
SRR1797301,small cell lung carcinoma
SRR1797302,small cell lung carcinoma


In [8]:
df = df.drop(columns=['condition'])
df

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,ENSG00000281913,ENSG00000281914,ENSG00000281915,ENSG00000281916,ENSG00000281917,ENSG00000281918,ENSG00000281919,ENSG00000281920,ENSG00000281921,ENSG00000281922
SRR1797218,99,51,33,67,39,128,597,20,171,153,...,0,0,0,0,0,0,0,0,0,0
SRR1797219,33,2,52,141,110,419,454,57,198,155,...,0,0,0,0,0,0,0,0,0,0
SRR1797220,14,26,11,45,28,197,473,17,90,147,...,0,0,0,0,0,6,0,0,0,0
SRR1797221,14,13,0,23,52,76,65,14,30,11,...,0,0,0,0,0,0,0,1,0,0
SRR1797222,48,0,12,183,53,314,134,60,299,133,...,0,0,0,0,0,6,0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR1797299,85,58,147,117,127,0,222,196,221,642,...,0,0,0,0,0,12,0,3,0,0
SRR1797300,42,85,18,18,41,241,756,9,195,184,...,0,0,0,0,0,0,0,0,0,0
SRR1797301,49,4,121,167,224,47,466,57,444,266,...,0,0,0,0,0,32,0,0,0,0
SRR1797302,109,0,66,27,100,139,583,70,139,212,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df.shape

(86, 65217)

In [10]:
genes_to_keep = df.columns[df.sum(axis=0) >= 10]
len(genes_to_keep)

56016

In [11]:
filtered_df = df[genes_to_keep]
filtered_df

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,ENSG00000281896,LINC02246,ENSG00000281904,ENSG00000281905,HERC2P7.1,SNORA50A.1,LINC01144,ENSG00000281918,ENSG00000281920,ENSG00000281921
SRR1797218,99,51,33,67,39,128,597,20,171,153,...,6,9,6,0,0,0,1,0,0,0
SRR1797219,33,2,52,141,110,419,454,57,198,155,...,79,15,31,0,2,25,0,0,0,0
SRR1797220,14,26,11,45,28,197,473,17,90,147,...,15,0,9,0,0,0,0,6,0,0
SRR1797221,14,13,0,23,52,76,65,14,30,11,...,13,0,135,0,0,0,0,0,1,0
SRR1797222,48,0,12,183,53,314,134,60,299,133,...,65,0,38,0,0,7,0,6,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR1797299,85,58,147,117,127,0,222,196,221,642,...,19,0,37,0,0,6,26,12,3,0
SRR1797300,42,85,18,18,41,241,756,9,195,184,...,15,3,3,0,0,2,0,0,0,0
SRR1797301,49,4,121,167,224,47,466,57,444,266,...,46,0,75,0,2,0,13,32,0,0
SRR1797302,109,0,66,27,100,139,583,70,139,212,...,24,0,75,0,0,0,0,0,0,0


In [12]:
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    counts=df,
    metadata=meta_df,
    design_factors="condition",
    refit_cooks=True,
    inference=inference,
)

In [13]:
dds

AnnData object with n_obs × n_vars = 86 × 65217
    obs: 'condition'
    obsm: 'design_matrix'

In [14]:
dds.deseq2()

Fitting size factors...
... done in 0.07 seconds.

Fitting dispersions...
... done in 4.73 seconds.

Fitting dispersion trend curve...
... done in 1.06 seconds.

Fitting MAP dispersions...
... done in 4.58 seconds.

Fitting LFCs...
... done in 3.56 seconds.

Refitting 3111 outliers.

Fitting dispersions...
... done in 0.25 seconds.

Fitting MAP dispersions...
... done in 0.25 seconds.

Fitting LFCs...
... done in 0.22 seconds.

  self[:, self.new_all_zeroes_genes].varm["_normed_means"] = np.zeros(
  self[:, self.new_all_zeroes_genes].varm["LFC"] = np.zeros(


In [15]:
dds

AnnData object with n_obs × n_vars = 86 × 65217
    obs: 'condition'
    uns: 'trend_coeffs', '_squared_logres', 'prior_disp_var'
    obsm: 'design_matrix', 'size_factors', 'replaceable'
    varm: 'non_zero', '_MoM_dispersions', 'genewise_dispersions', '_genewise_converged', '_normed_means', 'fitted_dispersions', 'MAP_dispersions', '_MAP_converged', 'dispersions', '_outlier_genes', 'LFC', '_LFC_converged', 'replaced'
    layers: 'normed_counts', '_mu_hat', '_mu_LFC', '_hat_diagonals', 'cooks', 'replace_cooks'

In [16]:
stat_res = DeseqStats(dds, inference=inference)
stat_res.summary()

Running Wald tests...
... done in 1.76 seconds.



Log2 fold change & Wald test p-value: condition small cell lung carcinoma vs normal
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
TSPAN6            50.068813        0.186334  0.412251  0.451991  0.651276   
TNMD               6.722537       -1.378788  1.111712 -1.240239  0.214887   
DPM1              38.271571        0.624634  0.422328  1.479025  0.139134   
SCYL3            106.323251        0.600443  0.258667  2.321295  0.020271   
C1orf112         122.884403        1.117855  0.351354  3.181562  0.001465   
...                     ...             ...       ...       ...       ...   
ENSG00000281918    7.537101        2.352306  0.964029  2.440079  0.014684   
ENSG00000281919    0.000000             NaN       NaN       NaN       NaN   
ENSG00000281920    9.557405        3.829074  1.430825  2.676129  0.007448   
ENSG00000281921    0.719867        2.229193  3.400970  0.655458  0.512173   
ENSG00000281922    0.000000             NaN       NaN       NaN      

In [17]:
results = stat_res.results_df
results

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
TSPAN6,50.068813,0.186334,0.412251,0.451991,0.651276,0.815979
TNMD,6.722537,-1.378788,1.111712,-1.240239,0.214887,0.422666
DPM1,38.271571,0.624634,0.422328,1.479025,0.139134,0.319629
SCYL3,106.323251,0.600443,0.258667,2.321295,0.020271,0.087005
C1orf112,122.884403,1.117855,0.351354,3.181562,0.001465,0.013681
...,...,...,...,...,...,...
ENSG00000281918,7.537101,2.352306,0.964029,2.440079,0.014684,0.069911
ENSG00000281919,0.000000,,,,,
ENSG00000281920,9.557405,3.829074,1.430825,2.676129,0.007448,0.043905
ENSG00000281921,0.719867,2.229193,3.400970,0.655458,0.512173,


In [18]:
results = results[(results['padj'] < 0.05)]
results = results[(abs(results['log2FoldChange']) > 0.05)]
results = results[(results['baseMean'] > 20)]
results

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
C1orf112,122.884403,1.117855,0.351354,3.181562,0.001465,0.013681
FGR,55.528507,-2.368954,0.576093,-4.112105,0.000039,0.000900
STPG1,86.443931,1.000003,0.338278,2.956154,0.003115,0.023684
MYH16,49.771122,1.938114,0.675608,2.868698,0.004122,0.028888
CD99,205.021743,-1.877996,0.489489,-3.836647,0.000125,0.002217
...,...,...,...,...,...,...
ENSG00000280783,26.814512,1.791200,0.519264,3.449501,0.000562,0.006823
LINC00294,175.706382,0.808593,0.276039,2.929269,0.003398,0.025166
LINC00997,177.999021,0.975662,0.292882,3.331246,0.000865,0.009264
LINC00506,2091.780524,1.252806,0.476021,2.631827,0.008493,0.047939


In [19]:
results.sort_values(by=['log2FoldChange'], ascending=False, inplace=True)
results

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
HNRNPA1P42,27.615679,7.679569,1.482099,5.181550,2.200499e-07,1.581557e-05
PABPC1P11,32.590847,7.503462,1.215853,6.171359,6.770569e-10,1.806902e-07
POLR3GP2,21.844447,7.330169,1.703677,4.302558,1.688376e-05,4.764404e-04
FAM83A,20.423203,7.233236,1.482740,4.878289,1.070103e-06,5.649392e-05
THEG,22.237465,6.950658,1.370182,5.072798,3.920092e-07,2.515929e-05
...,...,...,...,...,...,...
SFTPA1,1822.220830,-4.748982,0.898820,-5.283576,1.266862e-07,1.050671e-05
SFTPA2,2831.561330,-4.798173,0.906014,-5.295915,1.184220e-07,9.968003e-06
PGC,108.757986,-4.935291,0.707069,-6.979933,2.953201e-12,2.904853e-09
ADAMTS7P3,51.561853,-4.970657,0.993212,-5.004630,5.596948e-07,3.362759e-05


In [20]:
df

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,ENSG00000281913,ENSG00000281914,ENSG00000281915,ENSG00000281916,ENSG00000281917,ENSG00000281918,ENSG00000281919,ENSG00000281920,ENSG00000281921,ENSG00000281922
SRR1797218,99,51,33,67,39,128,597,20,171,153,...,0,0,0,0,0,0,0,0,0,0
SRR1797219,33,2,52,141,110,419,454,57,198,155,...,0,0,0,0,0,0,0,0,0,0
SRR1797220,14,26,11,45,28,197,473,17,90,147,...,0,0,0,0,0,6,0,0,0,0
SRR1797221,14,13,0,23,52,76,65,14,30,11,...,0,0,0,0,0,0,0,1,0,0
SRR1797222,48,0,12,183,53,314,134,60,299,133,...,0,0,0,0,0,6,0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR1797299,85,58,147,117,127,0,222,196,221,642,...,0,0,0,0,0,12,0,3,0,0
SRR1797300,42,85,18,18,41,241,756,9,195,184,...,0,0,0,0,0,0,0,0,0,0
SRR1797301,49,4,121,167,224,47,466,57,444,266,...,0,0,0,0,0,32,0,0,0,0
SRR1797302,109,0,66,27,100,139,583,70,139,212,...,0,0,0,0,0,0,0,0,0,0


In [21]:
results

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
HNRNPA1P42,27.615679,7.679569,1.482099,5.181550,2.200499e-07,1.581557e-05
PABPC1P11,32.590847,7.503462,1.215853,6.171359,6.770569e-10,1.806902e-07
POLR3GP2,21.844447,7.330169,1.703677,4.302558,1.688376e-05,4.764404e-04
FAM83A,20.423203,7.233236,1.482740,4.878289,1.070103e-06,5.649392e-05
THEG,22.237465,6.950658,1.370182,5.072798,3.920092e-07,2.515929e-05
...,...,...,...,...,...,...
SFTPA1,1822.220830,-4.748982,0.898820,-5.283576,1.266862e-07,1.050671e-05
SFTPA2,2831.561330,-4.798173,0.906014,-5.295915,1.184220e-07,9.968003e-06
PGC,108.757986,-4.935291,0.707069,-6.979933,2.953201e-12,2.904853e-09
ADAMTS7P3,51.561853,-4.970657,0.993212,-5.004630,5.596948e-07,3.362759e-05


In [24]:
data = pd.concat([results.head(50), results.tail(50)])
data

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
HNRNPA1P42,27.615679,7.679569,1.482099,5.181550,2.200499e-07,1.581557e-05
PABPC1P11,32.590847,7.503462,1.215853,6.171359,6.770569e-10,1.806902e-07
POLR3GP2,21.844447,7.330169,1.703677,4.302558,1.688376e-05,4.764404e-04
FAM83A,20.423203,7.233236,1.482740,4.878289,1.070103e-06,5.649392e-05
THEG,22.237465,6.950658,1.370182,5.072798,3.920092e-07,2.515929e-05
...,...,...,...,...,...,...
SFTPA1,1822.220830,-4.748982,0.898820,-5.283576,1.266862e-07,1.050671e-05
SFTPA2,2831.561330,-4.798173,0.906014,-5.295915,1.184220e-07,9.968003e-06
PGC,108.757986,-4.935291,0.707069,-6.979933,2.953201e-12,2.904853e-09
ADAMTS7P3,51.561853,-4.970657,0.993212,-5.004630,5.596948e-07,3.362759e-05


In [39]:
data.index

Index(['HNRNPA1P42', 'PABPC1P11', 'POLR3GP2', 'FAM83A', 'THEG', 'HOXD10',
       'NEUROG3', 'CYP4F23P', 'VGF', 'KRTAP10-10', 'ENSG00000246528', 'GRK1',
       'NKX2-2', 'NGB', 'PATE1', 'SCRT1', 'ENSG00000254314', 'ENSG00000272763',
       'LINC00552', 'C1QL1', 'HNRNPA3P16', 'TCEAL6', 'TTC9B', 'CNGA3', 'PITX2',
       'NEUROD4', 'SCGB2A2', 'TP73-AS3', 'MIR217HG', 'DLX2', 'LINGO2',
       'TMEM151B', 'ENSG00000272372', 'SLC45A2', 'SLC6A17', 'RALYL', 'CRIP3',
       'KCNH7', 'TRIM9', 'OPRK1', 'ENSG00000231754', 'CDH22', 'ATP4B', 'FOXN4',
       'CRABP1', 'DMRTA2', 'C14orf180', 'ARNILA', 'GNRH2', 'CNGB1', 'CLIC5',
       'FENDRR', 'LGI3', 'PRX', 'TMPRSS2', 'SLC6A4', 'CXCL8', 'SFTPB', 'TCF21',
       'AOC3', 'SDR16C5', 'SLC39A8', 'TMEM100', 'GPIHBP1', 'LHFPL3-AS2',
       'VIPR1', 'SUSD2', 'FOSB', 'SCEL', 'CXCL2', 'ANKRD1', 'SFTPD', 'RGCC',
       'EDN1', 'HBA1', 'IL1RL1', 'CSRNP1', 'SCGB1A1', 'CAVIN2', 'CSF3', 'MSLN',
       'ICAM1', 'FAM107A', 'CLDN18', 'SLC34A2', 'BTNL9', 'AGER', 'EPAS1'

In [40]:
genes = list(data.index)
genes[:5]

['HNRNPA1P42', 'PABPC1P11', 'POLR3GP2', 'FAM83A', 'THEG']

In [41]:
def get_gene_name(ensembl_id):
    if ensembl_id[:4] == 'ENSG':
        try:
            url = f"https://mygene.info/v3/gene/{ensembl_id}"
            response = requests.get(url)
            data = response.json()

            # Check if 'symbol' is in the data and not None
            if 'symbol' in data and data['symbol'] is not None:
                return data['symbol']
            else:
                return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None
    else:
        return ensembl_id

In [42]:
genes = [get_gene_name(g) for g in genes]
genes[-5:]

['SFTPA1', 'SFTPA2', 'PGC', 'ADAMTS7P3', 'SFTPC']

In [47]:
def fetch_interactions(gene_name):
    url = "https://string-db.org/api/json/network"
    retries = 0
    max_retries = 3

    while retries < max_retries:
        try:
            response = requests.get(url, params={"identifiers": gene_name})
            response.raise_for_status()  # Raises a HTTPError for bad responses
            # If response is successful, parse and return the JSON data
            interactions_data = response.json()
            return interactions_data
        except requests.RequestException as e:
            print(f"Attempt {retries + 1} failed: {e}")
            if retries == max_retries - 1:
                print(f"Failed to fetch interactions for {gene_name} after {max_retries} attempts.")
                return []  # Return an empty list after all retries have been exhausted
        finally:
            retries += 1

In [48]:
all_pairs = []
for gene in genes:
    interactions_json = fetch_interactions(gene)
    for interaction in interactions_json:
        nameA = interaction['preferredName_A']
        nameB = interaction['preferredName_B']
        all_pairs.append([nameA, nameB])
all_pairs

Attempt 1 failed: 400 Client Error: Bad Request for url: https://string-db.org/api/json/network?identifiers=HNRNPA1P42
Attempt 2 failed: 400 Client Error: Bad Request for url: https://string-db.org/api/json/network?identifiers=HNRNPA1P42
Attempt 3 failed: 400 Client Error: Bad Request for url: https://string-db.org/api/json/network?identifiers=HNRNPA1P42
Failed to fetch interactions for HNRNPA1P42 after 3 attempts.
Attempt 1 failed: 400 Client Error: Bad Request for url: https://string-db.org/api/json/network?identifiers=PABPC1P11
Attempt 2 failed: 400 Client Error: Bad Request for url: https://string-db.org/api/json/network?identifiers=PABPC1P11
Attempt 3 failed: 400 Client Error: Bad Request for url: https://string-db.org/api/json/network?identifiers=PABPC1P11
Failed to fetch interactions for PABPC1P11 after 3 attempts.
Attempt 1 failed: 400 Client Error: Bad Request for url: https://string-db.org/api/json/network?identifiers=POLR3GP2
Attempt 2 failed: 400 Client Error: Bad Request f

[['DHX35', 'DERL1'],
 ['DHX35', 'FAM83A'],
 ['DHX35', 'ZHX3'],
 ['DHX35', 'RAB5IF'],
 ['DHX35', 'TBC1D31'],
 ['DERL1', 'FAM83A'],
 ['DERL1', 'RAB5IF'],
 ['DERL1', 'TBC1D31'],
 ['CSNK1A1', 'FAM83A'],
 ['CSNK1A1', 'CSNK1E'],
 ['TBC1D31', 'RAB5IF'],
 ['TBC1D31', 'FAM83A'],
 ['TBC1D31', 'ZHX3'],
 ['TP53I13', 'FAM83A'],
 ['TP53I13', 'LOC107987373'],
 ['TP53I13', 'MRPL23'],
 ['ZHX3', 'FAM83A'],
 ['ZHX3', 'RAB5IF'],
 ['RAB5IF', 'FAM83A'],
 ['CSNK1E', 'FAM83A'],
 ['MRPL23', 'FAM83A'],
 ['FAM83A', 'LOC107987373'],
 ['PRSS54', 'CT62'],
 ['PRSS54', 'THEG'],
 ['ACRBP', 'THEG'],
 ['ACRBP', 'SPACA3'],
 ['MIER2', 'R3HDM4'],
 ['MIER2', 'THEG'],
 ['SPACA3', 'CT62'],
 ['SPACA3', 'THEG'],
 ['CCT5', 'THEG'],
 ['SPATA12', 'THEG'],
 ['THEG', 'R3HDM4'],
 ['THEG', 'PECAM1'],
 ['THEG', 'DNAJC28'],
 ['THEG', 'CT62'],
 ['HOXB7', 'HOXD9'],
 ['HOXB7', 'MEIS1'],
 ['HOXB7', 'HOXD10'],
 ['HOXD9', 'HOXD1'],
 ['HOXD9', 'EVX2'],
 ['HOXD9', 'MEIS1'],
 ['HOXD9', 'SHH'],
 ['HOXD9', 'HOXD13'],
 ['HOXD9', 'HOXD11'],
 ['HOXD9

In [50]:
len(all_pairs)

2778

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [76]:
df

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,ENSG00000281913,ENSG00000281914,ENSG00000281915,ENSG00000281916,ENSG00000281917,ENSG00000281918,ENSG00000281919,ENSG00000281920,ENSG00000281921,ENSG00000281922
SRR1797218,99,51,33,67,39,128,597,20,171,153,...,0,0,0,0,0,0,0,0,0,0
SRR1797219,33,2,52,141,110,419,454,57,198,155,...,0,0,0,0,0,0,0,0,0,0
SRR1797220,14,26,11,45,28,197,473,17,90,147,...,0,0,0,0,0,6,0,0,0,0
SRR1797221,14,13,0,23,52,76,65,14,30,11,...,0,0,0,0,0,0,0,1,0,0
SRR1797222,48,0,12,183,53,314,134,60,299,133,...,0,0,0,0,0,6,0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR1797299,85,58,147,117,127,0,222,196,221,642,...,0,0,0,0,0,12,0,3,0,0
SRR1797300,42,85,18,18,41,241,756,9,195,184,...,0,0,0,0,0,0,0,0,0,0
SRR1797301,49,4,121,167,224,47,466,57,444,266,...,0,0,0,0,0,32,0,0,0,0
SRR1797302,109,0,66,27,100,139,583,70,139,212,...,0,0,0,0,0,0,0,0,0,0


In [79]:
y = ['healthy' if int(i[-3:]) < 225 else 'cancer' for i in list(df.index.astype(str))]
y[-5:]

['cancer', 'cancer', 'cancer', 'cancer', 'cancer']

In [88]:
X = df.values
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[ 0.68984067,  2.92912503, -0.42068496, ..., -0.55717466,
        -0.28420367,  0.        ],
       [-0.52315054, -0.38808348, -0.05131407, ..., -0.55717466,
        -0.28420367,  0.        ],
       [-0.87234498,  1.23667171, -0.84837757, ..., -0.55717466,
        -0.28420367,  0.        ],
       ...,
       [-0.22909207, -0.25268722,  1.29008547, ..., -0.55717466,
        -0.28420367,  0.        ],
       [ 0.87362721, -0.52347975,  0.22085395, ..., -0.55717466,
        -0.28420367,  0.        ],
       [-1.09288884,  1.03357731, -0.84837757, ..., -0.55717466,
        -0.28420367,  0.        ]])

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [90]:
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, y_train)

In [91]:
y_pred = rf_classifier.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

      cancer       1.00      1.00      1.00        17
     healthy       1.00      1.00      1.00         1

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18



In [92]:
importances = rf_classifier.feature_importances_
importances

array([0., 0., 0., ..., 0., 0., 0.])