In [1]:
import pandas as pd
import plotly.express as px

In [2]:
gtex_phenotype_df = pd.read_csv('./GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt', sep='\t')
gtex_phenotype_df

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0
...,...,...,...,...
975,GTEX-ZYY3,2,60-69,4.0
976,GTEX-ZZ64,1,20-29,0.0
977,GTEX-ZZPT,1,50-59,4.0
978,GTEX-ZZPU,2,50-59,0.0


In [3]:
# Use plotly to make a histogram of selected column.
def plot_age_histogram(df, column='age'):
    fig = px.histogram(df, x=column, title='Age distribution')
    fig.update_xaxes(categoryorder='category ascending')
    fig.update_layout(bargap=0.05)
    fig.show()

plot_age_histogram(gtex_phenotype_df, column='AGE')

In [39]:
# Read GTEX RNA-seq data.
tissue = "muscle_skeletal" # "whole_blood"
gtex_rnaseq_df = pd.read_csv('./gene_tpm_2017-06-05_v8_%s.gct' % tissue, sep='\t', skiprows=2)
gtex_rnaseq_df= gtex_rnaseq_df.T
gtex_rnaseq_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56190,56191,56192,56193,56194,56195,56196,56197,56198,56199
id,0,1,2,3,4,5,6,7,8,9,...,56190,56191,56192,56193,56194,56195,56196,56197,56198,56199
Name,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3,...,ENSG00000198886.2,ENSG00000210176.1,ENSG00000210184.1,ENSG00000210191.1,ENSG00000198786.2,ENSG00000198695.2,ENSG00000210194.1,ENSG00000198727.2,ENSG00000210195.2,ENSG00000210196.2
Description,DDX11L1,WASH7P,MIR6859-1,MIR1302-2HG,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,CICP27,...,MT-ND4,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP
GTEX-1117F-0426-SM-5EGHI,0.0,3.861,0.0,0.0,0.0,0.056,0.05,0.1025,0.0457,0.0136,...,34030.0,0.0,0.0,0.0,10400.0,14750.0,44.31,26310.0,6.414,6.226
GTEX-111CU-2026-SM-5GZZC,0.0,1.036,0.0,0.0,0.0,0.0,0.1098,0.0,0.0335,0.0,...,35790.0,3.49,3.498,1.454,14430.0,17820.0,59.83,33600.0,1.042,2.024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYW4-0526-SM-5GZZ5,0.0,1.245,0.0,0.0,0.0,0.0447,0.1199,0.0,0.0,0.0,...,27840.0,0.5444,0.0,0.0,11310.0,17720.0,55.53,21220.0,2.277,2.21
GTEX-ZYY3-0526-SM-5E45G,0.0,1.552,0.0,0.0366,0.0,0.0755,0.0,0.1036,0.0308,0.0,...,46560.0,9.649,1.612,0.8931,6926.0,7197.0,20.68,28550.0,3.363,1.399
GTEX-ZZ64-1526-SM-5E43K,0.0,1.819,0.0,0.0,0.0,0.0389,0.0347,0.0356,0.0159,0.0,...,38160.0,0.4731,0.5533,1.839,13390.0,18340.0,71.44,31030.0,2.968,1.92
GTEX-ZZPT-0626-SM-5GZXT,0.028,3.604,0.0,0.1116,0.0,0.0,0.0514,0.0527,0.0,0.0,...,46920.0,1.401,0.8193,0.0,4464.0,3488.0,5.605,28810.0,0.7324,0.7109


In [40]:
# Add sample_ids column.
gtex_rnaseq_df['sample_id'] = gtex_rnaseq_df.index.to_series().apply(lambda x: '-'.join(x.split('-')[0:2]))
gtex_rnaseq_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56191,56192,56193,56194,56195,56196,56197,56198,56199,sample_id
id,0,1,2,3,4,5,6,7,8,9,...,56191,56192,56193,56194,56195,56196,56197,56198,56199,id
Name,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3,...,ENSG00000210176.1,ENSG00000210184.1,ENSG00000210191.1,ENSG00000198786.2,ENSG00000198695.2,ENSG00000210194.1,ENSG00000198727.2,ENSG00000210195.2,ENSG00000210196.2,Name
Description,DDX11L1,WASH7P,MIR6859-1,MIR1302-2HG,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,CICP27,...,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP,Description
GTEX-1117F-0426-SM-5EGHI,0.0,3.861,0.0,0.0,0.0,0.056,0.05,0.1025,0.0457,0.0136,...,0.0,0.0,0.0,10400.0,14750.0,44.31,26310.0,6.414,6.226,GTEX-1117F
GTEX-111CU-2026-SM-5GZZC,0.0,1.036,0.0,0.0,0.0,0.0,0.1098,0.0,0.0335,0.0,...,3.49,3.498,1.454,14430.0,17820.0,59.83,33600.0,1.042,2.024,GTEX-111CU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYW4-0526-SM-5GZZ5,0.0,1.245,0.0,0.0,0.0,0.0447,0.1199,0.0,0.0,0.0,...,0.5444,0.0,0.0,11310.0,17720.0,55.53,21220.0,2.277,2.21,GTEX-ZYW4
GTEX-ZYY3-0526-SM-5E45G,0.0,1.552,0.0,0.0366,0.0,0.0755,0.0,0.1036,0.0308,0.0,...,9.649,1.612,0.8931,6926.0,7197.0,20.68,28550.0,3.363,1.399,GTEX-ZYY3
GTEX-ZZ64-1526-SM-5E43K,0.0,1.819,0.0,0.0,0.0,0.0389,0.0347,0.0356,0.0159,0.0,...,0.4731,0.5533,1.839,13390.0,18340.0,71.44,31030.0,2.968,1.92,GTEX-ZZ64
GTEX-ZZPT-0626-SM-5GZXT,0.028,3.604,0.0,0.1116,0.0,0.0,0.0514,0.0527,0.0,0.0,...,1.401,0.8193,0.0,4464.0,3488.0,5.605,28810.0,0.7324,0.7109,GTEX-ZZPT


In [41]:
# Add age column.
def get_age(row):
    subj_row = gtex_phenotype_df[gtex_phenotype_df['SUBJID'] == row['sample_id']]
    if subj_row.empty:
        return None
    else:
        return subj_row['AGE'].values[0]
    
gtex_rnaseq_df['age'] = gtex_rnaseq_df.apply(get_age, axis=1)
gtex_rnaseq_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56192,56193,56194,56195,56196,56197,56198,56199,sample_id,age
id,0,1,2,3,4,5,6,7,8,9,...,56192,56193,56194,56195,56196,56197,56198,56199,id,
Name,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3,...,ENSG00000210184.1,ENSG00000210191.1,ENSG00000198786.2,ENSG00000198695.2,ENSG00000210194.1,ENSG00000198727.2,ENSG00000210195.2,ENSG00000210196.2,Name,
Description,DDX11L1,WASH7P,MIR6859-1,MIR1302-2HG,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,CICP27,...,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP,Description,
GTEX-1117F-0426-SM-5EGHI,0.0,3.861,0.0,0.0,0.0,0.056,0.05,0.1025,0.0457,0.0136,...,0.0,0.0,10400.0,14750.0,44.31,26310.0,6.414,6.226,GTEX-1117F,60-69
GTEX-111CU-2026-SM-5GZZC,0.0,1.036,0.0,0.0,0.0,0.0,0.1098,0.0,0.0335,0.0,...,3.498,1.454,14430.0,17820.0,59.83,33600.0,1.042,2.024,GTEX-111CU,50-59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYW4-0526-SM-5GZZ5,0.0,1.245,0.0,0.0,0.0,0.0447,0.1199,0.0,0.0,0.0,...,0.0,0.0,11310.0,17720.0,55.53,21220.0,2.277,2.21,GTEX-ZYW4,60-69
GTEX-ZYY3-0526-SM-5E45G,0.0,1.552,0.0,0.0366,0.0,0.0755,0.0,0.1036,0.0308,0.0,...,1.612,0.8931,6926.0,7197.0,20.68,28550.0,3.363,1.399,GTEX-ZYY3,60-69
GTEX-ZZ64-1526-SM-5E43K,0.0,1.819,0.0,0.0,0.0,0.0389,0.0347,0.0356,0.0159,0.0,...,0.5533,1.839,13390.0,18340.0,71.44,31030.0,2.968,1.92,GTEX-ZZ64,20-29
GTEX-ZZPT-0626-SM-5GZXT,0.028,3.604,0.0,0.1116,0.0,0.0,0.0514,0.0527,0.0,0.0,...,0.8193,0.0,4464.0,3488.0,5.605,28810.0,0.7324,0.7109,GTEX-ZZPT,50-59


In [42]:
# Create final dataframe with RNA-seq data and age.
final_df = gtex_rnaseq_df

# Set column names.
column_names = final_df.loc['Description']
column_names[-2:] = ['SUBJID', 'AGE']
final_df.columns = column_names

# Remove first two rows.
final_df = gtex_rnaseq_df.iloc[3:]
final_df

# Remove miscellaneous column.
final_df = final_df.drop(columns=['SUBJID'])
final_df.to_csv('gtex_rnaseq_with_age.tsv', index=False, sep='\t')
final_df

Description,DDX11L1,WASH7P,MIR6859-1,MIR1302-2HG,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,CICP27,...,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP,AGE
GTEX-1117F-0426-SM-5EGHI,0.0,3.861,0.0,0.0,0.0,0.056,0.05,0.1025,0.0457,0.0136,...,0.0,0.0,0.0,10400.0,14750.0,44.31,26310.0,6.414,6.226,60-69
GTEX-111CU-2026-SM-5GZZC,0.0,1.036,0.0,0.0,0.0,0.0,0.1098,0.0,0.0335,0.0,...,3.49,3.498,1.454,14430.0,17820.0,59.83,33600.0,1.042,2.024,50-59
GTEX-111FC-0326-SM-5GZZ1,0.0177,1.197,0.0,0.0,0.0251,0.0728,0.0975,0.0999,0.0594,0.0,...,0.0,0.5181,0.4305,6303.0,6102.0,18.16,35310.0,1.853,2.697,60-69
GTEX-111VG-2626-SM-5GZY2,0.0,0.4647,0.0,0.0,0.0266,0.0773,0.0,0.0354,0.0158,0.0094,...,0.9414,0.0,0.4574,7044.0,9476.0,31.54,18520.0,0.4921,0.9552,60-69
GTEX-111YS-2326-SM-5987L,0.0,0.8542,0.0,0.0,0.0642,0.1397,0.0,0.1278,0.0,0.0,...,0.0,1.326,0.0,11030.0,13820.0,48.19,31630.0,2.371,2.301,60-69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYW4-0526-SM-5GZZ5,0.0,1.245,0.0,0.0,0.0,0.0447,0.1199,0.0,0.0,0.0,...,0.5444,0.0,0.0,11310.0,17720.0,55.53,21220.0,2.277,2.21,60-69
GTEX-ZYY3-0526-SM-5E45G,0.0,1.552,0.0,0.0366,0.0,0.0755,0.0,0.1036,0.0308,0.0,...,9.649,1.612,0.8931,6926.0,7197.0,20.68,28550.0,3.363,1.399,60-69
GTEX-ZZ64-1526-SM-5E43K,0.0,1.819,0.0,0.0,0.0,0.0389,0.0347,0.0356,0.0159,0.0,...,0.4731,0.5533,1.839,13390.0,18340.0,71.44,31030.0,2.968,1.92,20-29
GTEX-ZZPT-0626-SM-5GZXT,0.028,3.604,0.0,0.1116,0.0,0.0,0.0514,0.0527,0.0,0.0,...,1.401,0.8193,0.0,4464.0,3488.0,5.605,28810.0,0.7324,0.7109,50-59


In [43]:
# Select K best features and create a new dataset.
K = 4000
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k=K)
X = final_df.drop(['AGE'], axis=1)
Y = final_df['AGE']
selector.fit(X, Y)
selected_features = X.columns[selector.get_support()]
selected_features_df = X[selected_features]
selected_features_df['AGE'] = Y
selected_features_df



Features [   14    40   177 ... 56160 56161 56162] are constant.


invalid value encountered in divide



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Description,RP4-669L17.10,RP11-206L10.2,RP11-206L10.9,AGRN,TNFRSF18,LINC01786,SCNN1D,INTS11,MXRA8,CCNL2,...,CLIC2,PRKY,XGY1,ANOS2P,FAM224B,TTTY10,MT-ND1,MT-ND2,MT-ND4L,AGE
GTEX-1117F-0426-SM-5EGHI,0.1441,2.655,0.3496,6.278,0.3777,1.284,1.952,49.75,16.01,69.4,...,3.048,0.0,0.0,0.0,0.0,0.0,38030.0,24550.0,14200.0,60-69
GTEX-111CU-2026-SM-5GZZC,0.1216,0.5669,0.212,2.171,0.1507,0.6706,0.8034,31.44,10.03,43.79,...,1.938,1.114,0.0,0.0,0.0,0.0997,35830.0,29250.0,21980.0,50-59
GTEX-111FC-0326-SM-5GZZ1,0.2017,0.423,0.349,3.133,0.3124,2.085,2.559,31.36,22.79,37.7,...,1.216,1.119,0.0685,0.0387,0.0,0.2125,27240.0,18090.0,24400.0,60-69
GTEX-111VG-2626-SM-5GZY2,0.1607,0.8483,0.2767,2.217,0.6875,2.279,2.687,33.16,21.74,33.6,...,2.367,2.279,0.0728,0.0411,0.0,0.3199,22440.0,12760.0,14040.0,60-69
GTEX-111YS-2326-SM-5987L,0.0922,0.4987,0.3758,1.824,0.1713,1.449,0.9427,33.76,11.92,41.34,...,3.196,1.375,0.614,0.033,0.0,0.272,32810.0,22350.0,17400.0,60-69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYW4-0526-SM-5GZZ5,0.0531,1.454,0.2451,2.455,0.1097,1.025,1.633,30.76,21.81,51.43,...,4.55,1.494,0.1684,0.0159,0.0,0.3917,25790.0,17140.0,11540.0,60-69
GTEX-ZYY3-0526-SM-5E45G,0.0822,1.42,0.2586,2.126,0.5785,1.421,1.457,39.49,13.75,66.46,...,2.4,0.0419,0.0,0.0,0.0,0.0,23730.0,25210.0,20930.0,60-69
GTEX-ZZ64-1526-SM-5E43K,0.1,0.5735,0.1834,2.912,0.0715,0.4455,0.7137,27.97,13.38,45.61,...,3.908,0.9536,0.2196,0.0276,0.0,0.0189,35880.0,24940.0,18490.0,20-29
GTEX-ZZPT-0626-SM-5GZXT,0.1595,1.706,0.7448,6.172,0.8116,1.602,5.176,80.71,44.47,132.9,...,8.613,2.594,0.2168,0.0816,0.0,0.3081,29870.0,38770.0,20880.0,50-59


In [44]:
# Make AGE column categorical.
categories = {
    '20-29': 0,
    '30-39': 1, 
    '40-49': 2,
    '50-59': 3,
    '60-69': 4,
    '70-79': 5
}

selected_features_df['AGE_CAT'] = selected_features_df['AGE'].apply(lambda x: categories[x])
selected_features_df




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Description,RP4-669L17.10,RP11-206L10.2,RP11-206L10.9,AGRN,TNFRSF18,LINC01786,SCNN1D,INTS11,MXRA8,CCNL2,...,PRKY,XGY1,ANOS2P,FAM224B,TTTY10,MT-ND1,MT-ND2,MT-ND4L,AGE,AGE_CAT
GTEX-1117F-0426-SM-5EGHI,0.1441,2.655,0.3496,6.278,0.3777,1.284,1.952,49.75,16.01,69.4,...,0.0,0.0,0.0,0.0,0.0,38030.0,24550.0,14200.0,60-69,4
GTEX-111CU-2026-SM-5GZZC,0.1216,0.5669,0.212,2.171,0.1507,0.6706,0.8034,31.44,10.03,43.79,...,1.114,0.0,0.0,0.0,0.0997,35830.0,29250.0,21980.0,50-59,3
GTEX-111FC-0326-SM-5GZZ1,0.2017,0.423,0.349,3.133,0.3124,2.085,2.559,31.36,22.79,37.7,...,1.119,0.0685,0.0387,0.0,0.2125,27240.0,18090.0,24400.0,60-69,4
GTEX-111VG-2626-SM-5GZY2,0.1607,0.8483,0.2767,2.217,0.6875,2.279,2.687,33.16,21.74,33.6,...,2.279,0.0728,0.0411,0.0,0.3199,22440.0,12760.0,14040.0,60-69,4
GTEX-111YS-2326-SM-5987L,0.0922,0.4987,0.3758,1.824,0.1713,1.449,0.9427,33.76,11.92,41.34,...,1.375,0.614,0.033,0.0,0.272,32810.0,22350.0,17400.0,60-69,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYW4-0526-SM-5GZZ5,0.0531,1.454,0.2451,2.455,0.1097,1.025,1.633,30.76,21.81,51.43,...,1.494,0.1684,0.0159,0.0,0.3917,25790.0,17140.0,11540.0,60-69,4
GTEX-ZYY3-0526-SM-5E45G,0.0822,1.42,0.2586,2.126,0.5785,1.421,1.457,39.49,13.75,66.46,...,0.0419,0.0,0.0,0.0,0.0,23730.0,25210.0,20930.0,60-69,4
GTEX-ZZ64-1526-SM-5E43K,0.1,0.5735,0.1834,2.912,0.0715,0.4455,0.7137,27.97,13.38,45.61,...,0.9536,0.2196,0.0276,0.0,0.0189,35880.0,24940.0,18490.0,20-29,0
GTEX-ZZPT-0626-SM-5GZXT,0.1595,1.706,0.7448,6.172,0.8116,1.602,5.176,80.71,44.47,132.9,...,2.594,0.2168,0.0816,0.0,0.3081,29870.0,38770.0,20880.0,50-59,3


In [45]:
# Save the final dataframe.
to_save_df = selected_features_df.drop(columns=['AGE'])
to_save_df.to_csv('gtex_rnaseq_with_age_%s_%i_selected_features.tsv' % (tissue, K), sep='\t', index=False)
to_save_df

Description,RP4-669L17.10,RP11-206L10.2,RP11-206L10.9,AGRN,TNFRSF18,LINC01786,SCNN1D,INTS11,MXRA8,CCNL2,...,CLIC2,PRKY,XGY1,ANOS2P,FAM224B,TTTY10,MT-ND1,MT-ND2,MT-ND4L,AGE_CAT
GTEX-1117F-0426-SM-5EGHI,0.1441,2.655,0.3496,6.278,0.3777,1.284,1.952,49.75,16.01,69.4,...,3.048,0.0,0.0,0.0,0.0,0.0,38030.0,24550.0,14200.0,4
GTEX-111CU-2026-SM-5GZZC,0.1216,0.5669,0.212,2.171,0.1507,0.6706,0.8034,31.44,10.03,43.79,...,1.938,1.114,0.0,0.0,0.0,0.0997,35830.0,29250.0,21980.0,3
GTEX-111FC-0326-SM-5GZZ1,0.2017,0.423,0.349,3.133,0.3124,2.085,2.559,31.36,22.79,37.7,...,1.216,1.119,0.0685,0.0387,0.0,0.2125,27240.0,18090.0,24400.0,4
GTEX-111VG-2626-SM-5GZY2,0.1607,0.8483,0.2767,2.217,0.6875,2.279,2.687,33.16,21.74,33.6,...,2.367,2.279,0.0728,0.0411,0.0,0.3199,22440.0,12760.0,14040.0,4
GTEX-111YS-2326-SM-5987L,0.0922,0.4987,0.3758,1.824,0.1713,1.449,0.9427,33.76,11.92,41.34,...,3.196,1.375,0.614,0.033,0.0,0.272,32810.0,22350.0,17400.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYW4-0526-SM-5GZZ5,0.0531,1.454,0.2451,2.455,0.1097,1.025,1.633,30.76,21.81,51.43,...,4.55,1.494,0.1684,0.0159,0.0,0.3917,25790.0,17140.0,11540.0,4
GTEX-ZYY3-0526-SM-5E45G,0.0822,1.42,0.2586,2.126,0.5785,1.421,1.457,39.49,13.75,66.46,...,2.4,0.0419,0.0,0.0,0.0,0.0,23730.0,25210.0,20930.0,4
GTEX-ZZ64-1526-SM-5E43K,0.1,0.5735,0.1834,2.912,0.0715,0.4455,0.7137,27.97,13.38,45.61,...,3.908,0.9536,0.2196,0.0276,0.0,0.0189,35880.0,24940.0,18490.0,0
GTEX-ZZPT-0626-SM-5GZXT,0.1595,1.706,0.7448,6.172,0.8116,1.602,5.176,80.71,44.47,132.9,...,8.613,2.594,0.2168,0.0816,0.0,0.3081,29870.0,38770.0,20880.0,3
