In [51]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [1]:
import train_model as tm
import visualize as vis
import build_features as bf
import make_dataset as md

In [24]:
# Fixed seed passed to train_test_split below so the split is reproducible
rand_state = 1

# Load files into memory

In [4]:
# Paths (relative to the working directory) of the two input .tsv files:
# the raw per-species count table and the matching per-sample metadata table.
counts_filename = 'data/count_data_species_raw_WIS_overlapping_fungi_bacteria_12773samples.tsv'
metadata_filename = 'data/metadata_species_WIS_overlapping_fungi_bacteria_12773samples.tsv'

In [7]:
# Load both tables with the same project reader; the previews further down
# show each resulting frame indexed by `sampleid`.
counts = md.read_fungi_data(counts_filename)
metadata = md.read_fungi_data(metadata_filename)

In [16]:
# Join metadata and counts on the shared `sampleid` index level.
# `validate="one_to_one"` makes pandas raise a MergeError if a sample id is
# duplicated on either side, instead of silently multiplying rows.
# TODO: consider selecting metadata columns before combining
combined = pd.merge(
    metadata,
    counts,
    on="sampleid",
    how="inner",
    validate="one_to_one",
)

In [8]:
# List the metadata fields available for each sample
metadata.columns

Index(['sample_name', 'run_prefix', 'experimental_strategy', 'cgc_base_name',
       'filename', 'analyte_amount', 'analyte_A260A280Ratio',
       'aliquot_concentration', 'cgc_id', 'cgc_filename', 'vital_status',
       'gender', 'race', 'ethnicity', 'disease_type', 'data_subtype',
       'tcga_sample_id', 'cgc_case_uuid', 'sample_type', 'cgc_platform',
       'gdc_file_uuid', 'primary_site', 'age_at_diagnosis', 'cgc_sample_uuid',
       'cgc_aliquot_uuid', 'tcga_aliquot_id', 'reference_genome',
       'tcga_case_id', 'investigation', 'days_to_death', 'knightlabID',
       'data_submitting_center_label', 'tissue_source_site_label',
       'country_of_sample_procurement', 'portion_is_ffpe',
       'pathologic_t_label', 'pathologic_n_label',
       'histological_diagnosis_label', 'pathologic_stage_label', 'PlateCenter',
       'PlateCenterFlag'],
      dtype='object')

In [10]:
# Preview the first metadata rows
metadata.head()

Unnamed: 0_level_0,sample_name,run_prefix,experimental_strategy,cgc_base_name,filename,analyte_amount,analyte_A260A280Ratio,aliquot_concentration,cgc_id,cgc_filename,...,data_submitting_center_label,tissue_source_site_label,country_of_sample_procurement,portion_is_ffpe,pathologic_t_label,pathologic_n_label,histological_diagnosis_label,pathologic_stage_label,PlateCenter,PlateCenterFlag
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13722.58cfa82de4b0c9d6adf6a4c2,58cfa82de4b0c9d6adf6a4c2,12dd02ea14d0a87df23ce3bef406fe27.filtered.,WGS,12dd02ea14d0a87df23ce3bef406fe27,12dd02ea14d0a87df23ce3bef406fe27.bam,32.75,,0.08,58cfa82de4b0c9d6adf6a4c2,12dd02ea14d0a87df23ce3bef406fe27.bam,...,Washington University School of Medicine,Duke,United States,NO,T2,N0,Infiltrating Ductal Carcinoma,Stage IIA,A21Q-09,True
13722.58cfa82de4b0c9d6adf6a502,58cfa82de4b0c9d6adf6a502,3d9f475186150ea055fddf25af7bb7e3.filtered.,WGS,3d9f475186150ea055fddf25af7bb7e3,3d9f475186150ea055fddf25af7bb7e3.bam,64.35,1.7,0.08,58cfa82de4b0c9d6adf6a502,3d9f475186150ea055fddf25af7bb7e3.bam,...,Washington University School of Medicine,University of North Carolina,United States,NO,Not available,Not available,Endometrioid endometrial adenocarcinoma,Not available,A13L-09,False
13722.58cfa82de4b0c9d6adf6a4ce,58cfa82de4b0c9d6adf6a4ce,2258e57e8e0af9db6969a1da86177ca7.filtered.,WGS,2258e57e8e0af9db6969a1da86177ca7,2258e57e8e0af9db6969a1da86177ca7.bam,62.02,2.08,0.08,58cfa82de4b0c9d6adf6a4ce,2258e57e8e0af9db6969a1da86177ca7.bam,...,Washington University School of Medicine,MSKCC,,NO,T3,N2,Infiltrating Ductal Carcinoma,Stage IIIA,A19H-09,True
13722.58cfa82de4b0c9d6adf6a48a,58cfa82de4b0c9d6adf6a48a,142ba22e796cab1075278cd533a287c8.filtered.,WGS,142ba22e796cab1075278cd533a287c8,142ba22e796cab1075278cd533a287c8.bam,93.54,2.18,0.07,58cfa82de4b0c9d6adf6a48a,142ba22e796cab1075278cd533a287c8.bam,...,Washington University School of Medicine,MSKCC,,NO,Not available,Not available,Serous endometrial adenocarcinoma,Not available,A066-09,True
13722.58cfa82de4b0c9d6adf6a4d4,58cfa82de4b0c9d6adf6a4d4,406aecbc23505359850e57fbf05d5b67.filtered.,WGS,406aecbc23505359850e57fbf05d5b67,406aecbc23505359850e57fbf05d5b67.bam,85.32,1.85,0.08,58cfa82de4b0c9d6adf6a4d4,406aecbc23505359850e57fbf05d5b67.bam,...,Washington University School of Medicine,MSKCC,,NO,Not available,Not available,Serous endometrial adenocarcinoma,Not available,A066-09,True


In [11]:
# Preview the first rows of the per-species count table
counts.head()

Unnamed: 0_level_0,Actinomyces_dentalis,Actinomyces_johnsonii,Actinomyces_massiliensis,Actinomyces_naeslundii,Actinomyces_oris,Actinomyces_viscosus,Mobiluncus_curtisii,Alloscardovia_omnicolens,Bifidobacterium_adolescentis,Bifidobacterium_animalis,...,Stereum_hirsutum,Tilletiopsis_washingtonensis,Malassezia_globosa,Malassezia_restricta,Malassezia_sympodialis,Rhodotorula_graminis,Trichosporon_asahii,Pseudozyma_hubeiensis,Sporisorium_graminicola,Wallemia_ichthyophaga
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13722.58cfa82de4b0c9d6adf6a4c2,0,4,4,6,10,1,0,0,0,0,...,0,0,13628,98,2,0,0,0,0,0
13722.58cfa82de4b0c9d6adf6a502,2,11,2,5,16,4,0,0,0,0,...,0,0,3003,121,0,0,0,0,0,0
13722.58cfa82de4b0c9d6adf6a4ce,0,7,4,1,7,1,1,0,0,0,...,2,0,9497,169,0,1,0,1,3,0
13722.58cfa82de4b0c9d6adf6a48a,0,0,2,2,14,0,2,0,0,0,...,5,0,14486,783,6,0,0,0,1,0
13722.58cfa82de4b0c9d6adf6a4d4,0,0,0,0,0,0,1,0,0,0,...,4,0,15702,85,0,0,0,0,2,0


In [19]:
# Preview the merged frame: metadata columns first, then the species counts
combined.head()

Unnamed: 0_level_0,sample_name,run_prefix,experimental_strategy,cgc_base_name,filename,analyte_amount,analyte_A260A280Ratio,aliquot_concentration,cgc_id,cgc_filename,...,Stereum_hirsutum,Tilletiopsis_washingtonensis,Malassezia_globosa,Malassezia_restricta,Malassezia_sympodialis,Rhodotorula_graminis,Trichosporon_asahii,Pseudozyma_hubeiensis,Sporisorium_graminicola,Wallemia_ichthyophaga
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13722.58cfa82de4b0c9d6adf6a4c2,58cfa82de4b0c9d6adf6a4c2,12dd02ea14d0a87df23ce3bef406fe27.filtered.,WGS,12dd02ea14d0a87df23ce3bef406fe27,12dd02ea14d0a87df23ce3bef406fe27.bam,32.75,,0.08,58cfa82de4b0c9d6adf6a4c2,12dd02ea14d0a87df23ce3bef406fe27.bam,...,0,0,13628,98,2,0,0,0,0,0
13722.58cfa82de4b0c9d6adf6a502,58cfa82de4b0c9d6adf6a502,3d9f475186150ea055fddf25af7bb7e3.filtered.,WGS,3d9f475186150ea055fddf25af7bb7e3,3d9f475186150ea055fddf25af7bb7e3.bam,64.35,1.7,0.08,58cfa82de4b0c9d6adf6a502,3d9f475186150ea055fddf25af7bb7e3.bam,...,0,0,3003,121,0,0,0,0,0,0
13722.58cfa82de4b0c9d6adf6a4ce,58cfa82de4b0c9d6adf6a4ce,2258e57e8e0af9db6969a1da86177ca7.filtered.,WGS,2258e57e8e0af9db6969a1da86177ca7,2258e57e8e0af9db6969a1da86177ca7.bam,62.02,2.08,0.08,58cfa82de4b0c9d6adf6a4ce,2258e57e8e0af9db6969a1da86177ca7.bam,...,2,0,9497,169,0,1,0,1,3,0
13722.58cfa82de4b0c9d6adf6a48a,58cfa82de4b0c9d6adf6a48a,142ba22e796cab1075278cd533a287c8.filtered.,WGS,142ba22e796cab1075278cd533a287c8,142ba22e796cab1075278cd533a287c8.bam,93.54,2.18,0.07,58cfa82de4b0c9d6adf6a48a,142ba22e796cab1075278cd533a287c8.bam,...,5,0,14486,783,6,0,0,0,1,0
13722.58cfa82de4b0c9d6adf6a4d4,58cfa82de4b0c9d6adf6a4d4,406aecbc23505359850e57fbf05d5b67.filtered.,WGS,406aecbc23505359850e57fbf05d5b67,406aecbc23505359850e57fbf05d5b67.bam,85.32,1.85,0.08,58cfa82de4b0c9d6adf6a4d4,406aecbc23505359850e57fbf05d5b67.bam,...,4,0,15702,85,0,0,0,0,2,0


In [12]:
# (rows, columns) of the metadata table
metadata.shape

(12773, 41)

In [13]:
# (rows, columns) of the count table
counts.shape

(12773, 297)

In [17]:
# Sanity check on the inner merge: no samples should be lost, and the column
# count should be the sum of the two inputs (41 + 297 = 338)
combined.shape

(12773, 338)

In [28]:
# Frequency of each distinct days_to_death value (value_counts drops NaN by default)
combined["days_to_death"].value_counts()

0.0       14
270.0     13
709.0     12
457.0     11
415.0     11
          ..
1149.0     1
1588.0     1
740.0      1
912.0      1
1714.0     1
Name: days_to_death, Length: 1123, dtype: int64

In [29]:
# Mean days_to_death over the samples that have a value (NaN is skipped by default)
combined["days_to_death"].mean()

940.4683794466404

In [27]:
# Note: about 76% of samples have no days_to_death value
combined["days_to_death"].isna().mean()

0.7623111250293588

In [30]:
# Number of samples with a recorded days_to_death: 3036 data points to work with
combined["days_to_death"].notna().sum()

3036

## Regression Model

In [35]:
# Keep only the samples whose regression target (days_to_death) is present
reg_data = combined.dropna(subset=["days_to_death"])

In [36]:
# Row count should equal the number of non-null days_to_death values found above
reg_data.shape

(3036, 338)

In [58]:
# Separate the features (everything that is not a metadata column) from the
# target (days_to_death), then hold out a random test fraction.
# NOTE(review): the split is per sample; if one patient (tcga_case_id) contributes
# several samples they could land on both sides of the split - confirm whether
# a grouped split is needed.

test_prop = 0.1
reg_X = reg_data.drop(metadata.columns, axis="columns")
reg_Y = reg_data.loc[:, "days_to_death"]
reg_Xtrain, reg_Xtest, reg_Ytrain, reg_Ytest = train_test_split(
    reg_X,
    reg_Y,
    test_size=test_prop,
    random_state=rand_state,
)

In [59]:
# Fit ordinary least squares on the training split and score the held-out split.
# LinearRegression is imported by name: a bare `import sklearn as sk` does not
# itself import the `linear_model` submodule, so `sk.linear_model` only resolves
# when something else happens to have loaded it.
reg_model = LinearRegression()
reg_model.fit(reg_Xtrain, reg_Ytrain)
preds = reg_model.predict(reg_Xtest)
# Mean squared error on the test split, in squared units of days_to_death
scores = mean_squared_error(reg_Ytest, preds)
scores

171944372.45369238

In [60]:
# Residuals (actual minus predicted days_to_death) for the first 20 test samples
(reg_Ytest - preds).head(20)

sampleid
13722.58cfa82de4b0c9d6adf6a6d6     -248.756802
13767.58cfa837e4b0c9d6adf6ff58     -827.699620
13722.58cfa831e4b0c9d6adf6c594     -775.038069
13722.58cfa830e4b0c9d6adf6c266     -347.895133
13722.58cfa831e4b0c9d6adf6c6b8       92.074162
13767.58cfa82ee4b0c9d6adf6b164     1915.791789
13722.58cfa830e4b0c9d6adf6bc60      173.691988
13767.58cfa835e4b0c9d6adf6ec28      900.010547
13722.58cfa82ee4b0c9d6adf6a81a    53049.783468
13767.58cfa834e4b0c9d6adf6e5fa     -668.973427
13722.58cfa830e4b0c9d6adf6c1b6    12958.662554
13722.58cfa831e4b0c9d6adf6c832      992.733619
13722.58cfa82de4b0c9d6adf6a62e     -328.794262
13722.58cfa831e4b0c9d6adf6cb69      491.609098
13767.58cfa83ce4b0c9d6adf72b54      544.839982
13767.58cfa82fe4b0c9d6adf6b241      311.044421
13767.58cfa82fe4b0c9d6adf6b856     -320.239251
13767.58cfa832e4b0c9d6adf6d5b6     -396.254901
13722.58cfa83ce4b0c9d6adf730d0     -917.741989
13722.58cfa83ce4b0c9d6adf72ee1      430.794992
Name: days_to_death, dtype: float64