In [None]:
# TODO
# Before editing preprocessing pipeline
# X Filter out rows with 'analyte_A260A280Ratio' containing '0'
# X Filter out rows with 'aliquot_concentration' containing '2.10'
# X Truncate 'pathologic_t_label', 'pathologic_n_label', 'pathologic_stage_label' to two digits (reduce stages)
# Preprocessing pipeline
# # Establish ordinal encoding for 'sample_type' WITH the given order
# # # If that doesn't work, just use ohe (one-hot encoding)
# # Establish ordinal encoding for 'tissue_source_site_label' and 'histological_diagnosis_label' (no given order)
# # # If that doesn't work, just use ohe (one-hot encoding)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
import train_model as tm
import visualize as vis
import build_features as bf
import make_dataset as md

In [None]:
# Random seed for the notebook; passed as `random_state` to train_test_split
# further down so the train/test partition is reproducible.
rand_state = 1

# Load files into memory

In [None]:
# Relative paths to the two .tsv inputs: the count table and the per-sample metadata.
# Both are read with md.read_fungi_data and joined on "sampleid" in the next cell.
counts_filename = 'data/count_data_species_raw_WIS_overlapping_fungi_bacteria_12773samples.tsv'
metadata_filename = 'data/metadata_species_WIS_overlapping_fungi_bacteria_12773samples.tsv'

In [None]:
# Read both tables through the project loader (`md` is the local make_dataset module).
counts = md.read_fungi_data(counts_filename)
# The literal placeholder 'Not available' is converted to a real missing value.
metadata = md.read_fungi_data(metadata_filename).replace('Not available', np.nan)

#TODO Merge this later?
# Inner join: only samples present in both tables survive.
combined = metadata.merge(counts, on="sampleid", how="inner")

# Report the shape of each frame as a quick sanity check on the join.
for label, frame in (('Metadata', metadata), ('Counts', counts), ('Combined', combined)):
    print(f'{label} Shape: {frame.shape}')

In [None]:
# Author's note from an earlier run: ~76% of samples don't have days_to_death (DTD).
# Format the fraction as a percentage directly. Rounding first and multiplying
# afterwards (round(x, 3) * 100) can print float noise, e.g.
# round(0.07, 3) * 100 == 7.000000000000001.
dtd_missing_share = combined["days_to_death"].isna().mean()
print(f"Days to Death - NA: {dtd_missing_share:.1%}")

# Author's note from an earlier run: 3036 data points to work with.
print("Rows remaining: " + str(combined["days_to_death"].notna().sum()))

In [None]:
# Rebuild `combined` from the raw tables so this cell gives the same result
# every time it is re-run (it overwrites `combined` below).
combined = pd.merge(metadata, counts, on="sampleid", how="inner")

# --- Row filters -----------------------------------------------------------
# A comparison against NaN is False, so the days_to_death filter drops rows
# with a missing target as well as the >= 10,000-day outliers.
dtd_ok = combined['days_to_death'] < 10_000                 # Drop NaN's & outliers

# The two quality filters drop strange values but explicitly keep NaN's.
a260_a280 = combined['analyte_A260A280Ratio']
concentration = combined['aliquot_concentration']
ratio_ok = (a260_a280 > 0) | a260_a280.isna()
concentration_ok = (concentration < 2) | concentration.isna()

# .copy() makes the filtered frame independent of the merged one, so the
# column assignments below do not raise SettingWithCopyWarning.
combined = combined[dtd_ok & ratio_ok & concentration_ok].copy()

# --- Label consolidation ---------------------------------------------------
# Collapse sub-stages onto their leading stage (e.g. 'T1a' -> 'T1'): keep the
# captured 'T0'..'T4' / 'N0'..'N3' prefix and discard the rest of the label.
# Labels without such a prefix are left untouched.
combined['pathologic_t_label'] = combined['pathologic_t_label'].replace(r'(T[0-4]).*', r'\1', regex=True)  # Consolidate T Labels
combined['pathologic_n_label'] = combined['pathologic_n_label'].replace(r'(N[0-3]).*', r'\1', regex=True)  # Consolidate N Labels

# Exact-match mapping of detailed stage labels onto the four main stages.
# NOTE(review): 'I or II NOS' is folded into 'Stage I' -- confirm this is intended.
stage_consolidation = {'Stage IA':    'Stage I',
                       'Stage IB':    'Stage I',
                       'Stage IS':    'Stage I',
                       'I or II NOS': 'Stage I',

                       'Stage IIA':   'Stage II',
                       'Stage IIB':   'Stage II',
                       'Stage IIC':   'Stage II',

                       'Stage IIIA':  'Stage III',
                       'Stage IIIB':  'Stage III',
                       'Stage IIIC':  'Stage III',

                       'Stage IVA':   'Stage IV',
                       'Stage IVB':   'Stage IV',
                       'Stage IVC':   'Stage IV',
                      }
combined['pathologic_stage_label'] = combined['pathologic_stage_label'].replace(stage_consolidation)

combined.shape

In [None]:
# Horizontal box plot of days_to_death, keeping only values below 10k
# (missing values are dropped before plotting).
dtd_below_10k = combined.loc[combined['days_to_death'] < 10_000, 'days_to_death'].dropna()
plt.boxplot(dtd_below_10k, vert=False)

## Searching for metadata features

In [None]:
# Histogram (100 bins) of aliquot concentration over the full, unfiltered metadata table.
aliquot_concentration = metadata['aliquot_concentration']
plt.hist(aliquot_concentration, bins=100)
plt.show()

In [None]:
# Candidate metadata columns and the encoding planned for each one.
# "ohe" = one-hot encoding; the NaN / unique-value counts are the author's notes.
#
# experimental_strategy          - ohe
# analyte_amount                 - numerical (normalize?)
# analyte_A260A280Ratio          - numerical (contains zero?)
# aliquot_concentration          - numerical (omit 2.10 value)
# gender                         - ohe (55 nan)
# race                           - ohe (1462 nan, keep)
# ethnicity                      - ohe (3096 nan, keep)
# disease_type                   - ohe
# sample_type                    - ordinal [['Primary Tumor', 'Recurrent Tumor', 'Additional - New Primary', 'Metastatic', 'Blood Derived Normal', 'Solid Tissue Normal']]
# primary_site                   - ohe
# age_at_diagnosis               - numerical
# reference_genome               - ohe
# data_submitting_center_label   - ohe
# investigation                  - ohe
# days_to_death                  - numerical
# tissue_source_site_label       - ordinal (avoid ohe, 179 unique vals)
# country_of_sample_procurement  - ohe
# pathologic_t_label             - ohe (reduce stages?)
# pathologic_n_label             - ohe (reduce stages?)
# histological_diagnosis_label   - ordinal (avoid ohe, 71 unique vals)
# pathologic_stage_label         - ohe (reduce stages)
# PlateCenter                    - numerical
# PlateCenterFlag                - ohe

# Columns of interest, with the regression target (days_to_death) listed last.
important_cols = [
    'experimental_strategy',
    'analyte_amount',
    'analyte_A260A280Ratio',
    'aliquot_concentration',
    'gender',
    'race',
    'ethnicity',
    'disease_type',
    'sample_type',
    'primary_site',
    'age_at_diagnosis',
    'reference_genome',
    'data_submitting_center_label',
    'investigation',
    'tissue_source_site_label',
    'country_of_sample_procurement',
    'pathologic_t_label',
    'pathologic_n_label',
    'histological_diagnosis_label',
    'pathologic_stage_label',
    'PlateCenter',
    'PlateCenterFlag',
    'days_to_death',
]

In [None]:
# Frequency of each stage label in the raw metadata (before any consolidation is applied).
metadata['pathologic_stage_label'].value_counts()

## Metadata Regression

In [None]:
# Column groups consumed by the column transformer in the next cell.

# Categorical columns to one-hot encode.
ohe_feats = [
    'experimental_strategy',
    'gender',
    'race',
    'ethnicity',
    'disease_type',
    'primary_site',
    'reference_genome',
    'data_submitting_center_label',
    'investigation',
    'country_of_sample_procurement',
    'pathologic_t_label',
    'pathologic_n_label',
    'pathologic_stage_label',
    'PlateCenterFlag',
]

# Numeric columns to standardise.
scaler_feats = [
    'analyte_amount',
    'analyte_A260A280Ratio',
    'aliquot_concentration',
]

# Numeric columns forwarded unchanged.
passthrough_feats = [
    'age_at_diagnosis',
    'PlateCenter',
]

# Intended ordinal order:
# [['Primary Tumor', 'Recurrent Tumor', 'Additional - New Primary', 'Metastatic', 'Blood Derived Normal', 'Solid Tissue Normal']]
sample_type_feat = ['sample_type']

# High-cardinality categoricals earmarked for ordinal encoding.
ordinal_feats = [
    'tissue_source_site_label',
    'histological_diagnosis_label',
]

# Columns dropped outright; 'days_to_death' is in this list as well.
drop_feats = [
    'sample_name',
    'run_prefix',
    'cgc_base_name',
    'filename',
    'cgc_id',
    'cgc_filename',
    'vital_status',
    'data_subtype',
    'tcga_sample_id',
    'cgc_case_uuid',
    'cgc_platform',
    'gdc_file_uuid',
    'cgc_sample_uuid',
    'cgc_aliquot_uuid',
    'tcga_aliquot_id',
    'tcga_case_id',
    'days_to_death',
    'knightlabID',
    'portion_is_ffpe',
]

In [None]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the legacy one so the cell runs on either side of that change.
try:
    dense_ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(sparse=False)

# Transformer order determines output column order: scaled, passthrough, one-hot.
# `ordinal_feats` are one-hot encoded together with `ohe_feats` for now.
# NOTE(review): `sample_type_feat` is not wired in; with make_column_transformer's
# default remainder it is presumably dropped -- confirm that is intended.
ct = make_column_transformer(
    (StandardScaler(), scaler_feats),
    ("passthrough", passthrough_feats),
    (dense_ohe, ohe_feats + ordinal_feats),
    ("drop", drop_feats),
)

In [None]:
# Fit the column transformer on the full metadata table (not the filtered
# `combined` frame) and transform it in one step.
transformed = ct.fit_transform(metadata)

In [None]:
# Names for the transformed columns, in the same order as the transformers were
# listed: scaled features, passthrough features, then the one-hot columns.
fitted_ohe = ct.named_transformers_["onehotencoder"]
encoded_inputs = ohe_feats + ordinal_feats

# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides. Passing
# the input column names yields readable '<column>_<category>' labels either way.
if hasattr(fitted_ohe, "get_feature_names_out"):
    ohe_column_names = fitted_ohe.get_feature_names_out(encoded_inputs)
else:
    ohe_column_names = fitted_ohe.get_feature_names(encoded_inputs)

column_names = (
    scaler_feats
    + passthrough_feats
    + list(ohe_column_names)
)

In [None]:
# Show the transformed matrix as a labelled DataFrame (display only; the result is not stored).
pd.DataFrame(transformed, columns=column_names)

## Regression Model

In [None]:
# Regression needs a target value, so keep only rows where days_to_death is present.
reg_data = combined.dropna(subset=["days_to_death"])

In [None]:
# (rows, columns) of the frame used for regression.
reg_data.shape

In [None]:
# Separate features (X) and target (Y), then generate the train/test partition.
# Every metadata column is removed from X, so only the columns that came from
# `counts` remain as predictors; the target is days_to_death.
test_prop = 0.1  # fraction of rows held out for testing

reg_Y = reg_data["days_to_death"]
reg_X = reg_data.drop(columns=metadata.columns)

reg_Xtrain, reg_Xtest, reg_Ytrain, reg_Ytest = train_test_split(
    reg_X,
    reg_Y,
    test_size=test_prop,
    random_state=rand_state,
)

In [None]:
# Baseline: ordinary least squares on the training split.
reg_model = LinearRegression()
reg_model.fit(reg_Xtrain, reg_Ytrain)

# days_to_death cannot be negative, so floor the predictions at zero. np.clip
# does this in one vectorized call and keeps `preds` a NumPy array instead of
# converting it to a Python list of mixed int/float values.
preds = np.clip(reg_model.predict(reg_Xtest), 0, None)  # Replace negative predictions

# Mean squared error on the held-out split (units: days squared).
scores = mean_squared_error(reg_Ytest, preds)
scores

In [None]:
# Residuals (actual minus predicted) for the first few held-out samples.
residuals = reg_Ytest - preds
residuals.head()