In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
import train_model as tm
import visualize as vis
import build_features as bf
import make_dataset as md

In [None]:
# Fixed seed so the train/test split is reproducible across runs
rand_state: int = 1

# Load files into memory

In [None]:
# Input tables (TSV): species count data and the matching sample metadata
counts_filename = (
    'data/'
    'count_data_species_raw_WIS_overlapping_fungi_bacteria_12773samples.tsv'
)
metadata_filename = (
    'data/'
    'metadata_species_WIS_overlapping_fungi_bacteria_12773samples.tsv'
)

In [None]:
# Read both tables with the project loader (counts first, then metadata)
counts, metadata = (
    md.read_fungi_data(path) for path in (counts_filename, metadata_filename)
)

In [None]:
# Inner join on sampleid: only samples present in both tables are kept.
# TODO: consider selecting metadata columns before combining
combined = metadata.merge(counts, how="inner", on="sampleid")

In [None]:
# List every column name in the metadata table
metadata.columns

In [None]:
# Preview the first rows of the metadata table
metadata.head()

In [None]:
# Preview the first rows of the counts table
counts.head()

In [None]:
# Preview the first rows of the merged (metadata + counts) table
combined.head()

In [None]:
# (rows, columns) of the metadata table
metadata.shape

In [None]:
# (rows, columns) of the counts table
counts.shape

In [None]:
# (rows, columns) of the merged table; compare with the two inputs to see what the inner join kept
combined.shape

In [None]:
# How often each distinct days_to_death value occurs (value_counts leaves NaN out by default)
combined["days_to_death"].value_counts()

In [None]:
# Mean days_to_death over the samples that have a value (NaN is skipped by default)
combined["days_to_death"].mean()

In [None]:
# Fraction of samples with no days_to_death (DTD) value -- noted as about 76%
combined["days_to_death"].isnull().mean()

In [None]:
# Number of samples that do have days_to_death -- noted as 3036 data points to work with
combined["days_to_death"].count()

In [None]:
# Box plot of days_to_death restricted to values below 10,000 days.
# The `< 10_000` comparison is False for NaN, so missing values are filtered
# out by the same mask and no separate dropna() is needed.
dtd_under_10k = combined.loc[combined['days_to_death'] < 10_000, 'days_to_death']

fig, ax = plt.subplots()
ax.boxplot(dtd_under_10k, vert=False)
# Label the figure so it can be read on its own; the trailing `;` hides the repr.
ax.set(title='days_to_death (values below 10,000 only)', xlabel='days_to_death');

## Searching for metadata features

In [None]:
# Number of metadata columns (second entry of the shape tuple)
metadata.shape[1]

In [None]:
# Metadata columns at positions 1-10. The slice end is exclusive, so 1:11 is
# needed to include column 10 (with 1:10 it was never displayed).
metadata[metadata.columns[1:11]].head()
# important columns
# [experimental_strategy]
# unclear columns (TODO: look up what these mean)
# [analyte_amount, analyte_A260A280Ratio, aliquot_concentration]

In [None]:
# Metadata columns at positions 11-20. The slice end is exclusive, so 11:21 is
# needed to include column 20 (with 11:20 it was never displayed).
metadata[metadata.columns[11:21]].head()
# important columns
# [gender, race, ethnicity, disease_type, sample_type]
# unclear columns (TODO: look up what these mean)
# [data_subtype, cgc_platform]

In [None]:
# Metadata columns at positions 21-30. The slice end is exclusive, so 21:31 is
# needed to include column 30 (with 21:30 it was never displayed).
metadata[metadata.columns[21:31]].head()
# important columns
# [primary_site, days_to_death]
# unclear columns (TODO: look up what these mean)
# [reference_genome, investigation]

In [None]:
# Metadata columns at positions 31-40 (slice end 41 is exclusive)
cols_31_to_40 = metadata.columns[31:41]
metadata[cols_31_to_40].head()
# important columns
# [data_submitting_center_label, tissue_source_site_label, country_of_sample_procurement, pathologic_stage_label]
# unclear columns (TODO: look up what these mean)
# [portion_is_ffpe, pathologic_t_label, pathologic_n_label, histological_diagnosis_label, PlateCenter, PlateCenterFlag]

## Metadata Regression

In [None]:
# Metadata columns shortlisted during the column review
important_cols = [
    'experimental_strategy',
    'gender',
    'race',
    'ethnicity',
    'disease_type',
    'sample_type',
    'primary_site',
    'days_to_death',
    'data_submitting_center_label',
    'tissue_source_site_label',
    'country_of_sample_procurement',
    'pathologic_stage_label',
]

In [None]:
# Category frequencies for the pathologic stage label
metadata.loc[:, 'pathologic_stage_label'].value_counts()
# Planned encoding per column (ohe = one-hot encoding):
# experimental_strategy          - ohe
# gender                         - ohe
# race                           - ohe
# ethnicity                      - ohe
# disease_type                   - ohe
# sample_type                    - ohe (possibly assign numerical values? are they ordinal?)
# primary_site                   - ohe
# data_submitting_center_label   - ohe
# tissue_source_site_label       - avoid ohe (179 unique values)
# country_of_sample_procurement  - ohe
# pathologic_stage_label         - ohe (reduce the number of stages first)
#
# Target:
# days_to_death                  - numerical

## Regression Model

In [None]:
# Keep only the samples that have a days_to_death target
reg_data = combined.dropna(subset=["days_to_death"])

In [None]:
# (rows, columns) left after removing samples without days_to_death
reg_data.shape

In [None]:
# Separate features (X) from the target (Y) and generate the train/test split.
# X is every column of reg_data that did not come from the metadata table.
test_prop = 0.1
reg_Y = reg_data["days_to_death"]
reg_X = reg_data.drop(metadata.columns, axis=1)
reg_Xtrain, reg_Xtest, reg_Ytrain, reg_Ytest = train_test_split(
    reg_X,
    reg_Y,
    test_size=test_prop,
    random_state=rand_state,
)

In [None]:
# Fit a linear regression on the training split and score it on the held-out
# split with mean squared error.
# LinearRegression is imported explicitly: a bare `import sklearn` does not
# guarantee that the `linear_model` submodule is loaded.
reg_model = LinearRegression()
reg_model.fit(reg_Xtrain, reg_Ytrain)

# days_to_death cannot be negative, so floor the predictions at 0.
preds = np.clip(reg_model.predict(reg_Xtest), 0, None)

scores = mean_squared_error(reg_Ytest, preds)
scores

In [None]:
# Per-sample error (actual minus predicted) for the first few test samples
residuals = reg_Ytest - preds
residuals.head()