In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
import train_model as tm
import visualize as vis
import build_features as bf
import make_dataset as md
import data_cleaning as dc
import preprocessing as pp

In [None]:
# Seed passed to stochastic steps (e.g. the train/test split) for reproducibility
rand_state: int = 1

# Load files into memory

In [None]:
# Input files, given relative to the notebook's working directory
counts_filename = "data/count_data_species_raw_WIS_overlapping_fungi_bacteria_12773samples.tsv"
metadata_filename = "data/metadata_species_WIS_overlapping_fungi_bacteria_12773samples.tsv"

In [None]:
# Read both inputs with the project loader; the metadata marks missing values
# with the literal string 'Not available', which is converted to NaN here.
counts = md.read_fungi_data(counts_filename)
metadata = md.read_fungi_data(metadata_filename).replace('Not available', np.nan)

# TODO: the same merge is repeated in the cleaning cell further down --
# decide whether it should live here or only there.
combined = pd.merge(left=metadata, right=counts, how="inner", on="sampleid")

# Quick sanity check of the table sizes
print(f"Metadata Shape:\t{metadata.shape}")
print(f"Counts Shape:\t{counts.shape}")
print(f"Combined Shape:\t{combined.shape}")

In [None]:
# NOTE: exploratory checks of `days_to_death` (DTD) missingness, currently disabled.

# # Note: 76% of samples don't have a days_to_death (DTD) value
# print("Days to Death - NA: " + str(round(combined["days_to_death"].isna().mean(), 3) * 100) + '%')

# # 3036 data points remain to work with
# print("Rows remaining: " + str(combined["days_to_death"].notna().sum()))

# plt.boxplot(combined['days_to_death'], vert=False)
# plt.show()

In [None]:
# Start again from a fresh inner join of metadata and counts, then apply the
# project's metadata filter (see data_cleaning.filter_metadata).
combined = dc.filter_metadata(
    pd.merge(left=metadata, right=counts, how="inner", on="sampleid")
)

# Run dc.reduce_stages over each pathologic staging column
# (see data_cleaning.reduce_stages for the mapping it applies).
for stage_col in ("pathologic_t_label", "pathologic_n_label", "pathologic_stage_label"):
    combined[stage_col] = dc.reduce_stages(combined[stage_col])

combined.shape

## Metadata Regression

In [None]:
# TODO Create OrdinalEncoder for ordinal features in preprocessing.py

In [None]:
# Preview the first rows produced by pp.preprocess_metadata on the raw metadata
metadata_preprocessed = pp.preprocess_metadata(metadata)
metadata_preprocessed.head()

## Regression Model

In [None]:
# Keep only the samples with a recorded days_to_death, the regression target
has_days_to_death = combined["days_to_death"].notna()
reg_data = combined.loc[has_days_to_death]

In [None]:
# (rows, columns) of the subset that has a days_to_death value
reg_data.shape

In [None]:
# Separate the features (X) from the target (Y) and generate the train/test split

test_prop = 0.1  # fraction of samples held out for testing
reg_Y = reg_data["days_to_death"]
# Dropping every metadata column leaves only the columns contributed by `counts`
reg_X = reg_data.drop(columns=metadata.columns)
reg_Xtrain, reg_Xtest, reg_Ytrain, reg_Ytest = train_test_split(
    reg_X,
    reg_Y,
    test_size=test_prop,
    random_state=rand_state,
)

In [None]:
# Fit an ordinary least-squares baseline on the training split
reg_model = LinearRegression()
reg_model.fit(reg_Xtrain, reg_Ytrain)

# Replace negative predictions with 0 (a negative days_to_death is presumably
# not meaningful). np.clip does this in one vectorised call and keeps `preds`
# a NumPy array rather than converting it to a Python list.
preds = np.clip(reg_model.predict(reg_Xtest), 0, None)

# Mean squared error of the clipped predictions on the held-out test split
scores = mean_squared_error(reg_Ytest, preds)
scores

In [None]:
# First few test-set residuals (actual minus predicted days_to_death)
residuals = reg_Ytest - preds
residuals.head()