# Imports

In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.manifold import Isomap
import pickle
import time

In [3]:
# Load the dataset
df = pd.read_csv('csv/train_data.csv')

# Unneeded
df = df.drop(columns=['Unnamed: 0'])
df = df.drop_duplicates()

# Option 1: Drop rows with missing values
df = df.dropna()

# Option 2: Fill missing values (maybe use sklearn.impute)
# NOT YET IMPLEMENTED   

df.head()

Unnamed: 0,Age,Gender,Stage,GeneticRisk,TreatmentType,ComorbidityIndex,TreatmentResponse,SurvivalTime,Censored
1,63.0,1,3,0.0,0,1.0,0.0,4.7,0
3,56.0,0,3,1.0,0,1.0,0.0,2.9,0
4,67.0,1,3,1.0,0,3.0,1.0,3.0,0
7,68.0,1,2,0.0,0,2.0,1.0,3.1,0
8,68.0,0,3,0.0,1,1.0,0.0,4.8,1


In [None]:
# Pandas profiler
profile = ProfileReport(df)
profile.to_notebook_iframe()

In [4]:
X = df.drop(columns=['SurvivalTime', 'Censored'])

y = df['SurvivalTime']

censored = df['Censored']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)

(147, 7)


In [61]:
from sklearn.preprocessing import OrdinalEncoder

numerical_features = ['Age']
categorical_features = ['Gender', 'Stage', 'GeneticRisk', 'TreatmentType', 'ComorbidityIndex', 'TreatmentResponse']

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('isomap', Isomap(n_components=2)),
                        ('model', RandomForestRegressor())])


# Train your model on the training data
model.fit(X_train, y_train)

In [62]:
def cMSE(y_hat, y, c):
  err = y-y_hat
  err = (1-c)*err**2 + c*np.maximum(0,err)**2
  return np.sum(err)/err.shape[0]

y_pred = model.predict(X_test)

print("cMSE: %.4f" % cMSE(y_pred, y_test, censored))

cMSE: 0.4243


# Save Model

In [66]:
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Submission Prediction

In [63]:
final_test_dataset = pd.read_csv('csv/test_data.csv').drop(columns=['id'])
final_test_dataset.head()
submission_pred = model.predict(final_test_dataset)

In [65]:
submission_pred_df = pd.DataFrame(submission_pred, columns=['TARGET'])
submission_pred_df.insert(0, 'id', range(0, len(submission_pred_df)))
submission_pred_df.to_csv('submission_pred.csv', index=False)