In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training table and preview its first rows.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/widsdatathon2024-challenge2/train.csv',
)
train.head()

In [None]:
# Read the competition test table and preview its first rows.
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/widsdatathon2024-challenge2/test.csv',
)
test.head()

# Testing Single Marker Predictors

In [None]:
def single_marker(df, marker, title, target='metastatic_diagnosis_period'):
    """Draw a bar chart of the mean of ``target`` for every level of ``marker``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``marker`` and ``target`` columns.
    marker : str
        Column to group by (plain categories or ``pd.cut`` bins).
    title : str
        Title placed above the chart.
    target : str, default 'metastatic_diagnosis_period'
        Numeric column whose per-group mean is plotted.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``; nothing is returned.
    """
    # Average only the target column. Calling .mean() on the whole grouped
    # frame raises on pandas >= 2.0 as soon as ``df`` carries any other
    # non-numeric column. observed=False is spelled out so categorical
    # markers keep their empty bins and no deprecation warning is emitted.
    grouped = df.groupby(marker, as_index=False, observed=False)[target].mean()

    fig, ax = plt.subplots()
    sns.barplot(x=grouped[marker], y=grouped[target], ax=ax)
    ax.tick_params(axis='x', labelrotation=90, labelsize=7)
    ax.set_ylabel('Average Per Marker Bin')
    ax.set_title(title)
    plt.show()

In [None]:
# Mean diagnosis period for each recorded patient race.
race_columns = ['patient_race', 'metastatic_diagnosis_period']
race = train.loc[:, race_columns]
single_marker(df=race, marker='patient_race', title='Average MDP by Patient')

In [None]:
# Mean diagnosis period for each insurance payer type.
payer_columns = ['payer_type', 'metastatic_diagnosis_period']
payer = train.loc[:, payer_columns]
single_marker(df=payer, marker='payer_type', title='Average MDP by Patient')

In [None]:
# Build the binned column with .assign() on the selected columns: this yields
# an independent frame, whereas assigning a new column onto a slice of
# `train` triggers pandas' SettingWithCopyWarning.
age = train[['patient_age', 'metastatic_diagnosis_period']].assign(
    age_bin=lambda frame: pd.cut(frame['patient_age'], 10)  # 10 equal-width age bins
)
single_marker(age, 'age_bin', 'Average MDP by Patient')

In [None]:
# Build the binned column with .assign() on the selected columns: this yields
# an independent frame, whereas assigning a new column onto a slice of
# `train` triggers pandas' SettingWithCopyWarning.
bmi = train[['bmi', 'metastatic_diagnosis_period']].assign(
    bmi_bin=lambda frame: pd.cut(frame['bmi'], 5)  # 5 equal-width BMI bins
)
single_marker(bmi, 'bmi_bin', 'Average MDP by Patient')

In [None]:
# single_marker renders the chart itself and returns None, so its result is
# not captured or echoed (doing so only stored and displayed None).
state = train[['patient_state', 'metastatic_diagnosis_period']]
single_marker(state, 'patient_state', 'Average MDP by Patient')

# Feature Engineering

In [None]:
# Show every column name so the feature groups can be inspected.
print(train.columns.tolist())

In [None]:
# Label-valued columns that filter_df replaces with integer category codes.
_CATEGORICAL_COLS = (
    'patient_race', 'payer_type', 'patient_state', 'Region', 'Division',
    'patient_gender', 'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
)

# Sentinel written into missing BMI values. BMI is a positive quantity, so -1
# is distinguishable from every real measurement while keeping the column
# numeric.
_MISSING_BMI = -1


def _clean_demographics(df):
    """Drop unused columns/rows and fill missing values; no encoding yet."""
    climate_cols = list(df.filter(like='Average of').columns)
    df_demo = df.drop(columns=climate_cols)

    df_demo = df_demo.dropna(subset=['family_size'])

    fill_values = {'patient_race': 'Unknown', 'payer_type': 'Other', 'bmi': _MISSING_BMI}
    df_demo = df_demo.fillna(value=fill_values)
    df_demo = df_demo.drop(
        ['metastatic_first_novel_treatment',
         'metastatic_first_novel_treatment_type',
         'breast_cancer_diagnosis_desc'],
        axis=1,
    )
    return df_demo.set_index('patient_id')


def filter_df(df, reference=None):
    """Turn a raw patient table into a numeric feature frame.

    Steps, in order:

    * columns whose name contains ``'Average of'`` are removed;
    * rows with a missing ``family_size`` are dropped, so the result can have
      fewer rows than ``df``;
    * missing ``patient_race`` / ``payer_type`` become ``'Unknown'`` /
      ``'Other'``; missing ``bmi`` becomes ``-1`` and the column stays numeric;
    * the two ``metastatic_first_novel_treatment*`` columns and
      ``breast_cancer_diagnosis_desc`` are removed;
    * ``patient_id`` becomes the index;
    * the label-valued columns are replaced by integer category codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table to transform. It is not modified.
    reference : pandas.DataFrame, optional
        Raw table whose (cleaned) category levels define the integer codes.
        Pass the training table when transforming another table so that the
        same label maps to the same code in both. When omitted, the levels
        are taken from ``df`` itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Feature frame indexed by ``patient_id``. A label that is missing, or
        that does not occur in the reference levels, is encoded as ``-1``.
    """
    df_demo = _clean_demographics(df)
    # Clean the reference exactly like the data so its levels are the ones a
    # plain filter_df(reference) call would have produced.
    levels_source = df_demo if reference is None else _clean_demographics(reference)

    for col in _CATEGORICAL_COLS:
        levels = levels_source[col].astype('category').cat.categories
        fixed_dtype = pd.CategoricalDtype(categories=levels)
        df_demo[col] = df_demo[col].astype(fixed_dtype).cat.codes

    return df_demo

In [None]:
# Run the training table through the feature pipeline and preview the result.
train_e = train.pipe(filter_df)
train_e.head()

# Random Forest

I chose a random forest for this task because the problem is supervised (there is a target variable to predict) and the model is quick to train.

In [None]:
# Separate the regression target from the predictor columns.
target_col = 'metastatic_diagnosis_period'
y = train_e[target_col]
x = train_e.drop(columns=[target_col])

In [None]:
# Hyper-parameters collected in one place: a small 10-tree forest with a fixed
# seed, with out-of-bag scoring switched on.
forest_params = {'n_estimators': 10, 'random_state': 0, 'oob_score': True}
rf = RandomForestRegressor(**forest_params)
rf.fit(x, y)

In [None]:
# Scoring the forest on the very rows it was fitted on mostly measures
# memorisation. Instead, fit a twin with identical hyper-parameters on 80% of
# the rows and score it on the held-out 20%. `rf` itself is left untouched and
# stays fitted on all rows.
x_fit, x_val, y_fit, y_val = train_test_split(x, y, test_size=0.2, random_state=0)
rf_val = RandomForestRegressor(**rf.get_params()).set_params(oob_score=False)
rf_val.fit(x_fit, y_fit)

predictions = rf_val.predict(x_val)
mse = mean_squared_error(y_val, predictions)
print(f'Hold-out Mean Squared Error: {mse}')

r2 = r2_score(y_val, predictions)
print(f'Hold-out R-squared: {r2}')

## Conclusions

In [None]:
def plot_importance(model, X, threshold=.02):
    """Bar-plot the features whose importance exceeds ``threshold``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_`` with one value per column of ``X``.
    X : pandas.DataFrame
        Frame the model was fitted on; only its column names are used.
    threshold : float, default 0.02
        Features with an importance at or below this value are left out.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``. When no feature exceeds the
        threshold a message is printed and no chart is drawn.
    """
    importances = pd.Series(model.feature_importances_, index=X.columns, name='importance')
    feature_importances = (
        importances[importances > threshold]
        .sort_values(ascending=False)
        .rename_axis('feature')
        .reset_index()
    )

    # Nothing to draw: bail out instead of failing on an empty bar container.
    if feature_importances.empty:
        print(f'No feature has importance above {threshold}')
        return

    fig, ax = plt.subplots()
    sns.barplot(data=feature_importances, x='feature', y='importance', ax=ax)
    plt.xticks(rotation=30, fontsize=6, horizontalalignment='right')
    # Label every bar, rounded so the numbers stay readable.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.3f')
    ax.set_title(f'Feature importances above {threshold}')
    plt.show()

plot_importance(rf, x)

The type of breast cancer that a patient is initially diagnosed with seems to have by far the highest impact, followed by their age.

## Output

In [None]:
# Run the test table through the same feature pipeline used for training.
# NOTE(review): filter_df drops rows without family_size and, as called here,
# derives its integer category codes from the frame it is given. Verify that
# the row count and the codes line up with train_e before trusting predictions.
test_ = test.pipe(filter_df)
test_.head()

In [None]:
# Predict for the rows that survived filter_df, feeding the columns in the
# same order the forest was fitted on.
scored = pd.DataFrame({
    'patient_id': test_.index,
    'metastatic_diagnosis_period': rf.predict(test_[x.columns])
})

# filter_df drops rows without family_size, but a submission needs one row per
# test patient. Join the predictions back onto the full id list and give any
# unscored patient the training mean (the best constant guess under squared
# error). When no row was dropped this is identical to the plain predictions.
output_df = test[['patient_id']].merge(scored, on='patient_id', how='left')
output_df['metastatic_diagnosis_period'] = (
    output_df['metastatic_diagnosis_period'].fillna(y.mean())
)

output_df.to_csv('predictions.csv', index=False)
print('predictions saved')