In [119]:
#importing everything we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error,r2_score,plot_confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [120]:
# Name of the label column; used to select the target by name instead of by position.
TARGET = "Patient's Vital Status"

# Load the dataset. read_csv already returns a DataFrame, so no extra wrapping is needed.
raw_df = pd.read_csv("/kaggle/input/cancerprediction/EXAM_8/data.csv")

# Keep only the rows that have a y label and drop the patient id, which is useless as a feature.
# Written as one chained expression so the filtered frame is never mutated in place.
df = raw_df.loc[raw_df[TARGET].notna()].drop(columns=["Patient ID"])

#selecting categorical data
cat_data = [
    'Type of Breast Surgery', 'Overall Survival Status', 'Cancer Type', 'Cancer Type Detailed',
    'Cellularity', 'Chemotherapy', 'Pam50 + Claudin-low subtype', 'ER status measured by IHC',
    'ER Status', 'HER2 status measured by SNP6', 'HER2 Status', 'Tumor Other Histologic Subtype',
    'Hormone Therapy', 'Inferred Menopausal State', 'Integrative Cluster',
    'Primary Tumor Laterality', 'PR Status', 'Oncotree Code', 'Radio Therapy',
    'Relapse Free Status', 'Sex', '3-Gene classifier subtype', 'Cohort', 'Tumor Stage',
    'Neoplasm Histologic Grade',
]

# Everything that is neither categorical nor the target: the columns whose
# distributions are inspected and scaled further down.
dist = df.drop(columns=cat_data + [TARGET])

#one-hot encoding categorical data
# The target is removed by name (not assumed to be the last column) and appended again
# as the final column, so later cells can keep slicing it off with iloc[:, -1].
# get_dummies puts the untouched columns first and the dummy columns after them.
# With the default dummy_na=False a missing category becomes an all-zero dummy row,
# so missing categorical values are not visible to any later imputation step.
# NOTE(review): 'Overall Survival Status' and 'Relapse Free Status' look outcome-derived
# and may leak the target into the features - confirm before trusting the scores.
df = pd.concat(
    (
        pd.get_dummies(df.drop(columns=[TARGET]), drop_first=True, columns=cat_data),
        df[TARGET],
    ),
    axis=1,
)


In [121]:
#label encoding of y data
# Replace the string labels of the target column with integer class codes.
le = LabelEncoder()
vital_status = df["Patient's Vital Status"].values
df["Patient's Vital Status"] = le.fit_transform(vital_status)

# Lookup table class name -> integer code, printed so later numeric outputs can be read back.
class_codes = le.transform(le.classes_)
mapping = {label: code for label, code in zip(le.classes_, class_codes)}
print(mapping)

In [122]:
#checking continuous data's distribution type
# One histogram per column of `dist`. The grid size is derived from the number of
# columns instead of assuming exactly seven, so the cell neither raises IndexError
# when there are fewer columns nor silently skips columns when there are more.
hist_columns = list(dist.columns)
n_grid_cols = 2
n_grid_rows = max(1, (len(hist_columns) + n_grid_cols - 1) // n_grid_cols)  # ceil division

# squeeze=False keeps `a` two-dimensional even for a single row, so ravel() always works.
f, a = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 5 * n_grid_rows), squeeze=False)
a = a.ravel()

for ax, column in zip(a, hist_columns):
    ax.hist(df[column].dropna())  # missing values carry no information for the histogram
    ax.set_title(column)

# Hide the grid cells that have no column to show (e.g. the 8th cell for 7 columns).
for ax in a[len(hist_columns):]:
    ax.set_visible(False)

In [123]:
#standardisation and normalisation

# Positions of the continuous columns inside df (they come first, before the dummy columns).
# Named here once instead of being scattered as magic slices.
STANDARD_POSITIONS = [0, 3]        # per the author's note: age and Nottingham prognostic index
MINMAX_POSITIONS = [1, 2, 4, 5, 6]  # the remaining continuous columns

standard_cols = df.columns[STANDARD_POSITIONS]
minmax_cols = df.columns[MINMAX_POSITIONS]

# Both scalers compute their statistics column by column, so fitting one scaler on a
# group of columns gives the same values as fitting a fresh scaler on each slice.
# Assigning through df[cols] replaces the columns instead of writing into the
# existing ones in place, so the result does not depend on their previous dtype.
minmax_scaler = MinMaxScaler()
df[minmax_cols] = minmax_scaler.fit_transform(df[minmax_cols])

# Because the distributions of age and Nottingham prognostic index look Gaussian we use
# standardisation for them; the other continuous columns use normalisation (MinMaxScaler).
scaler = StandardScaler()
df[standard_cols] = scaler.fit_transform(df[standard_cols])

# NOTE(review): both scalers are fitted on all rows, i.e. before the train/test split,
# so test-set statistics influence the preprocessing. Moving the scaling into the
# modelling pipeline would remove that leak.

df.head()


In [124]:
# Feature matrix = every column except the last one; label vector = the last column.
feature_frame, label_series = df.iloc[:, :-1], df.iloc[:, -1]
X = feature_frame.values
y = label_series.values

In [125]:
# Hold out 30% of the rows for testing.
# A fixed random_state makes the split, and every score computed from it, reproducible
# after a kernel restart; stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [126]:
# Because NaN can negatively impact our model performance we use a KNN imputer to
# "repair" the data and then feed the result into the random forest classifier.
# The imputer is a pipeline step, so it is fitted only on the data passed to pipeline.fit.
# random_state pins the forest's randomness so repeated runs give the same model.
pipeline = Pipeline(steps=[
    ('i', KNNImputer(n_neighbors=21)),
    ('m', RandomForestClassifier(random_state=42)),
])

In [127]:
# Fit every pipeline step on the training portion only.
pipeline.fit(X=X_train, y=y_train)

In [128]:
res = pipeline.predict(X_test)
# The labels are nominal class codes produced by LabelEncoder, so the numeric distance
# between two codes (which mean_absolute_error would average) has no meaning.
# Report the share of correctly classified test rows instead.
accuracy = np.mean(res == y_test)
print(f"accuracy: {accuracy:.4f}")

In [129]:
from sklearn.metrics import classification_report

y_pred = pipeline.predict(X_test)

# Take the class names from the fitted encoder so they always match the integer codes,
# instead of repeating them by hand in the order the encoder happened to produce.
class_names = [str(name) for name in le.classes_]
# Passing the full label list keeps names and codes aligned even if a class
# does not occur in the test split.
class_labels = list(range(len(class_names)))

print("\n" + classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement once the environment is upgraded.
plot_confusion_matrix(pipeline, X_test, y_test, labels=class_labels, display_labels=class_names)