In [1]:
import pandas as pd

# --- Configuration -----------------------------------------------------------
DATA = '/kaggle/input/pancreatic-cancer-prediction-dataset/pancreatic_cancer_prediction_sample.csv'
RANDOM_STATE = 2025  # shared seed for sampling, t-SNE and the train/test split
SAMPLE_SIZE = 5000   # rows kept for the analysis (presumably to keep t-SNE fast -- confirm)

# --- Load --------------------------------------------------------------------
raw_df = pd.read_csv(filepath_or_buffer=DATA)

# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling
# is without replacement), so cap the request at the population size.
df = raw_df.sample(n=min(SAMPLE_SIZE, len(raw_df)), random_state=RANDOM_STATE)
df.head()

Unnamed: 0,Country,Age,Gender,Smoking_History,Obesity,Diabetes,Chronic_Pancreatitis,Family_History,Hereditary_Condition,Jaundice,...,Stage_at_Diagnosis,Survival_Time_Months,Treatment_Type,Survival_Status,Alcohol_Consumption,Physical_Activity_Level,Diet_Processed_Food,Access_to_Healthcare,Urban_vs_Rural,Economic_Status
47390,United Kingdom,85,Female,0,0,1,0,0,0,0,...,Stage III,16,Chemotherapy,1,0,Low,Low,Low,Urban,High
38566,India,77,Male,0,0,0,0,0,0,0,...,Stage I,52,Radiation,0,1,Medium,High,Low,Rural,Low
32814,China,87,Male,1,0,0,0,0,0,0,...,Stage I,52,Chemotherapy,0,1,Medium,Medium,Medium,Urban,Middle
41393,China,70,Female,0,0,0,0,0,0,0,...,Stage III,15,Radiation,0,0,Low,Medium,Medium,Rural,Low
12564,Canada,74,Male,0,1,1,0,0,0,0,...,Stage IV,6,Surgery,0,0,Low,High,Medium,Urban,Low


In [2]:
# Summarise the sampled frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 47390 to 33890
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Country                        5000 non-null   object
 1   Age                            5000 non-null   int64 
 2   Gender                         5000 non-null   object
 3   Smoking_History                5000 non-null   int64 
 4   Obesity                        5000 non-null   int64 
 5   Diabetes                       5000 non-null   int64 
 6   Chronic_Pancreatitis           5000 non-null   int64 
 7   Family_History                 5000 non-null   int64 
 8   Hereditary_Condition           5000 non-null   int64 
 9   Jaundice                       5000 non-null   int64 
 10  Abdominal_Discomfort           5000 non-null   int64 
 11  Back_Pain                      5000 non-null   int64 
 12  Weight_Loss                    5000 non-null   int64 
 13  Dev

In [3]:
# Name of the label column; defined first so the feature list can refer to it
# instead of repeating the string literal.
TARGET = 'Survival_Status'

# Feature columns: every numeric column except the label, in frame order.
# select_dtypes(include='number') is used rather than comparing str(dtype) with
# 'object', because text columns are not guaranteed to have the 'object' dtype
# (pandas also has dedicated string dtypes) and would otherwise leak into the
# numeric-only t-SNE / KNN inputs.
COLUMNS = [column for column in df.select_dtypes(include='number').columns if column != TARGET]

In [4]:
from sklearn.manifold import TSNE

# Embed the numeric feature columns with t-SNE so the classes can be eyeballed
# in a scatter plot.
# NOTE(review): the features are passed unscaled (e.g. Age in years next to 0/1
# flags); consider standardising them first -- confirm whether that is intended.
reducer = TSNE(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(df[COLUMNS])

plot_df = pd.DataFrame(embedding, columns=['x', 'y'])
# df keeps the shuffled index from sampling while plot_df has a fresh 0..n-1
# index, so the labels are attached by position via a plain list.
plot_df[TARGET] = df[TARGET].tolist()

In [5]:
from plotly import express
from plotly.offline import init_notebook_mode

# Initialise plotly's notebook mode before rendering any figure.
init_notebook_mode(connected=True)

# Scatter plot of the embedding, coloured by the target column.
figure = express.scatter(plot_df, x='x', y='y', color=TARGET)
figure.show(renderer='iframe_connected')

In the t-SNE projection, the positives look like they're scattered more or less randomly among the negatives, which suggests the two classes may be hard to separate. Let's build a model to find out.

In [6]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the rows for evaluation; stratifying on the label keeps the
# class proportions the same in the train and test parts.
features, labels = df[COLUMNS], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=labels,
)

# Baseline: 5-nearest-neighbours on the raw numeric features.
# NOTE(review): KNN is distance-based and the features are unscaled here;
# consider standardising inside a Pipeline -- confirm before changing, since the
# recorded report below reflects the unscaled model.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92       874
           1       0.11      0.02      0.03       126

    accuracy                           0.86      1000
   macro avg       0.49      0.50      0.48      1000
weighted avg       0.78      0.86      0.81      1000

