In [33]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.cluster import KMeans
from joblib import Parallel, delayed, parallel_backend
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.model_selection import GridSearchCV
import time
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import os
import seaborn as sns
%matplotlib inline

In [34]:
pancreatic = pd.read_csv('/content/binarized_pancreatic_cancer_dataset.csv')
pancreatic.head()

Unnamed: 0,Age,Smoking_History,Obesity,Diabetes,Chronic_Pancreatitis,Family_History,Hereditary_Condition,Jaundice,Abdominal_Discomfort,Back_Pain,...,Diet_Processed_Food_High,Diet_Processed_Food_Low,Diet_Processed_Food_Medium,Access_to_Healthcare_High,Access_to_Healthcare_Low,Access_to_Healthcare_Medium,Urban_vs_Rural_Urban,Economic_Status_High,Economic_Status_Low,Economic_Status_Middle
0,64,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,1,0
1,77,1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,1,0,1,0
2,71,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1
3,56,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
4,82,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0


In [35]:
pancreatic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 45 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   Age                             50000 non-null  int64
 1   Smoking_History                 50000 non-null  int64
 2   Obesity                         50000 non-null  int64
 3   Diabetes                        50000 non-null  int64
 4   Chronic_Pancreatitis            50000 non-null  int64
 5   Family_History                  50000 non-null  int64
 6   Hereditary_Condition            50000 non-null  int64
 7   Jaundice                        50000 non-null  int64
 8   Abdominal_Discomfort            50000 non-null  int64
 9   Back_Pain                       50000 non-null  int64
 10  Weight_Loss                     50000 non-null  int64
 11  Development_of_Type2_Diabetes   50000 non-null  int64
 12  Survival_Time_Months            50000 non-null  int64
 13  S

In [37]:
cols_to_drop = ['Diabetes', 'Chronic_Pancreatitis','Family_History','Hereditary_Condition','Jaundice','Development_of_Type2_Diabetes','Survival_Status','Country_Australia','Country_Brazil','Country_Canada','Country_China','Country_Germany','Country_India','Country_South Africa','Country_United Kingdom', 'Country_United States', 'Gender_Male', 'Treatment_Type_Chemotherapy','Treatment_Type_Radiation', 'Treatment_Type_Surgery', 'Physical_Activity_Level_High', 'Physical_Activity_Level_Low', 'Physical_Activity_Level_Medium','Diet_Processed_Food_High','Diet_Processed_Food_Low', 'Diet_Processed_Food_Medium', 'Access_to_Healthcare_High', 'Access_to_Healthcare_Low','Access_to_Healthcare_Medium','Economic_Status_High','Economic_Status_Low','Economic_Status_Middle']
pancreatic.drop(cols_to_drop, axis=1, inplace=True)

In [38]:
pancreatic_numerical_features = pancreatic[['Age']]
pancreatic_numerical_features

Unnamed: 0,Age
0,64
1,77
2,71
3,56
4,82
...,...
49995,55
49996,79
49997,76
49998,62


In [39]:
pancreatic_categorical_features = pancreatic.drop('Age', axis=1)
pancreatic_categorical_features

Unnamed: 0,Smoking_History,Obesity,Abdominal_Discomfort,Back_Pain,Weight_Loss,Survival_Time_Months,Alcohol_Consumption,Stage_at_Diagnosis_Stage I,Stage_at_Diagnosis_Stage II,Stage_at_Diagnosis_Stage III,Stage_at_Diagnosis_Stage IV,Urban_vs_Rural_Urban
0,0,0,0,0,0,13,0,0,0,1,0,1
1,1,1,0,0,0,13,1,0,0,1,0,1
2,0,0,0,0,1,3,0,0,0,0,1,0
3,0,0,0,0,0,6,1,0,0,0,1,0
4,0,0,0,0,0,9,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,1,0,0,1,9,0,0,0,0,1,1
49996,0,0,1,0,0,19,0,0,0,1,0,0
49997,0,0,0,0,0,7,0,0,0,0,1,1
49998,0,0,0,1,0,21,1,0,1,0,0,1


In [40]:
pancreatic_categorical_features.drop(['Stage_at_Diagnosis_Stage I','Stage_at_Diagnosis_Stage II','Stage_at_Diagnosis_Stage III','Stage_at_Diagnosis_Stage IV'], axis=1, inplace=True)
pancreatic_categorical_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Smoking_History       50000 non-null  int64
 1   Obesity               50000 non-null  int64
 2   Abdominal_Discomfort  50000 non-null  int64
 3   Back_Pain             50000 non-null  int64
 4   Weight_Loss           50000 non-null  int64
 5   Survival_Time_Months  50000 non-null  int64
 6   Alcohol_Consumption   50000 non-null  int64
 7   Urban_vs_Rural_Urban  50000 non-null  int64
dtypes: int64(8)
memory usage: 3.1 MB


In [41]:
target = pancreatic[['Stage_at_Diagnosis_Stage I','Stage_at_Diagnosis_Stage II','Stage_at_Diagnosis_Stage III','Stage_at_Diagnosis_Stage IV']].copy()
target.head()

Unnamed: 0,Stage_at_Diagnosis_Stage I,Stage_at_Diagnosis_Stage II,Stage_at_Diagnosis_Stage III,Stage_at_Diagnosis_Stage IV
0,0,0,1,0
1,0,0,1,0
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [42]:
###Convert all categorical variables to category
# Create an empty list to store the Series
series_list = []
for col in pancreatic_categorical_features.columns:
    result= convert_category(pancreatic_categorical_features[col])
    series_list.append(result)
# Concatenate the Series into a DataFrame
result_pancreatic = pd.concat(series_list, axis=1)
pancreatic_categorical_features = result_pancreatic
pancreatic_categorical_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Smoking_History       50000 non-null  category
 1   Obesity               50000 non-null  category
 2   Abdominal_Discomfort  50000 non-null  category
 3   Back_Pain             50000 non-null  category
 4   Weight_Loss           50000 non-null  category
 5   Survival_Time_Months  50000 non-null  category
 6   Alcohol_Consumption   50000 non-null  category
 7   Urban_vs_Rural_Urban  50000 non-null  category
dtypes: category(8)
memory usage: 394.1 KB


In [43]:
def convert_category(series):
  """
  Converts a pandas Series to the 'category' dtype.

  Args:
    series: The pandas Series to convert.

  Returns:
    The converted pandas Series with 'category' dtype.
  """
  return series.astype('category')

In [44]:
pancreatic_categorical_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Smoking_History       50000 non-null  category
 1   Obesity               50000 non-null  category
 2   Abdominal_Discomfort  50000 non-null  category
 3   Back_Pain             50000 non-null  category
 4   Weight_Loss           50000 non-null  category
 5   Survival_Time_Months  50000 non-null  category
 6   Alcohol_Consumption   50000 non-null  category
 7   Urban_vs_Rural_Urban  50000 non-null  category
dtypes: category(8)
memory usage: 394.1 KB


In [45]:
data = pd.concat([pancreatic_numerical_features, pancreatic_categorical_features,target], axis=1)
data.drop(['Stage_at_Diagnosis_Stage I','Stage_at_Diagnosis_Stage II','Stage_at_Diagnosis_Stage III','Stage_at_Diagnosis_Stage IV'], axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Age                   50000 non-null  int64   
 1   Smoking_History       50000 non-null  category
 2   Obesity               50000 non-null  category
 3   Abdominal_Discomfort  50000 non-null  category
 4   Back_Pain             50000 non-null  category
 5   Weight_Loss           50000 non-null  category
 6   Survival_Time_Months  50000 non-null  category
 7   Alcohol_Consumption   50000 non-null  category
 8   Urban_vs_Rural_Urban  50000 non-null  category
dtypes: category(8), int64(1)
memory usage: 784.7 KB


In [46]:
X = data
y = target

In [47]:
## Train-Test Split
###Split 80-20
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [48]:
num_col_names = pancreatic_numerical_features.columns
# 1. Import the class you will use
from sklearn.preprocessing import StandardScaler

# 2. Create an instance of the class
scaler = StandardScaler()

# 3. Use the fit method of the instance
scaler.fit(X_train[num_col_names])

# 4. Use the transform method to perform the transformation and explicitly cast to float64
X_train.loc[:, num_col_names] = scaler.transform(X_train[num_col_names]).astype('float64')

## Remember to also standardize the numerical features in the testing set
X_test.loc[:, num_col_names] = scaler.transform(X_test[num_col_names]).astype('float64')

 -0.25530375]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, num_col_names] = scaler.transform(X_train[num_col_names]).astype('float64')
  0.04521565]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_test.loc[:, num_col_names] = scaler.transform(X_test[num_col_names]).astype('float64')


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold,GridSearchCV

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import numpy as np

# Convert y_train and y_test from multilabel to multiclass
y_train_multiclass = y_train.idxmax(axis=1)
y_test_multiclass = y_test.idxmax(axis=1)


# Initialize model
log_reg = LogisticRegression(solver='liblinear', max_iter=1000)

# Parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# Stratified K-Fold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'roc_auc']

# GridSearchCV with multiple scoring metrics, set 'refit' to the primary metric (e.g., 'roc_auc')
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    refit='roc_auc',  # Choose the metric you care most about
    n_jobs=-1
)

# Fit the model (ensure X and y are defined)
grid_search.fit(X_train, y_train_multiclass)

# Optional: view best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)
grid_search.best_estimator_



Best Parameters: {'C': 0.001, 'penalty': 'l1'}
Best ROC AUC Score: nan


In [51]:
best_score = grid_search.best_score_
print(f"Best cross-validation score: {best_score:.4f}")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_test_multiclass = y_test.idxmax(axis=1) # Convert y_test to multiclass
test_accuracy = accuracy_score(y_test_multiclass, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(classification_report(y_test_multiclass, y_pred))

Best cross-validation score: nan
Test Accuracy: 0.6524
                              precision    recall  f1-score   support

  Stage_at_Diagnosis_Stage I       0.67      0.60      0.63       987
 Stage_at_Diagnosis_Stage II       0.67      0.31      0.42      2052
Stage_at_Diagnosis_Stage III       0.55      0.57      0.56      2971
 Stage_at_Diagnosis_Stage IV       0.71      0.91      0.80      3990

                    accuracy                           0.65     10000
                   macro avg       0.65      0.59      0.60     10000
                weighted avg       0.65      0.65      0.63     10000



In [52]:
# training a DescisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()

param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, 5, 7, None],  # None means no limit
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

# Stratified K-Fold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'roc_auc']

# Convert y_train and y_test from multilabel to multiclass
y_train_multiclass = y_train.idxmax(axis=1)
y_test_multiclass = y_test.idxmax(axis=1)


# GridSearchCV with multiple scoring metrics, set 'refit' to the primary metric (e.g., 'roc_auc')
grid_search = GridSearchCV(
    estimator=dtree,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    refit='roc_auc',  # Choose the metric you care most about
    n_jobs=-1
)

# Fit the model (ensure X and y are defined)
grid_search.fit(X_train, y_train_multiclass)

# Optional: view best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)
best_score = grid_search.best_score_
print(f"Best cross-validation score: {best_score:.4f}")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test_multiclass, y_pred)
test_accuracy
print(classification_report(y_test_multiclass, y_pred))

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


Best Parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best ROC AUC Score: nan
Best cross-validation score: nan
                              precision    recall  f1-score   support

  Stage_at_Diagnosis_Stage I       1.00      0.51      0.68       987
 Stage_at_Diagnosis_Stage II       0.79      0.41      0.54      2052
Stage_at_Diagnosis_Stage III       0.61      0.57      0.59      2971
 Stage_at_Diagnosis_Stage IV       0.70      1.00      0.83      3990

                    accuracy                           0.70     10000
                   macro avg       0.78      0.62      0.66     10000
                weighted avg       0.72      0.70      0.68     10000



In [55]:
# training a Random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Stratified K-Fold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'roc_auc']

# Convert y_train and y_test from multilabel to multiclass
y_train_multiclass = y_train.idxmax(axis=1)
y_test_multiclass = y_test.idxmax(axis=1)


# GridSearchCV with multiple scoring metrics, set 'refit' to the primary metric (e.g., 'roc_auc')
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    refit='roc_auc',  # Choose the metric you care most about
    n_jobs=-1
)

# Fit the model (ensure X and y are defined)
grid_search_rf.fit(X_train, y_train_multiclass)

# Optional: view best parameters and score
print("Best Parameters:", grid_search_rf.best_params_)
print("Best ROC AUC Score:", grid_search_rf.best_score_)
best_score = grid_search_rf.best_score_
print(f"Best cross-validation score: {best_score:.4f}")
best_model = grid_search_rf.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test_multiclass, y_pred)
test_accuracy
print(classification_report(y_test_multiclass, y_pred))

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best ROC AUC Score: nan
Best cross-validation score: nan
                              precision    recall  f1-score   support

  Stage_at_Diagnosis_Stage I       0.69      0.55      0.61       987
 Stage_at_Diagnosis_Stage II       0.51      0.46      0.49      2052
Stage_at_Diagnosis_Stage III       0.50      0.49      0.50      2971
 Stage_at_Diagnosis_Stage IV       0.72      0.81      0.76      3990

                    accuracy                           0.62     10000
                   macro avg       0.61      0.58      0.59     10000
                weighted avg       0.61      0.62      0.61     10000

