In [1]:
import pandas as pd

DATA = '/kaggle/input/oral-cancer-prediction-dataset/oral_cancer_prediction_dataset.csv'
df = pd.read_csv(filepath_or_buffer=DATA, index_col=['ID'])
for column in [column for column in df.columns if set(df[column].unique().tolist()) == {'No', 'Yes'} ]:
    df[column] = df[column] == 'Yes'
df.head()

Unnamed: 0_level_0,Country,Age,Gender,Tobacco Use,Alcohol Consumption,HPV Infection,Betel Quid Use,Chronic Sun Exposure,Poor Oral Hygiene,Diet (Fruits & Vegetables Intake),...,Difficulty Swallowing,White or Red Patches in Mouth,Tumor Size (cm),Cancer Stage,Treatment Type,"Survival Rate (5-Year, %)",Cost of Treatment (USD),Economic Burden (Lost Workdays per Year),Early Diagnosis,Oral Cancer (Diagnosis)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Italy,36,Female,True,True,True,False,False,True,Low,...,False,False,0.0,0,No Treatment,100.0,0.0,0,False,False
2,Japan,64,Male,True,True,True,False,True,True,High,...,False,False,1.782186,1,No Treatment,83.340103,77772.5,177,False,True
3,UK,37,Female,False,True,False,False,True,True,Moderate,...,False,True,3.523895,2,Surgery,63.222871,101164.5,130,True,True
4,Sri Lanka,55,Male,True,True,False,True,False,True,Moderate,...,False,False,0.0,0,No Treatment,100.0,0.0,0,True,False
5,South Africa,68,Male,False,False,False,False,False,True,High,...,False,False,2.834789,3,No Treatment,44.293199,45354.75,52,False,True


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84922 entries, 1 to 84922
Data columns (total 24 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Country                                   84922 non-null  object 
 1   Age                                       84922 non-null  int64  
 2   Gender                                    84922 non-null  object 
 3   Tobacco Use                               84922 non-null  bool   
 4   Alcohol Consumption                       84922 non-null  bool   
 5   HPV Infection                             84922 non-null  bool   
 6   Betel Quid Use                            84922 non-null  bool   
 7   Chronic Sun Exposure                      84922 non-null  bool   
 8   Poor Oral Hygiene                         84922 non-null  bool   
 9   Diet (Fruits & Vegetables Intake)         84922 non-null  object 
 10  Family History of Cancer               

We want to pick out variables that contribute to the diagnosis but that are not aliases for the diagnosis, so we have to be careful. Also, we have a lot of rows, so we can probably get a good sense of whether we have a strong signal in our data using dimensionality reduction and just a small sample (a few percent) of the data.

In [3]:
from sklearn.manifold import TSNE

TARGET = 'Oral Cancer (Diagnosis)'
COLUMNS = [key for key, value in df.dtypes.to_dict().items() if str(value) in {'int64', 'bool', 'float64'} and key not in {TARGET, 'Cancer Stage', 'Early Diagnosis', 'Treatment Type', 'Cost of Treatment (USD)', 'Economic Burden (Lost Workdays per Year)', 'Survival Rate (5-Year, %)'}]
N = 2000
RANDOM_STATE = 2025

sample_df = df.sample(n=N, random_state=RANDOM_STATE)
reducer = TSNE(random_state=RANDOM_STATE)
plot_df = pd.DataFrame(columns=['x', 'y'], data=reducer.fit_transform(X=sample_df[COLUMNS]))
plot_df[TARGET] = sample_df[TARGET].tolist()

Do the columns we have chosen make sense?

In [4]:
COLUMNS

['Age',
 'Tobacco Use',
 'Alcohol Consumption',
 'HPV Infection',
 'Betel Quid Use',
 'Chronic Sun Exposure',
 'Poor Oral Hygiene',
 'Family History of Cancer',
 'Compromised Immune System',
 'Oral Lesions',
 'Unexplained Bleeding',
 'Difficulty Swallowing',
 'White or Red Patches in Mouth',
 'Tumor Size (cm)']

In [5]:
from plotly import express
from plotly import io

io.renderers.default = 'iframe'
express.scatter(data_frame=plot_df, x='x', y='y', color=TARGET)

This plot suggests we've chosen reasonably well, because there are a few difficult cases but otherwise we see strong clustering within classes and separation between clusters.

Is our target class balanced?

In [6]:
df[TARGET].value_counts().to_frame().T

Oral Cancer (Diagnosis),False,True
count,42573,42349


Almost exactly. Is that good (it represents a real phenomenon) or bad (it suggests our data is synthetic)? Hard to say. Let's build a model.