In [1]:
# importing the required libraries

# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): with no category/module filter this suppresses every warning for the
# rest of the session, including deprecation notices from pandas/seaborn/sklearn.
# Consider narrowing it to the specific categories that are known to be noisy.
warnings.filterwarnings("ignore")

# Preprocessing and model-selection helpers
# (not used in the cells visible here — presumably used further down; TODO confirm)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

In [2]:
# Load the patient records from the CSV file into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(
    filepath_or_buffer="global_cancer_patients_2015_2024.csv",
)


### Exploring the Data Set

In [3]:
# Preview the first two records of the DataFrame
df.head(n=2)

Unnamed: 0,Patient_ID,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score
0,PT0000000,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92
1,PT0000001,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65


In [4]:
# Report the dataset dimensions: number of rows (records) first, then number of columns
print(f"Number of records = {len(df)}")
print(f"Total number of columns in the dataframe = {len(df.columns)}")

Number of records = 50000
Total number of columns in the dataframe = 15


In [5]:
# Print a concise schema summary: each column's name, non-null count and dtype.
# Useful here both for spotting missing values and for checking that numeric
# columns were not parsed as strings.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Patient_ID             50000 non-null  object 
 1   Age                    50000 non-null  int64  
 2   Gender                 50000 non-null  object 
 3   Country_Region         50000 non-null  object 
 4   Year                   50000 non-null  int64  
 5   Genetic_Risk           50000 non-null  float64
 6   Air_Pollution          50000 non-null  float64
 7   Alcohol_Use            50000 non-null  float64
 8   Smoking                50000 non-null  float64
 9   Obesity_Level          50000 non-null  float64
 10  Cancer_Type            50000 non-null  object 
 11  Cancer_Stage           50000 non-null  object 
 12  Treatment_Cost_USD     50000 non-null  float64
 13  Survival_Years         50000 non-null  float64
 14  Target_Severity_Score  50000 non-null  float64
dtypes: float64(8), int64(2), object(5)

**Observation: From the above result we can see that every column has no missing values and already has the correct data type, so no type conversion is needed at this point.**

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for all non-object columns,
# transposed so that each column of the data becomes one row of the summary
df.describe(exclude="object").transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,50000.0,54.42154,20.224451,20.0,37.0,54.0,72.0,89.0
Year,50000.0,2019.48052,2.871485,2015.0,2017.0,2019.0,2022.0,2024.0
Genetic_Risk,50000.0,5.001698,2.885773,0.0,2.5,5.0,7.5,10.0
Air_Pollution,50000.0,5.010126,2.888399,0.0,2.5,5.0,7.5,10.0
Alcohol_Use,50000.0,5.01088,2.888769,0.0,2.5,5.0,7.5,10.0
Smoking,50000.0,4.989826,2.881579,0.0,2.5,5.0,7.5,10.0
Obesity_Level,50000.0,4.991176,2.894504,0.0,2.5,5.0,7.5,10.0
Treatment_Cost_USD,50000.0,52467.298239,27363.229379,5000.05,28686.225,52474.31,76232.72,99999.84
Survival_Years,50000.0,5.006462,2.883335,0.0,2.5,5.0,7.5,10.0
Target_Severity_Score,50000.0,4.951207,1.199677,0.9,4.12,4.95,5.78,9.16


In [8]:
# Summary of the object (string) columns: count, number of unique values,
# most frequent value and its frequency — one row per column
df.describe(include="object").transpose()

Unnamed: 0,count,unique,top,freq
Patient_ID,50000,50000,PT0049983,1
Gender,50000,3,Male,16796
Country_Region,50000,10,Australia,5092
Cancer_Type,50000,8,Colon,6376
Cancer_Stage,50000,5,Stage II,10124


**We can remove the `Patient_ID` column: it is a unique identifier (50,000 distinct values for 50,000 records), so it carries no information that would help predict the severity score of cancer patients.**

In [9]:
# Drop the identifier column: every Patient_ID value is unique, so it has no predictive value.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this cell
# idempotent — re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns=["Patient_ID"], errors="ignore")

In [10]:
# Preview the first two records again to confirm the remaining columns
df.head(n=2)

Unnamed: 0,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score
0,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92
1,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65


#### Univariate Analysis

In [11]:
# Split the column names by dtype: numeric columns vs. object (string) columns
num_col = df.select_dtypes(include=np.number).columns.tolist()
cat_col = df.select_dtypes(include="object").columns.tolist()

In [12]:
# Show the numeric and categorical column lists, separated by one blank line
print(num_col, cat_col, sep="\n\n")

['Age', 'Year', 'Genetic_Risk', 'Air_Pollution', 'Alcohol_Use', 'Smoking', 'Obesity_Level', 'Treatment_Cost_USD', 'Survival_Years', 'Target_Severity_Score']

['Gender', 'Country_Region', 'Cancer_Type', 'Cancer_Stage']
