In [None]:
%pip install scikit-learn
# aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

Note: you may need to restart the kernel to use updated packages.


In [60]:
import pandas as pd #used for data manipulation and analysis. It provides data structures like DataFrames.
import numpy as np # numerical computations in Python. It provides support for arrays, matrices, and mathematical functions.
import matplotlib.pyplot as plt #used for creating static, interactive, and animated visualizations.
import seaborn as sns # a statistical data visualization library built on Matplotlib. It provides a high-level interface for drawing attractive graphs.
# scikit-learn library, which is a popular tool for machine learning in Python:
from sklearn.model_selection import train_test_split #Used to split datasets into training and testing subsets for model evaluation.
from sklearn.preprocessing import StandardScaler, OneHotEncoder # A preprocessing tool to standardize features by removing the mean and scaling to unit variance.
#Used to convert categorical variables into a one-hot encoded numeric array.
from sklearn.impute import SimpleImputer #Used to handle missing values by imputing them (e.g., replacing NaN with mean/median/mode).
from sklearn.compose import ColumnTransformer #Applies different transformations to different columns in a dataset (e.g., scaling numerical columns and one-hot encoding categorical columns).
from sklearn.pipeline import Pipeline #Chains multiple data processing steps (e.g., imputation, scaling, encoding) into a single object for streamlined workflow.

In [61]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', 1000)

In [62]:
# data_set = pd.read_csv('Lung Cancer.csv')
# age = data_set["age"]
# delay_df = pd.read_csv('hepatitis.delay', sep=r'\s+')
# expense_df = pd.read_csv('hepatitis.expense', sep=r'\s+')
# group_df = pd.read_csv('hepatitis.group', sep=r'\s+')

In [None]:
file_path = r"D:\ECU\Data-mining\Project\Lung Cancer.csv"

In [None]:
df = pd.read_csv(file_path, na_values='?')


In [71]:
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
   id   age  gender      country diagnosis_date cancer_stage family_history  smoking_status   bmi  cholesterol_level  hypertension  asthma  cirrhosis  other_cancer treatment_type end_treatment_date  survived
0   1  64.0    Male       Sweden     2016-04-05      Stage I            Yes  Passive Smoker  29.4                199             0       0          1             0   Chemotherapy         2017-09-10         0
1   2  50.0  Female  Netherlands     2023-04-20    Stage III            Yes  Passive Smoker  41.2                280             1       1          0             0        Surgery         2024-06-17         1
2   3  65.0  Female      Hungary     2023-04-05    Stage III            Yes   Former Smoker  44.0                268             1       1          0             0       Combined         2024-04-09         0
3   4  51.0  Female      Belgium     2016-02-05      Stage I             No  Passive Smoker  43.0                241             1       1 

In [72]:
print(df.isnull().sum())


id                    0
age                   0
gender                0
country               0
diagnosis_date        0
cancer_stage          0
family_history        0
smoking_status        0
bmi                   0
cholesterol_level     0
hypertension          0
asthma                0
cirrhosis             0
other_cancer          0
treatment_type        0
end_treatment_date    0
survived              0
dtype: int64


In [74]:
# def detect_outliers_iqr(df, column):
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
#     return outliers

# # Example: detect outliers in 'bmi'
# bmi_outliers = detect_outliers_iqr(df, 'bmi')
# print("BMI Outliers:")
# print(bmi_outliers[['id', 'bmi']])


def detect_outliers_iqr(df):
    numeric_cols = df.select_dtypes(include='number').columns
    outlier_summary = {}

    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_count = outliers.shape[0]

        if outlier_count > 0:
            outlier_summary[col] = {
                'count': outlier_count,
                'lower_bound': lower_bound,
                'upper_bound': upper_bound,
                'outliers': outliers[[col]]
            }

    return outlier_summary

# Run detection
outliers_by_column = detect_outliers_iqr(df)

# Print summary
for col, info in outliers_by_column.items():
    print(f"\nColumn: {col}")
    print(f"Number of outliers: {info['count']}")
    print(f"Lower bound: {info['lower_bound']:.2f}, Upper bound: {info['upper_bound']:.2f}")
    print(info['outliers'].head())



Column: age
Number of outliers: 3895
Lower bound: 27.00, Upper bound: 83.00
      age
23   21.0
477  85.0
576  87.0
755  25.0
760  90.0

Column: hypertension
Number of outliers: 222479
Lower bound: 1.00, Upper bound: 1.00
    hypertension
0              0
4              0
6              0
8              0
11             0

Column: cirrhosis
Number of outliers: 201101
Lower bound: 0.00, Upper bound: 0.00
    cirrhosis
0           1
13          1
15          1
20          1
22          1

Column: other_cancer
Number of outliers: 78460
Lower bound: 0.00, Upper bound: 0.00
    other_cancer
9              1
18             1
23             1
25             1
41             1

Column: survived
Number of outliers: 196004
Lower bound: 0.00, Upper bound: 0.00
    survived
1          1
6          1
10         1
21         1
27         1


✅ Strategy 1: Remove Outliers

In [75]:
def remove_outliers(df):
    numeric_cols = df.select_dtypes(include='number').columns
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df

df_no_outliers = remove_outliers(df.copy())


✅ Strategy 2: Cap (Winsorize) Outliers


In [76]:
def cap_outliers(df):
    numeric_cols = df.select_dtypes(include='number').columns
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df[col] = df[col].clip(lower, upper)
    return df

df_capped = cap_outliers(df.copy())


In [78]:
numeric_cols = df.select_dtypes(include='number').columns


In [80]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_normalized = df.copy()
df_normalized[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [81]:
# print(df_standardized[numeric_cols].describe())
print(df_normalized[numeric_cols].describe())


                  id            age            bmi  cholesterol_level   hypertension         asthma      cirrhosis   other_cancer       survived
count  890000.000000  890000.000000  890000.000000      890000.000000  890000.000000  890000.000000  890000.000000  890000.000000  890000.000000
mean        0.500000       0.510070       0.499799           0.557559       0.750024       0.469740       0.225956       0.088157       0.220229
std         0.288676       0.099945       0.288570           0.289549       0.432999       0.499084       0.418211       0.283524       0.414401
min         0.000000       0.000000       0.000000           0.000000       0.000000       0.000000       0.000000       0.000000       0.000000
25%         0.250000       0.440000       0.251724           0.306667       1.000000       0.000000       0.000000       0.000000       0.000000
50%         0.500000       0.510000       0.500000           0.613333       1.000000       0.000000       0.000000       0.000000 

In [82]:
# 2. Define your features (X) and target (y)

In [83]:
X = df.drop('survived', axis=1)  # All columns except 'survived'
y = df['survived']               # The target column


In [84]:
# 3. (Optional) Encode categorical features
X = pd.get_dummies(X, drop_first=True)


In [85]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,        # 20% for testing, 80% for training
    random_state=42,      # Ensures reproducibility
    stratify=y            # Keeps class distribution balanced (important for classification)
)


In [None]:
print("Training set:", X_train.shape)
print("Testing set:", X_test.shape)



Training set: (712000, 7888)
Testing set: (178000, 7888)


In [89]:
#  engineer a new column that calculates how long the patient was under treatment:
df['diagnosis_date'] = pd.to_datetime(df['diagnosis_date'])
df['end_treatment_date'] = pd.to_datetime(df['end_treatment_date'])

df['treatment_duration_days'] = (df['end_treatment_date'] - df['diagnosis_date']).dt.days


In [91]:
print(df[['id', 'diagnosis_date', 'end_treatment_date', 'treatment_duration_days']].head())


   id diagnosis_date end_treatment_date  treatment_duration_days
0   1     2016-04-05         2017-09-10                      523
1   2     2023-04-20         2024-06-17                      424
2   3     2023-04-05         2024-04-09                      370
3   4     2016-02-05         2017-04-23                      443
4   5     2023-11-29         2025-01-08                      406
