In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
final_prostate_data = pd.read_csv('final_prostate_data.csv')

# Separate features and target
X = final_prostate_data.drop('PROSTCAN_A', axis=1)  # Replace 'target_column' with your actual target column name
y = final_prostate_data['PROSTCAN_A']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Define transformers for preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
# Create a pipeline with the Decision Tree model
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Train the model on the training data
dt_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['BMICAT_A', 'DIBEV_A', 'SMKCIGST_A', 'ASEV_A', 'COPDEV_A', 'HYPEV_A',
       'AGEP_A', 'SKNNMCAN_A', 'OTHERCANP_A', 'SKNDCAN_A', 'MELANCAN_A',
       'WEIGHTLBTC_A', 'LUNGCAN_A', 'LYMPHCAN_A', 'COLONCAN_A', 'HE...
       'LARYNCAN_A', 'BREASCAN_A', 'REGION', 'ANXFREQ_A', 'RACEALLP',
       'HOUYRSLIV_A', 'SMOKELSEV_A', 'CHDEV_A', 'PSATEST_A', 'LEGMSTAT_A',
       'ORIENT_A', 'ECIGEV_A'],
      dtype='object')),
                                                 ('cat',
                                    

In [4]:
# Predict on the training data
y_train_pred = dt_pipeline.predict(X_train)
# Generate and print the classification report for the training data
print("Classification Report for Training Data:")
print(classification_report(y_train, y_train_pred))

Classification Report for Training Data:
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      1724
         2.0       1.00      1.00      1.00      4198
         7.0       1.00      1.00      1.00        10
         9.0       1.00      1.00      1.00        32

    accuracy                           1.00      5964
   macro avg       1.00      1.00      1.00      5964
weighted avg       1.00      1.00      1.00      5964



In [5]:
# Predict on the test data
y_test_pred = dt_pipeline.predict(X_test)

# Generate and print the classification report for the test data
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         1.0       0.86      0.87      0.86       439
         2.0       0.95      0.94      0.94      1039
         7.0       1.00      1.00      1.00         2
         9.0       1.00      1.00      1.00        11

    accuracy                           0.92      1491
   macro avg       0.95      0.95      0.95      1491
weighted avg       0.92      0.92      0.92      1491

