In [48]:
import numpy as np
import pandas as pd

# Read the labelled training set and the unlabelled test set from the
# working directory, then preview the first training rows.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test_X.csv')
)
train_data.head()


Unnamed: 0,PatientID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,247,48,M,ASY,122,275,1,ST,150,Y,2.0,Down,1
1,829,29,M,ATA,130,204,0,LVH,202,N,0.0,Up,0
2,446,54,M,ASY,130,0,0,ST,117,Y,1.4,Flat,1
3,780,64,F,ASY,180,325,0,Normal,154,Y,0.0,Up,0
4,488,65,M,TA,140,252,0,Normal,135,N,0.3,Up,0


In [49]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Build the model inputs from a derived copy so the raw `train_data` frame is
# never mutated: the cell can be re-run without a KeyError on "PatientID" and
# the preview shown when the data was loaded stays accurate.

# Feature columns, grouped by the kind of preprocessing they receive
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numerical_features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# Split into features (X) and target (y); "PatientID" is dropped so it is not
# used as a feature. `drop` returns a new frame, leaving `train_data` intact.
X_train = train_data.drop(columns=["PatientID", "HeartDisease"])
y_train = train_data["HeartDisease"]

# 1. Handle categorical features: one LabelEncoder per column, stored so the
#    exact same string -> integer mapping can be applied to the test data.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le

# 2. Scale the numerical features; the fitted scaler is reused on the test data
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])

X_train.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,-0.587544,1,0,-0.582619,0.701478,1,2,0.523234,1,1.037,0
1,-2.584958,1,1,-0.152377,0.042658,0,0,2.552011,0,-0.838509,2
2,0.043219,1,0,-0.152377,-1.850288,0,2,-0.764259,1,0.474347,1
3,1.09449,0,0,2.536633,1.165435,0,1,0.679293,1,-0.838509,2
4,1.199617,1,3,0.385425,0.488057,0,1,-0.06199,0,-0.557182,2


In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest of 100 trees; random_state is fixed at 42
clf = RandomForestClassifier(random_state=42, n_estimators=100)

# Accuracy of the classifier under 5-fold cross-validation on the training set
scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)

# Summarise the fold scores by their mean and standard deviation
mean_score, std_score = scores.mean(), scores.std()

(mean_score, std_score)


(0.8625, 0.01778645621509123)

In [51]:
# Train the classifier on the entire training set
clf.fit(X_train, y_train)

# Preprocess a derived copy of the test data. The raw `test_data` frame is
# left untouched, so this cell can be re-run without failing on the
# "PatientID" lookup or transforming already-transformed values.
patient_ids = test_data['PatientID'].values  # Store patient IDs for the submission
X_test = test_data.drop(columns="PatientID")

# Handle categorical features with the encoders fitted on the training data.
# NOTE: LabelEncoder.transform raises ValueError for a category that was not
# present when the encoder was fitted.
for col in categorical_features:
    X_test[col] = label_encoders[col].transform(X_test[col])

# Scale numerical features with the scaler fitted on the training data
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Keep exactly the training feature columns, in the training order; a column
# missing from the test file raises a KeyError here instead of reaching the model.
X_test = X_test[X_train.columns]

# Generate predictions for the test set
y_pred = clf.predict(X_test)

y_pred


array([1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])

In [52]:
# Destination of the submission file (relative to the working directory)
submission_file_path = "submission.csv"

# Two-column table pairing each patient ID with its predicted label
submission_df = pd.DataFrame(dict(PatientID=patient_ids, HeartDisease=y_pred))

# Write the table without the DataFrame index
submission_df.to_csv(submission_file_path, index=False)

submission_file_path


'submission.csv'