In [2]:
import pandas as pd
from sklearn.model_selection import KFold

In [4]:
df = pd.read_csv('../dataset/heart.csv')

In [5]:
df.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1


### K-fold splitting

In [11]:
N_FOLDS = 3
splits = KFold(N_FOLDS, shuffle = True, random_state = 0).split(df, df['HeartDisease'])
df['fold'] = 0
for i, (train_ind, val_ind) in enumerate(splits):
    df.loc[val_ind, 'fold'] = i

print(df['fold'].value_counts())

1    306
0    306
2    306
Name: fold, dtype: int64


## Complete example

In [12]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
df = pd.read_csv(url, compression='zip', sep='\t', header=None, names=['label', 'message'])

ValueError: Multiple files found in ZIP file. Only one file per ZIP: ['SMSSpamCollection', 'readme']

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset (Change the path to where the dataset is located)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
df = pd.read_csv(url, compression='zip', sep='\t', header=None, names=['label', 'message'])

# Encode label to 0 and 1 (0 for 'ham', 1 for 'spam')
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.3, random_state=42)

# Define a pipeline for Logistic Regression
lr_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(solver='liblinear', random_state=42))
])

# Define a pipeline for XGBoost
xgb_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

# Hyperparameter tuning for Logistic Regression using GridSearchCV
lr_param_grid = {'clf__C': [0.1, 1, 10]}
lr_grid = GridSearchCV(lr_pipeline, param_grid=lr_param_grid, cv=5)
lr_grid.fit(X_train, y_train)

# Hyperparameter tuning for XGBoost using GridSearchCV
xgb_param_grid = {'clf__learning_rate': [0.1, 0.2], 'clf__n_estimators': [100, 200]}
xgb_grid = GridSearchCV(xgb_pipeline, param_grid=xgb_param_grid, cv=5)
xgb_grid.fit(X_train, y_train)

# Evaluate models
print("Best parameters for Logistic Regression: ", lr_grid.best_params_)
print("Best parameters for XGBoost: ", xgb_grid.best_params_)

lr_predictions = lr_grid.predict(X_test)
xgb_predictions = xgb_grid.predict(X_test)

# Accuracy
print("\nLogistic Regression Accuracy: ", accuracy_score(y_test, lr_predictions))
print("XGBoost Accuracy: ", accuracy_score(y_test, xgb_predictions))

# Classification Report
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, lr_predictions))
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_predictions))

# Confusion Matrix
print("\nLogistic Regression Confusion Matrix:\n", confusion_matrix(y_test, lr_predictions))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, xgb_predictions))

# Cross Validation
lr_cv_scores = cross_val_score(lr_pipeline, df['message'], df['label'], cv=5)
xgb_cv_scores = cross_val_score(xgb_pipeline, df['message'], df['label'], cv=5)

print("\nLogistic Regression CV Scores: ", lr_cv_scores)
print("XGBoost CV Scores: ", xgb_cv_scores)

print("\nLogistic Regression CV Mean Accuracy: ", np.mean(lr_cv_scores))
print("XGBoost CV Mean Accuracy: ", np.mean(xgb_cv_scores))
