In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

import pandas as pd


In [2]:
data = pd.read_csv("../data/train_data_preprocessed.csv")

In [3]:
X = data.drop(['smoking'], axis = 1)
y = data['smoking']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "K-Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}


for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f"{name} Model")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("-" * 50)

In [6]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print("Random Forest Model")
print(f"Accuracy: {accuracy_rf}")
print("Classification Report:")
print(report_rf)
print("-" * 50)

Random Forest Model
Accuracy: 0.7806698950766747
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.78      0.80     22677
           1       0.72      0.79      0.75     16971

    accuracy                           0.78     39648
   macro avg       0.78      0.78      0.78     39648
weighted avg       0.78      0.78      0.78     39648

--------------------------------------------------


In [9]:
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 30, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=10)

In [10]:
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found by GridSearchCV:")
print(best_params)

best_grid_model = grid_search.best_estimator_
y_pred_grid = best_grid_model.predict(X_test)

accuracy_grid = accuracy_score(y_test, y_pred_grid)
report_grid = classification_report(y_test, y_pred_grid)

print(f"Accuracy with Grid Search: {accuracy_grid}")
print("Classification Report with Grid Search:")
print(report_grid)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


KeyboardInterrupt: 

In [17]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 787.7 kB/s eta 0:02:07
   ---------------------------------------- 0.2/99.8 MB 1.3 MB/s eta 0:01:16
   ---------------------------------------- 0.3/99.8 MB 1.9 MB/s eta 0:00:53
   ---------------------------------------- 0.6/99.8 MB 2.5 MB/s eta 0:00:40
   ---------------------------------------- 0.9/99.8 MB 3.3 MB/s eta 0:00:31
    --------------------------------------- 1.5/99.8 MB 4.7 MB/s eta 0:00:21
   - -------------------------------------- 3.0/99.8 MB 8.2 MB/s eta 0:00:12
   - -------------------------------------- 5.0/99.8 MB 12.2 MB/s eta 0:00:08
   -- ------------------

In [18]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 30, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],  # Bootstrap equivalent in XGBoost
    'tree_method': ['gpu_hist']  # Use GPU
}

# Note: XGBClassifier doesn't have 'min_samples_split' and 'min_samples_leaf', 
# these parameters are 'min_child_weight' and 'max_leaves' respectively.
# If you still want to specify similar functionalities, adjust accordingly.

grid_search = GridSearchCV(estimator=XGBClassifier(random_state=42, use_label_encoder=False), 
                           param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the model using GPU
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 648 candidates, totalling 1944 fits


KeyboardInterrupt: 