In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

import pandas as pd


In [2]:
data = pd.read_csv("../data/train_data_preprocessed.csv")

In [3]:
X = data.drop(['smoking'], axis = 1)
y = data['smoking']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "K-Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}


for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f"{name} Model")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("-" * 50)

Decision Tree Model
Accuracy: 0.701725181598063
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.75      0.74     22677
           1       0.65      0.64      0.65     16971

    accuracy                           0.70     39648
   macro avg       0.70      0.69      0.69     39648
weighted avg       0.70      0.70      0.70     39648

--------------------------------------------------
SVM Model
Accuracy: 0.7663438256658596
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.73      0.78     22677
           1       0.69      0.81      0.75     16971

    accuracy                           0.77     39648
   macro avg       0.77      0.77      0.77     39648
weighted avg       0.78      0.77      0.77     39648

--------------------------------------------------
Logistic Regression Model
Accuracy: 0.7486884584342212
Classification Report:
              precision    recall

In [11]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print("Random Forest Model")
print(f"Accuracy: {accuracy_rf}")
print("Classification Report:")
print(report_rf)
print("-" * 50)

Random Forest Model
Accuracy: 0.7806698950766747
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.78      0.80     22677
           1       0.72      0.79      0.75     16971

    accuracy                           0.78     39648
   macro avg       0.78      0.78      0.78     39648
weighted avg       0.78      0.78      0.78     39648

--------------------------------------------------


In [12]:
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 30, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

In [13]:
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found by GridSearchCV:")
print(best_params)

best_grid_model = grid_search.best_estimator_
y_pred_grid = best_grid_model.predict(X_test)

accuracy_grid = accuracy_score(y_test, y_pred_grid)
report_grid = classification_report(y_test, y_pred_grid)

print(f"Accuracy with Grid Search: {accuracy_grid}")
print("Classification Report with Grid Search:")
print(report_grid)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits
