In [None]:
#DecisionTree
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import joblib

# Read the reduced tweet table from the Colab working directory.
filename = '/content/ReducedTweets.csv'
dataset = pd.read_csv(filename)

# Positional columns 1..15 are the features; positional column 16 is the label.
X = dataset.iloc[:, 1:16].values
y = dataset.iloc[:, 16].values

# Search grid for the decision tree: depths 1-4 crossed with both criteria.
dt_parameters = {
    'max_depth': range(1, 5),
    'criterion': ['gini', 'entropy'],
}

# Stratified 80/20 split. Only the 80% part (X_train / y_train) feeds the
# cross-validation; X_temp / y_temp are set aside here.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Splitters for nested cross-validation: the outer one yields the evaluation
# folds, the inner one is handed to the hyperparameter search.
outer_cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
inner_cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

# One score list per metric; each receives one entry per outer fold.
(
    dt_accuracy_scores_test,
    dt_precision_scores_test,
    dt_recall_scores_test,
    dt_f1_scores_test,
) = [], [], [], []

# Best model seen so far and the accuracy it achieved on its outer fold.
best_dt_model_test, best_dt_score_test = None, 0.0

# Nested cross-validation.
# For every outer fold: tune the tree's hyperparameters with an inner grid
# search on the fold's training part, then score the tuned tree on the fold's
# held-out part. The model with the highest held-out accuracy is remembered.
for train_index, test_index in outer_cv.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # Inner hyperparameter search. With refit=True the search finishes by
    # fitting a fresh clone with the best parameters on the whole fold
    # training data, so no separate manual refit is needed afterwards.
    dt_classifier = DecisionTreeClassifier(random_state=42)
    dt_grid_search = GridSearchCV(
        dt_classifier, dt_parameters, cv=inner_cv, refit=True
    )
    dt_grid_search.fit(X_train_fold, y_train_fold)
    dt_best_params = dt_grid_search.best_params_
    dt_best_model = dt_grid_search.best_estimator_

    # Evaluation on the held-out part of this outer fold
    dt_predictions_test = dt_best_model.predict(X_test_fold)

    # Accuracy is both a reported metric and the model-selection criterion,
    # so compute it once and reuse it.
    current_score = accuracy_score(y_test_fold, dt_predictions_test)

    # Append metrics
    dt_accuracy_scores_test.append(current_score)
    dt_precision_scores_test.append(precision_score(y_test_fold, dt_predictions_test))
    dt_recall_scores_test.append(recall_score(y_test_fold, dt_predictions_test))
    dt_f1_scores_test.append(f1_score(y_test_fold, dt_predictions_test))

    # Keep the best model. The `is None` check guarantees that a model is
    # retained even if every fold scores 0.0, which a strict comparison
    # against the initial 0.0 would never allow.
    if best_dt_model_test is None or current_score > best_dt_score_test:
        best_dt_score_test = current_score
        best_dt_model_test = dt_best_model

# Persist the best fold model, provided one was selected.
if best_dt_model_test is not None:
    joblib.dump(best_dt_model_test, 'best_decision_tree_model_test.pkl')

# Mean of each metric over the per-fold score lists.
(
    dt_average_accuracy_test,
    dt_average_precision_test,
    dt_average_recall_test,
    dt_average_f1_test,
) = (
    np.mean(fold_scores)
    for fold_scores in (
        dt_accuracy_scores_test,
        dt_precision_scores_test,
        dt_recall_scores_test,
        dt_f1_scores_test,
    )
)

# Report the averages, rounded to four decimals, one metric per line.
print("\nDecision Tree (Test Set):")
for metric_label, metric_average in (
    ("Average Accuracy:", dt_average_accuracy_test),
    ("Average Precision:", dt_average_precision_test),
    ("Average Recall:", dt_average_recall_test),
    ("Average F1-score:", dt_average_f1_test),
):
    print(metric_label, round(metric_average, 4))



Decision Tree (Test Set):
Average Accuracy: 0.8293
Average Precision: 0.869
Average Recall: 0.777
Average F1-score: 0.8198
