In [24]:
import pandas as pd
import numpy as np
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from time_counter import time_counter

In [25]:
# Load the two datasets used below from CSV.
# BBC news frame: later cells read its 'text', 'labels' and 'label_ids' columns.
bbc_data = pd.read_csv("../data/clean_bbc_classification.csv")
# Sarcasm frame: later cells read its 'text' and 'is_sarcastic' columns.
sarc_data = pd.read_csv("../data/clean_sarcasm_classification.csv")

In [26]:
# Fine tuning params (shared by both RandomizedSearchCV runs in this notebook)

# Number of parameter settings each randomized search samples (passed as n_iter).
# NOTE(review): in the recorded runs the tuned trees scored below the untuned
# baselines; 5 draws may be too few for the size of the search space -- confirm.
num_iterations = 5
# Number of cross-validation folds used to score each sampled setting (passed as cv).
num_cv = 3

### DTC on BBC News Classification
#### Simple DTC

In [27]:
# Preprocessing: hold out 20% of the BBC rows, then turn the raw text into TF-IDF features.

X, y = bbc_data['text'], bbc_data['label_ids']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vocabulary.
vectorizer = TfidfVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [28]:
# Show which integer in 'label_ids' corresponds to which category name in 'labels'.
bbc_data[['labels', 'label_ids']].drop_duplicates()

Unnamed: 0,labels,label_ids
0,entertainment,0
386,business,1
896,sport,2
1407,politics,3
1824,tech,4


In [29]:
@time_counter
def model():
    """Fit an untuned decision tree on the BBC TF-IDF features and predict the test split.

    Reads X_train, y_train, X_test and y_test from the notebook namespace and
    prints a classification report for the held-out split.

    Returns:
        The predicted labels for X_test. The call site unpacks the decorated
        call as ``(preds, runtime)``, so ``time_counter`` presumably appends the
        elapsed seconds -- confirm against the time_counter module.
    """
    # scikit-learn permutes the candidate features at every split, so an
    # unseeded tree can give different metrics on each run. Fix the seed
    # (same value as the train/test split) to keep the report reproducible.
    dtc = DecisionTreeClassifier(random_state=123)
    dtc.fit(X_train, y_train)
    preds = dtc.predict(X_test)
    print("Simple Decision Tree Classifier")
    print(classification_report(y_test, preds))
    return preds

preds, runtime_dtc_simple = model()

print(f"Runtime: {runtime_dtc_simple} seconds")
# Keep the report as a dict so accuracy / F1 can be pulled into the summary table.
performance_dtc_simple = classification_report(y_test, preds, output_dict=True)

Simple Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.80      0.85      0.82        65
           1       0.82      0.78      0.80       103
           2       0.94      0.92      0.93       114
           3       0.82      0.85      0.83        88
           4       0.85      0.84      0.85        75

    accuracy                           0.85       445
   macro avg       0.84      0.85      0.85       445
weighted avg       0.85      0.85      0.85       445

Runtime: 0.73 seconds


#### Fine-tuned DTC

In [30]:
@time_counter
def model():
    """Tune a decision tree on the BBC TF-IDF features with a randomized search.

    Samples ``num_iterations`` parameter settings, scores each with
    ``num_cv``-fold cross-validation on accuracy, prints the winning
    parameters and score, and predicts the test split with the winning tree.

    Reads X_train, y_train, X_test, y_test, num_iterations and num_cv from the
    notebook namespace.

    Returns:
        The predicted labels for X_test. The call site unpacks the decorated
        call as ``(preds, runtime)``, so ``time_counter`` presumably appends the
        elapsed seconds -- confirm against the time_counter module.
    """
    param_dist = {
        'max_depth': [3, 5, 10, 15, 20],
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2', None],
        'max_leaf_nodes': [None, 10, 20, 30, 50, 100],
        'min_impurity_decrease': [0.0, 0.01, 0.1],
        'splitter': ['best', 'random'],
        'class_weight': [None, 'balanced']
    }

    # Seed the base estimator as well as the search: several sampled options
    # ('random' splitter, feature sub-sampling) are stochastic, so without a
    # seed the CV scores and the final predictions change between runs.
    rnd_cv = RandomizedSearchCV(
        DecisionTreeClassifier(random_state=123),
        param_distributions=param_dist,
        n_iter=num_iterations,
        cv=num_cv,
        scoring='accuracy',
        random_state=123,
        n_jobs=-1,
    )
    rnd_cv.fit(X_train, y_train)
    print("Best parameters:", rnd_cv.best_params_)
    print("Best score:", rnd_cv.best_score_)

    # refit=True (the default) has already retrained the best configuration on
    # all of X_train, so reuse that estimator instead of fitting another tree.
    preds = rnd_cv.best_estimator_.predict(X_test)
    print("Fine-tuned Decision Tree Classifier")
    print(classification_report(y_test, preds))
    return preds

preds, runtime_dtc_tuned = model()
print(f"Runtime: {runtime_dtc_tuned} seconds")
# Keep the report as a dict so accuracy / F1 can be pulled into the summary table.
performance_dtc_tuned = classification_report(y_test, preds, output_dict=True)

Best parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': 100, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 18, 'splitter': 'best'}
Best score: 0.6078832166521879
Fine-tuned Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.90      0.40      0.55        65
           1       0.39      0.92      0.55       103
           2       0.85      0.61      0.71       114
           3       0.96      0.52      0.68        88
           4       0.86      0.51      0.64        75

    accuracy                           0.62       445
   macro avg       0.79      0.59      0.63       445
weighted avg       0.78      0.62      0.63       445

Runtime: 8.08 seconds


In [31]:
# Summary table of the BBC results (one row per model).
# Built in a single step: concatenating onto an empty DataFrame makes pandas
# emit a FutureWarning and leaves the resulting column dtypes dependent on the
# pandas version.
data = pd.DataFrame(
    {
        'Model': ['Decision Tree', 'Decision Tree fine-tuned'],
        'Runtime': [runtime_dtc_simple, runtime_dtc_tuned],
        'Accuracy': [performance_dtc_simple['accuracy'], performance_dtc_tuned['accuracy']],
        'F1': [performance_dtc_simple['weighted avg']['f1-score'], performance_dtc_tuned['weighted avg']['f1-score']],
        'data': ['bbc news', 'bbc news'],
    },
    columns=['Model', 'Runtime', 'Accuracy', 'F1', 'data'],
)

  data = pd.concat([data,


In [32]:
# Display the summary table collected so far (the BBC news rows).
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,0.73,0.849438,0.849514,bbc news
1,Decision Tree fine-tuned,8.08,0.617978,0.632672,bbc news


### DTC on Sarcasm Detection
#### Simple DTC

In [33]:
# Preprocessing: drop rows with missing values, hold out 20% of what remains,
# then turn the raw text into TF-IDF features.
sarc_data = sarc_data.dropna()
X, y = sarc_data['text'], sarc_data['is_sarcastic']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vocabulary.
vectorizer = TfidfVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [34]:
@time_counter
def model():
    """Fit an untuned decision tree on the sarcasm TF-IDF features and predict the test split.

    Reads X_train, y_train, X_test and y_test from the notebook namespace and
    prints a classification report for the held-out split.

    Returns:
        The predicted labels for X_test. The call site unpacks the decorated
        call as ``(preds, runtime)``, so ``time_counter`` presumably appends the
        elapsed seconds -- confirm against the time_counter module.
    """
    # scikit-learn permutes the candidate features at every split, so an
    # unseeded tree can give different metrics on each run. Fix the seed
    # (same value as the train/test split) to keep the report reproducible.
    dtc = DecisionTreeClassifier(random_state=123)
    dtc.fit(X_train, y_train)
    preds = dtc.predict(X_test)
    print("Simple Decision Tree Classifier")
    print(classification_report(y_test, preds))
    return preds


preds, runtime_dtc_simple = model()
print(f"Runtime: {runtime_dtc_simple} seconds")
# Keep the report as a dict so accuracy / F1 can be pulled into the summary table.
performance_dtc_simple = classification_report(y_test, preds, output_dict=True)

@time_counter
def model():
    """Tune a decision tree on the sarcasm TF-IDF features with a randomized search.

    Samples ``num_iterations`` parameter settings, scores each with
    ``num_cv``-fold cross-validation on accuracy, prints the winning
    parameters and score, and predicts the test split with the winning tree.

    Reads X_train, y_train, X_test, y_test, num_iterations and num_cv from the
    notebook namespace.

    Returns:
        The predicted labels for X_test. The call site unpacks the decorated
        call as ``(preds, runtime)``, so ``time_counter`` presumably appends the
        elapsed seconds -- confirm against the time_counter module.
    """
    param_dist = {
        'max_depth': [3, 5, 10, 15, 20],
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2', None],
        'max_leaf_nodes': [None, 10, 20, 30, 50, 100],
        'min_impurity_decrease': [0.0, 0.01, 0.1],
        'splitter': ['best', 'random'],
        'class_weight': [None, 'balanced']
    }

    # Seed the base estimator as well as the search: several sampled options
    # ('random' splitter, feature sub-sampling) are stochastic, so without a
    # seed the CV scores and the final predictions change between runs.
    # The search seed stays at 42 so the same parameter settings are sampled.
    rnd_cv = RandomizedSearchCV(
        DecisionTreeClassifier(random_state=123),
        param_distributions=param_dist,
        n_iter=num_iterations,
        cv=num_cv,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1,
    )
    rnd_cv.fit(X_train, y_train)
    print("Best parameters:", rnd_cv.best_params_)
    print("Best score:", rnd_cv.best_score_)

    # refit=True (the default) has already retrained the best configuration on
    # all of X_train, so reuse that estimator instead of fitting another tree.
    preds = rnd_cv.best_estimator_.predict(X_test)
    print("Fine-tuned Decision Tree Classifier")
    print(classification_report(y_test, preds))
    return preds

preds, runtime_dtc_tuned = model()
print(f"Runtime: {runtime_dtc_tuned} seconds")
# Keep the report as a dict so accuracy / F1 can be pulled into the summary table.
performance_dtc_tuned = classification_report(y_test, preds, output_dict=True)

# Append the sarcasm-detection results to the summary table.
sarcasm_rows = pd.DataFrame({
    'Model': ['Decision Tree', 'Decision Tree fine-tuned'],
    'Runtime': [runtime_dtc_simple, runtime_dtc_tuned],
    'Accuracy': [performance_dtc_simple['accuracy'], performance_dtc_tuned['accuracy']],
    'F1': [performance_dtc_simple['weighted avg']['f1-score'], performance_dtc_tuned['weighted avg']['f1-score']],
    'data': ['sarcasm detection', 'sarcasm detection']
})

# Remove rows left by an earlier execution of this cell before appending, so
# re-running the cell replaces the sarcasm entries instead of duplicating them.
data = pd.concat(
    [data[data['data'] != 'sarcasm detection'], sarcasm_rows],
    ignore_index=True,
)

Simple Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      2978
           1       0.74      0.67      0.71      2746

    accuracy                           0.73      5724
   macro avg       0.73      0.73      0.73      5724
weighted avg       0.73      0.73      0.73      5724

Runtime: 14.38 seconds
Best parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 20, 'max_features': None, 'max_leaf_nodes': 20, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 7, 'min_samples_split': 12, 'splitter': 'best'}
Best score: 0.624399539980042
Fine-tuned Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.58      0.93      0.72      2978
           1       0.78      0.28      0.41      2746

    accuracy                           0.62      5724
   macro avg       0.68      0.60      0.56      5724
weighted avg       0.68      0.62      0.57      5724

Runtime: 8.67 seconds

In [35]:
# Display the full summary table: both models on both datasets.
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,0.73,0.849438,0.849514,bbc news
1,Decision Tree fine-tuned,8.08,0.617978,0.632672,bbc news
2,Decision Tree,14.38,0.730084,0.729078,sarcasm detection
3,Decision Tree fine-tuned,8.67,0.615828,0.567973,sarcasm detection


In [36]:
# Write the summary table to CSV; index=False leaves the row index out of the file.
data.to_csv("../data/evaluation_data.csv", index=False)