In [5]:
import pandas as pd
import numpy as np
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time

In [6]:
bbc_data = pd.read_csv("../data/clean_bbc_classification.csv")
sarc_data = pd.read_csv("../data/clean_sarcasm_classification.csv")

### DTC on BBC News Classification
#### Simple DTC

In [7]:
# Preprocessing

X = bbc_data['text']
y = bbc_data['label_ids']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [8]:
bbc_data[['labels', 'label_ids']].drop_duplicates()

Unnamed: 0,labels,label_ids
0,entertainment,0
386,business,1
896,sport,2
1407,politics,3
1824,tech,4


In [9]:
t0 = time()
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
preds = dtc.predict(X_test)
print("Simple Decision Tree Classifier")
print(classification_report(y_test, preds))
t1 = time()
runtime_dtc_simple = round(t1-t0, 2)
print(f"Runtime: {runtime_dtc_simple} seconds")
performance_dtc_simple = classification_report(y_test, preds, output_dict=True)

Simple Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        65
           1       0.79      0.78      0.78       103
           2       0.93      0.91      0.92       114
           3       0.80      0.83      0.82        88
           4       0.87      0.81      0.84        75

    accuracy                           0.84       445
   macro avg       0.84      0.84      0.84       445
weighted avg       0.84      0.84      0.84       445

Runtime: 0.7 seconds


#### Fine-tuned DTC

In [10]:
t0 = time()
dtc = DecisionTreeClassifier()
param_dist = {
    'max_depth': [3, 5, 10, 15, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [None, 10, 20, 30, 50, 100],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'splitter': ['best', 'random'],
    'class_weight': [None, 'balanced']
}

rnd_cv = RandomizedSearchCV(dtc, param_distributions=param_dist, 
                            n_iter=100, cv=5, scoring='accuracy', random_state=123, n_jobs=-1)
rnd_cv.fit(X_train, y_train)
print("Best parameters:", rnd_cv.best_params_)
print("Best score:", rnd_cv.best_score_)
dtc = DecisionTreeClassifier(**rnd_cv.best_params_)
dtc.fit(X_train, y_train)
preds = dtc.predict(X_test)
print("Fine-tuned Decision Tree Classifier")
print(classification_report(y_test, preds))
t1 = time()
runtime_dtc_tuned = round(t1-t0, 2)
print(f"Runtime: {runtime_dtc_tuned} seconds")
performance_dtc_tuned = classification_report(y_test, preds, output_dict=True)

Best parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 15, 'max_features': None, 'max_leaf_nodes': 30, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 8, 'min_samples_split': 16, 'splitter': 'best'}
Best score: 0.801123595505618
Fine-tuned Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.59      0.86      0.70        65
           1       0.83      0.81      0.82       103
           2       0.94      0.87      0.90       114
           3       0.88      0.80      0.83        88
           4       0.86      0.75      0.80        75

    accuracy                           0.82       445
   macro avg       0.82      0.82      0.81       445
weighted avg       0.84      0.82      0.82       445

Runtime: 14.45 seconds


In [11]:
data = pd.DataFrame(columns=['Model', 'Runtime', 'Accuracy', 'F1', 'data'])

data = pd.concat([data,
                  pd.DataFrame({
                      'Model': ['Decision Tree', 'Decision Tree fine-tuned'],
                      'Runtime': [runtime_dtc_simple, runtime_dtc_tuned],
                      'Accuracy': [performance_dtc_simple['accuracy'], performance_dtc_tuned['accuracy']],
                      'F1': [performance_dtc_simple['weighted avg']['f1-score'], performance_dtc_tuned['weighted avg']['f1-score']],
                      'data': ['bbc news', 'bbc news']
                  })], ignore_index=True)

  data = pd.concat([data,


In [12]:
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,0.7,0.840449,0.840706,bbc news
1,Decision Tree fine-tuned,14.45,0.817978,0.822761,bbc news


### DTC on Sarcasm Detection
#### Simple DTC

In [13]:
# Preprocessing
sarc_data = sarc_data.dropna()
X = sarc_data['text']
y = sarc_data['is_sarcastic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [19]:
t0 = time()
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
preds = dtc.predict(X_test)
print("Simple Decision Tree Classifier")
print(classification_report(y_test, preds))
t1 = time()
runtime_dtc_simple = round(t1-t0, 2)
print(f"Runtime: {runtime_dtc_simple} seconds")
performance_dtc_simple = classification_report(y_test, preds, output_dict=True)

t0 = time()
dtc = DecisionTreeClassifier()
param_dist = {
    'max_depth': [3, 5, 10, 15, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [None, 10, 20, 30, 50, 100],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'splitter': ['best', 'random'],
    'class_weight': [None, 'balanced']
}

rnd_cv = RandomizedSearchCV(dtc, param_distributions=param_dist, 
                                n_iter=100, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
rnd_cv.fit(X_train, y_train)
print("Best parameters:", rnd_cv.best_params_)
print("Best score:", rnd_cv.best_score_)
dtc = DecisionTreeClassifier(**rnd_cv.best_params_)
dtc.fit(X_train, y_train)
preds = dtc.predict(X_test)
print("Fine-tuned Decision Tree Classifier")
print(classification_report(y_test, preds))
t1 = time()
runtime_dtc_tuned = round(t1-t0, 2)
print(f"Runtime: {runtime_dtc_tuned} seconds")
performance_dtc_tuned = classification_report(y_test, preds, output_dict=True)

data = pd.concat([data,
                  pd.DataFrame({
                      'Model': ['Decision Tree', 'Decision Tree fine-tuned'],
                      'Runtime': [runtime_dtc_simple, runtime_dtc_tuned],
                      'Accuracy': [performance_dtc_simple['accuracy'], performance_dtc_tuned['accuracy']],
                      'F1': [performance_dtc_simple['weighted avg']['f1-score'], performance_dtc_tuned['weighted avg']['f1-score']],
                      'data': ['sarcasm detection', 'sarcasm detection']
                  })], ignore_index=True)

Simple Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.72      0.77      0.75      2978
           1       0.73      0.68      0.70      2746

    accuracy                           0.73      5724
   macro avg       0.73      0.72      0.72      5724
weighted avg       0.73      0.73      0.73      5724

Runtime: 11.28 seconds
Best parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 20, 'max_features': None, 'max_leaf_nodes': 20, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 17, 'splitter': 'random'}
Best score: 0.6282870658316201
Fine-tuned Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.59      0.92      0.72      2978
           1       0.78      0.32      0.45      2746

    accuracy                           0.63      5724
   macro avg       0.69      0.62      0.59      5724
weighted avg       0.68      0.63      0.59      57

In [20]:
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,0.7,0.840449,0.840706,bbc news
1,Decision Tree fine-tuned,14.45,0.817978,0.822761,bbc news
2,Decision Tree,11.28,0.725891,0.725137,sarcasm detection
3,Decision Tree fine-tuned,58.69,0.629804,0.590813,sarcasm detection


In [21]:
data.to_csv("../data/evaluation_data.csv", index=False)