In [1]:
import pandas as pd
import numpy as np
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time

In [2]:
bbc_data = pd.read_csv("../data/clean_bbc_classification.csv")
sarc_data = pd.read_csv("../data/clean_sarcasm_classification.csv")

### DTC on BBC News Classification
#### Simple DTC

In [3]:
# Preprocessing

X = bbc_data['text']
y = bbc_data['label_ids']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [4]:
bbc_data[['labels', 'label_ids']].drop_duplicates()

Unnamed: 0,labels,label_ids
0,entertainment,0
386,business,1
896,sport,2
1407,politics,3
1824,tech,4


In [5]:
t0 = time()
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
preds = dtc.predict(X_test)
print("Simple Decision Tree Classifier")
print(classification_report(y_test, preds))
t1 = time()
runtime_dtc_simple = round(t1-t0, 2)
print(f"Runtime: {runtime_dtc_simple} seconds")
performance_dtc_simple = classification_report(y_test, preds, output_dict=True)

Simple Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.77      0.86      0.81        65
           1       0.79      0.77      0.78       103
           2       0.95      0.91      0.93       114
           3       0.83      0.83      0.83        88
           4       0.80      0.79      0.79        75

    accuracy                           0.83       445
   macro avg       0.83      0.83      0.83       445
weighted avg       0.84      0.83      0.83       445

Runtime: 1.19 seconds


#### Fine-tuned DTC

In [6]:
t0 = time()
dtc = DecisionTreeClassifier()
param_dist = {
    'max_depth': [3, 5, 10, 15, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [None, 10, 20, 30, 50, 100],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'splitter': ['best', 'random'],
    'class_weight': [None, 'balanced']
}

rnd_cv = RandomizedSearchCV(dtc, param_distributions=param_dist, 
                                n_iter=100, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
rnd_cv.fit(X_train, y_train)
print("Best parameters:", rnd_cv.best_params_)
print("Best score:", rnd_cv.best_score_)
dtc = DecisionTreeClassifier(**rnd_cv.best_params_)
dtc.fit(X_train, y_train)
preds = dtc.predict(X_test)
print("Fine-tuned Decision Tree Classifier")
print(classification_report(y_test, preds))
t1 = time()
runtime_dtc_tuned = round(t1-t0, 2)
print(f"Runtime: {runtime_dtc_tuned} seconds")
performance_dtc_tuned = classification_report(y_test, preds, output_dict=True)

Best parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 15, 'max_features': None, 'max_leaf_nodes': 30, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 8, 'min_samples_split': 16, 'splitter': 'best'}
Best score: 0.800561797752809
Fine-tuned Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.58      0.86      0.70        65
           1       0.83      0.81      0.82       103
           2       0.94      0.87      0.90       114
           3       0.88      0.80      0.83        88
           4       0.86      0.73      0.79        75

    accuracy                           0.82       445
   macro avg       0.82      0.81      0.81       445
weighted avg       0.84      0.82      0.82       445

Runtime: 16.05 seconds


In [7]:
data = pd.DataFrame(columns=['Model', 'Runtime', 'Accuracy', 'F1', 'data'])

data = pd.concat([data,
                  pd.DataFrame({
                      'Model': ['Decision Tree', 'Decision Tree fine-tuned'],
                      'Runtime': [runtime_dtc_simple, runtime_dtc_tuned],
                      'Accuracy': [performance_dtc_simple['accuracy'], performance_dtc_tuned['accuracy']],
                      'F1': [performance_dtc_simple['weighted avg']['f1-score'], performance_dtc_tuned['weighted avg']['f1-score']],
                      'data': ['bbc news', 'bbc news']
                  })], ignore_index=True)

  data = pd.concat([data,


In [8]:
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,1.19,0.833708,0.834099,bbc news
1,Decision Tree fine-tuned,16.05,0.81573,0.82067,bbc news


### DTC on Sarcasm Detection
#### Simple DTC

In [9]:
# Preprocessing

X = sarc_data['text']
y = sarc_data['is_sarcastic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [4]:
import pandas as pd

In [5]:
s = pd.read_csv("../data/clean_sarcasm_classification.csv")

In [6]:
s[pd.isna(s['text'])]

Unnamed: 0,text,is_sarcastic
26524,,0
