In [1]:
import graphviz as graphviz
import numpy as np
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

TRAIN_PROPORTION = 0.7
TEST_TO_VALIDATE_RATIO = 0.1


def load_data():
    # load clean data
    real_file = open('clean_real.txt', 'r')
    fake_file = open('clean_fake.txt', 'r')

    # build set of words, and store sentences as list of tokens
    real_sentences = [line for line in real_file]
    fake_sentences = [line for line in fake_file]
    corpus = real_sentences + fake_sentences

    # make labels
    labels = np.array(['real'] * len(real_sentences) + ['fake'] * len(fake_sentences))

    # split the data
    X_train, X_test, y_train, y_test = train_test_split(corpus, labels, train_size=TRAIN_PROPORTION)
    X_test, X_validate, y_test, y_validate = train_test_split(X_test, y_test,
                                                              train_size=TEST_TO_VALIDATE_RATIO)

    print(f"Training, validation, test split: ({len(X_train), len(X_test), len(X_validate)})")

    # vectorize the sentences
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_validate = vectorizer.transform(X_validate)
    X_test = vectorizer.transform(X_test)

    # return the train, validate, test data and the vectorizer

    return (X_train, X_validate, X_test, y_train, y_validate, y_test, vectorizer)


def measure_accuracy(test, predicted):
    assert len(test) == len(predicted)
    tot = 0
    correct = 0
    for i in range(len(test)):
        if test[i] == predicted[i]:
            correct += 1
        tot += 1
    return correct / tot


def select_model(x_train, x_validate, X_test, y_train, y_validate, y_test, plot_results=False):
    depths = np.arange(50, 300, 50)
    criteria = ['gini', 'entropy', 'log_loss']
    hyperparams = [(d, c) for d in depths for c in criteria]
    val_accuracies = np.zeros((len(criteria), len(depths)))

    for i, criterion in enumerate(criteria):
        for j, d in enumerate(depths):
            clf = tree.DecisionTreeClassifier(max_depth=d, criterion=criterion)
            clf = clf.fit(x_train, y_train)

            y_validation_prediction = clf.predict(x_validate)

            val_accuracies[i, j] = measure_accuracy(y_validate, y_validation_prediction)

            print(
                f"Depth {d:3} with {criterion:8} criterion had validation accuracy {measure_accuracy(y_validate, y_validation_prediction):0.5f} ")

    best_ind = np.argmax(val_accuracies)

    return hyperparams[best_ind]

    # fig = plt.figure()
    # for criterion in criteria:
    #     y_acc = []
    #     plt.scatter()

## Loading random splits of the data.

In [116]:
X_train, X_validate, X_test, y_train, y_validate, y_test, vectorizer = load_data()

Training, validation, test split: ((2286, 98, 882))


## Selecting a model.

In [117]:
depth, criterion = select_model(X_train, X_validate, X_test, y_train, y_validate, y_test)

Depth  50 with gini     criterion had validation accuracy 0.75397 
Depth 100 with gini     criterion had validation accuracy 0.76077 
Depth 150 with gini     criterion had validation accuracy 0.75510 
Depth 200 with gini     criterion had validation accuracy 0.76871 
Depth 250 with gini     criterion had validation accuracy 0.75283 
Depth  50 with entropy  criterion had validation accuracy 0.75397 
Depth 100 with entropy  criterion had validation accuracy 0.75737 
Depth 150 with entropy  criterion had validation accuracy 0.75283 
Depth 200 with entropy  criterion had validation accuracy 0.76531 
Depth 250 with entropy  criterion had validation accuracy 0.76190 
Depth  50 with log_loss criterion had validation accuracy 0.75850 
Depth 100 with log_loss criterion had validation accuracy 0.74943 
Depth 150 with log_loss criterion had validation accuracy 0.74943 
Depth 200 with log_loss criterion had validation accuracy 0.75624 
Depth 250 with log_loss criterion had validation accuracy 0.76

In [119]:
criterion = "entropy"

# train a model with the best hyperparameters
clf = tree.DecisionTreeClassifier(max_depth=depth, criterion=criterion)

clf.fit(X_train, y_train)

# report its accuracy on the test dataset

y_test_prediction = clf.predict(X_test)
acc = measure_accuracy(y_test, y_test_prediction)

print(f"\nA model trained on the best hyperparameters (depth={depth}, criterion={criterion}) had test accuracy {acc}")


A model trained on the best hyperparameters (depth=100, criterion=entropy) had test accuracy 0.7959183673469388


In [120]:
def calc_entropy(prob_array: np.array):
    return -sum(prob_array * np.log2(prob_array))

def calc_expectation(var_vals: np.array, var_probs: np.array):
    return sum(var_vals * var_probs)


def compute_information_gain(X_train, y_train, feature, threshold, vectorizer):

    feature_arr = vectorizer.get_feature_names_out()
    feature_ind = np.where(feature_arr == feature)[0][0]


    data = vectorizer.inverse_transform(X_train)

    above_t = np.transpose((X_train[:,feature_ind] >= threshold).toarray())[0] # contains True if the feature for a datapoint is above the threshold.
    below_t = np.logical_not(above_t) # contains True if the feature for a datapoint is below the threshold.

    reals = y_train == "real"
    fakes = y_train == "fake"

    counts: np.array = np.array([
        [np.logical_and(below_t, fakes).sum(), np.logical_and(below_t, reals).sum()],    # prob(Y = false)    prob(Y = real)  when feature <  T
        [np.logical_and(above_t, fakes).sum(), np.logical_and(above_t, reals).sum()]   # prob(Y = false)    prob(Y = real)  when feature >= T
    ])
    probs: np.array = counts / len(y_train)

    x_probs: np.array = probs.sum(axis=1)
    y_probs: np.array = probs.sum(axis=0)

    probs_y_given_x = probs / x_probs[:, None]

    entropy_y = calc_entropy(y_probs)

    conditional_entropies = [calc_entropy(probs_y_given_x[i]) for i in range(len(x_probs))]

    expected_conditional_entropy = calc_expectation(conditional_entropies, x_probs)

    inf_gain = entropy_y - expected_conditional_entropy

    return inf_gain

print(compute_information_gain(X_train, y_train, "the", 0.5, vectorizer))
print(compute_information_gain(X_train, y_train, "donald", 0.5, vectorizer))
print(compute_information_gain(X_train, y_train, "trumps", 0.5, vectorizer))
print(compute_information_gain(X_train, y_train, "hillary", 0.5, vectorizer))
print(compute_information_gain(X_train, y_train, "and", 0.5, vectorizer))

0.0429818196522872
0.052453112151929604
0.041374710968130834
0.03637243164347559
0.014628122746149663
