In [1]:
import graphviz as graphviz
import numpy as np
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

TRAIN_PROPORTION = 0.7
TEST_TO_VALIDATE_RATIO = 0.1


def load_data():
    # load clean data
    real_file = open('clean_real.txt', 'r')
    fake_file = open('clean_fake.txt', 'r')

    # build set of words, and store sentences as list of tokens
    real_sentences = [line for line in real_file]
    fake_sentences = [line for line in fake_file]
    corpus = real_sentences + fake_sentences

    # make labels
    labels = np.array(['real'] * len(real_sentences) + ['fake'] * len(fake_sentences))

    # split the data
    X_train, X_test, y_train, y_test = train_test_split(corpus, labels, train_size=TRAIN_PROPORTION)
    X_test, X_validate, y_test, y_validate = train_test_split(X_test, y_test,
                                                              train_size=TEST_TO_VALIDATE_RATIO)

    print(f"Training, validation, test split: ({len(X_train), len(X_test), len(X_validate)})")

    # vectorize the sentences
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_validate = vectorizer.transform(X_validate)
    X_test = vectorizer.transform(X_test)

    # return the train, validate, test data and the vectorizer

    return (X_train, X_validate, X_test, y_train, y_validate, y_test, vectorizer)


def measure_accuracy(test, predicted):
    assert len(test) == len(predicted)
    tot = 0
    correct = 0
    for i in range(len(test)):
        if test[i] == predicted[i]:
            correct += 1
        tot += 1
    return correct / tot


def select_model(x_train, x_validate, X_test, y_train, y_validate, y_test, plot_results=False):
    depths = np.arange(50, 300, 50)
    criteria = ['gini', 'entropy', 'log_loss']
    hyperparams = [(d, c) for d in depths for c in criteria]
    val_accuracies = np.zeros((len(criteria), len(depths)))

    for i, criterion in enumerate(criteria):
        for j, d in enumerate(depths):
            clf = tree.DecisionTreeClassifier(max_depth=d, criterion=criterion)
            clf = clf.fit(x_train, y_train)

            y_validation_prediction = clf.predict(x_validate)

            val_accuracies[i, j] = measure_accuracy(y_validate, y_validation_prediction)

            print(
                f"Depth {d:3} with {criterion:8} criterion had validation accuracy {measure_accuracy(y_validate, y_validation_prediction):0.5f} ")

    best_ind = np.argmax(val_accuracies)

    return hyperparams[best_ind]

    # fig = plt.figure()
    # for criterion in criteria:
    #     y_acc = []
    #     plt.scatter()

## Loading random splits of the data.

In [4]:
X_train, X_validate, X_test, y_train, y_validate, y_test, vectorizer = load_data()

Training, validation, test split: ((2286, 98, 882))


## Selecting a model.

In [2]:
depth, criterion = select_model(X_train, X_validate, X_test, y_train, y_validate, y_test)

Training, validation, test split: ((2286, 98, 882))
Depth  50 with gini     criterion had validation accuracy 0.77098 
Depth 100 with gini     criterion had validation accuracy 0.78231 
Depth 150 with gini     criterion had validation accuracy 0.79025 
Depth 200 with gini     criterion had validation accuracy 0.77778 
Depth 250 with gini     criterion had validation accuracy 0.76417 
Depth  50 with entropy  criterion had validation accuracy 0.78345 
Depth 100 with entropy  criterion had validation accuracy 0.77778 
Depth 150 with entropy  criterion had validation accuracy 0.77211 
Depth 200 with entropy  criterion had validation accuracy 0.78345 
Depth 250 with entropy  criterion had validation accuracy 0.78118 
Depth  50 with log_loss criterion had validation accuracy 0.76984 
Depth 100 with log_loss criterion had validation accuracy 0.76757 
Depth 150 with log_loss criterion had validation accuracy 0.77551 
Depth 200 with log_loss criterion had validation accuracy 0.77438 
Depth 250 

In [3]:
# train a model with the best hyperparameters
clf = tree.DecisionTreeClassifier(max_depth=depth, criterion=criterion)

clf.fit(X_train, y_train)

# report its accuracy on the test dataset

y_test_prediction = clf.predict(X_test)
acc = measure_accuracy(y_test, y_test_prediction)

print(f"\nA model trained on the best hyperparameters (depth={depth}, criterion={criterion}) had test accuracy {acc}")


A model trained on the best hyperparameters (depth=50, criterion=log_loss) had test accuracy 0.7448979591836735


In [49]:
def compute_information_gain(X_train, y_train, feature, threshold, vectorizer):

    feature_arr = vectorizer.get_feature_names_out()
    feature_ind = np.where(feature_arr == feature)[0][0]


    data = vectorizer.inverse_transform(X_train)

    print(sum([feature in row for row in data]))

    arr = X_train[:,feature_ind] >= 0.5

    print(arr.sum())


    # joint_pdf = np.array([
    #     X_train[]
    # ])
    #
    # entropy =

    # print(X_train[])

    pass

print(compute_information_gain(X_train, y_train, "true", 0.5, vectorizer))

8
8
None
