In [106]:
import numpy
import heapq
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.tree
import graphviz
import math
from collections import defaultdict
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz/bin'

In [157]:
def load_headlines():
    headlines = []
    with open('NEWS/clean_fake.txt', 'r') as f:
        headlines = f.read().split('\n')
        n_fake = len(headlines)
    with open('NEWS/clean_real.txt', 'r') as f:
        headlines.extend(f.read().split('\n'))
        n_real = len(headlines) - n_fake
    return headlines, n_fake, n_real

def Preprocess_Data(headlines, n_fake, n_real):
    X = vectorizer.fit_transform(headlines)
    Y = [0 for i in range(n_fake)] + [1 for i in range(n_real)]
    X_train, X_validate_test, Y_train, Y_validate_test = sklearn.model_selection.train_test_split(X, Y, test_size = 0.3)
    X_validate, X_test, Y_validate, Y_test = sklearn.model_selection.train_test_split(X_validate_test, Y_validate_test, test_size = 0.5)
    return X_train, X_validate, X_test, Y_train, Y_validate, Y_test 

def select_model(X_train, Y_train, X_validate, Y_validate, max_depths, criterions, k=2):
    best_trees = []
    for depth in max_depths:
        for criteria in criterions:
            tree = sklearn.tree.DecisionTreeClassifier(criterion=criteria, max_depth=depth)
            tree.fit(X_train,Y_train)
            Y_predict = tree.predict(X_validate, Y_validate)
            accuracy = float(sum([1 if Y_predict[i]== Y_validate[i] else 0 for i in range(len(Y_validate))]))/float(len(Y_validate))
            print("Tree max depth=%d, criterion=%s, validation_score=%f" % (depth, criteria, accuracy))
            heapq.heappush(best_trees, (-1*accuracy, tree))    
    trees = []
    for i in range(k):
        trees.append(heapq.heappop(best_trees))
    return trees 
    
            
def visualize_tree(tree, name):
    data = sklearn.tree.export_graphviz(tree, out_file=None, feature_names=vectorizer.get_feature_names())
    graph = graphviz.Source(data)
    graph.render(name)
    
def log2(x):
    return math.log(x)/math.log(2)
    
def calculate_entropy(Y):
    count = defaultdict(int)
    for i in range(len(Y)):
        count[Y[i]] += 1  
    entropy = 0
    for y in count:
        p = float(count[y])/len(Y)
        entropy -= p*log2(p)
    return entropy

def splitX(X, Y, feature, split):
    X_right, Y_right, X_left, Y_left = [[] for i in range(4)]
    X = X.toarray()
    for i in range(len(X)):
        if X[i][feature] < split:
            X_left.append(X[i])
            Y_left.append(Y[i])
        else:
            X_right.append(X[i])
            Y_right.append(Y[i])
    return X_right, Y_right, X_left, Y_left 

def compute_information_gain(X, Y, feature, Split):
    X_right, Y_right, X_left, Y_left = splitX(X, Y, feature, Split)
    parentEntropy = calculate_entropy(Y)
    leftEntropy = calculate_entropy(Y_left)
    rightEntropy = calculate_entropy(Y_right)
    IG = parentEntropy - float(len(Y_left)/len(Y))*leftEntropy - float(len(Y_right)/len(Y))*rightEntropy
    return IG
    
    
    

In [85]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer()
max_depths = range(1,5)
criterions = ['gini', 'entropy']
headlines, n_fake, n_real = load_headlines()
X_train, X_validate, X_test, Y_train, Y_validate, Y_test = Preprocess_Data(headlines, n_fake, n_real)
trees = select_model(X_train, Y_train, X_validate, Y_validate, max_depths, criterions)

Tree max depth=1, criterion=gini, validation_score=0.671429
Tree max depth=1, criterion=entropy, validation_score=0.579592
Tree max depth=2, criterion=gini, validation_score=0.673469
Tree max depth=2, criterion=entropy, validation_score=0.636735
Tree max depth=3, criterion=gini, validation_score=0.710204
Tree max depth=3, criterion=entropy, validation_score=0.626531
Tree max depth=4, criterion=gini, validation_score=0.714286
Tree max depth=4, criterion=entropy, validation_score=0.712245


In [102]:
visualize_tree(trees[0][1], 'news_tree1')

[(-0.7142857142857143, DecisionTreeClassifier(max_depth=4)), (-0.7122448979591837, DecisionTreeClassifier(criterion='entropy', max_depth=4))]


In [159]:
split_words = ['hillary', 'trump', 'korea']
for word in split_words:
    featureIndex = vectorizer.vocabulary_.get(word)
    print(compute_information_gain(X_train, Y_train, featureIndex, 0.5))

0.035609234170765
0.03690593571792811
0.016555315061006715


In [162]:
Y_test_predict_t1 = trees[0][1].predict(X_test, Y_test)
accuracy_test_t1 = sum([1 if Y_test_predict_t1[i]== Y_test[i] else 0 for i in range(len(Y_test))])/len(Y_test)
print(accuracy_test_t1)

0.7026476578411406


In [163]:
Y_test_predict_t2 = trees[1][1].predict(X_test, Y_test)
accuracy_test_t2 = sum([1 if Y_test_predict_t2[i]== Y_test[i] else 0 for i in range(len(Y_test))])/len(Y_test)
print(accuracy_test_t2)

0.7006109979633401
