# Real or Fake Trump Headline
- The goal of this project is to classify if a given deadline is real or fake. The exact documentation is from CSC411's homework assignment Q2 (http://www.cs.toronto.edu/~rgrosse/courses/csc411_f18/homeworks/hw1/hw1.pdf)

- Useful sklearn documentation: https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

- good tutorial on sklearn vectorization: https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

## 1. Load the data

In [1]:
# the input file consists of lines of headlines
# parse the deadline into words and store 
# each headline into a row of an np.array
from sklearn.feature_extraction.text import *
from random import shuffle
import numpy as np

DOWNLOAD_ROOT = 'http://www.cs.toronto.edu/~rgrosse/courses/csc411_f18/homeworks/hw1/'
FILE_URLS = [DOWNLOAD_ROOT + 'clean_fake.txt', DOWNLOAD_ROOT + 'clean_real.txt']
FILE_PATH = 'data'

# load_data : str, lst of float, method, str --> scipy.sparse.csr.csr_matrix, vectorizer class object
# download the file at file_url
# splits into ratios of training, validation, testing sets
# vectorizer using vectorizer
# return the vector from vectorizer.fit_transform(the downloaded file) and the vectorizer
def load_data(file_urls=FILE_URLS,
                   ratios=[0.7, 0.15, 0.15],
                   vectorizer=TfidfVectorizer(),
                   file_path=FILE_PATH,
                   test=False
                  ):
    import os
    from six.moves import urllib

    
    # download the file
    # store content of each file as an np.array of str in txts
    # dividers stores the lengths
    txts = []
    div = [0]
    
    for file_url in file_urls:
        if test:
            print(file_url)
        
        file_name = file_url.split('/')[-1]

        if not os.path.isdir(file_path):
            os.makedirs(file_path)

        file_loc = os.path.join(file_path, file_name)
        urllib.request.urlretrieve(file_url, file_loc)

        # vectorize
        with open(file_loc) as f:
            txt = f.read().split('\n')
            txts += txt
            div.append(div[-1] + len(txt))
        
        if test:
            print(txt[:3])
            print(len(txt))
        
    if test:
        print('div[-1]: ', div[-1])
        print('len(txts): ', len(txts))

    # labels
    labels = np.zeros(div[-1])
    for i in range(len(div)-1):
        labels[div[i]: div[i+1]] = i
    
    if test:
        print(len(labels[labels==0]))
        print(len(labels[labels==1]))
                       
    # shuffle the index
    index = np.random.choice(div[-1], div[-1], replace=False)
    
    
    # generate train/validation/tst divide
    trn_ind = int(div[-1] * ratios[0])
    val_ind = trn_ind + int(div[-1] * ratios[1])
       
    # vectorize
    vector = vectorizer.fit_transform(txts)
    
    if test:
        print('trn_index:', trn_ind)
        print('val_index:', val_ind)
        print(index[:100])
        print(vector[index[:trn_ind]].shape)
        print(vector[index[trn_ind:val_ind]].shape)
        print(vector[index[val_ind:]].shape)

    return (vector[index[:trn_ind]], labels[index[:trn_ind]]), (vector[index[trn_ind:val_ind]], labels[index[trn_ind:val_ind]]), (vector[index[val_ind:]], labels[index[val_ind:]]) #, vectorizer


    

# testing
# trn, val, tst = load_data(FILE_URLS, vectorizer=TfidfVectorizer(ngram_range=(1,2)))
trn, val, tst = load_data(FILE_URLS, vectorizer=TfidfVectorizer())

In [2]:
trn[0].shape

(2287, 5799)

In [3]:
trn[-1].shape

(2287,)

# 2 Model selection 

In [4]:
from sklearn.tree import DecisionTreeClassifier

def select_model(max_depth=5, split_crits=['gini', 'entropy'], trn=trn, val=val):
    models = []
    errs = []
    
    for d in range(max_depth):
        for s in split_crits:
            models.append(DecisionTreeClassifier(criterion=s, max_depth=d+1))
            models[-1].fit(trn[0], trn[1])
            pred = models[-1].predict(val[0])
            target = val[1]
            errs.append(len(target[target != pred])/len(target))
    
    b_index = errs.index(min(errs))
    
    return (models[b_index], errs[b_index])
    
    
            

In [5]:
A = select_model(max_depth=20)

In [6]:
A

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=17,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'), 0.17346938775510204)

In [8]:
pred = A[0].predict(tst[0])
target = tst[1]
e = len(target[target != pred])/len(target)
print(e)

0.18329938900203666


In [None]:
# error rate of approximately 18% in determining if a headline is a real trump news or not