# Fake news classifier 

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import csv
from pandas.tools.plotting import scatter_matrix
from sklearn.feature_extraction import stop_words

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

from sklearn.dummy import DummyClassifier

# Class, for use in pipelines, to select certain columns from a DataFrame and convert to a numpy array
# From A. Geron: Hands-On Machine Learning with Scikit-Learn & TensorFlow, O'Reilly, 2017
# Modified by Derek Bridge to allow for casting in the same ways as pandas.DataFrame.astype
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_selected = X[self.attribute_names]
        if self.dtype:
            return X_selected.astype(self.dtype).values
        return X_selected.values
    
# Class, for use in pipelines, to binarize nominal-valued features (while avoiding the dummy variable trap)
# By Derek Bridge, 2017
class FeatureBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, features_values):
        self.features_values = features_values
        self.num_features = len(features_values)
        self.labelencodings = [LabelEncoder().fit(feature_values) for feature_values in features_values]
        self.onehotencoder = OneHotEncoder(sparse=False,
            n_values=[len(feature_values) for feature_values in features_values])
        self.last_indexes = np.cumsum([len(feature_values) - 1 for feature_values in self.features_values])
    def fit(self, X, y=None):
        for i in range(0, self.num_features):
            X[:, i] = self.labelencodings[i].transform(X[:, i])
        return self.onehotencoder.fit(X)
    def transform(self, X, y=None):
        for i in range(0, self.num_features):
            X[:, i] = self.labelencodings[i].transform(X[:, i])
        onehotencoded = self.onehotencoder.transform(X)
        return np.delete(onehotencoded, self.last_indexes, axis=1)
    def fit_transform(self, X, y=None):
        onehotencoded = self.fit(X).transform(X)
        return np.delete(onehotencoded, self.last_indexes, axis=1)
    def get_params(self, deep=True):
        return {"features_values" : self.features_values}
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            self.setattr(parameter, value)
        return self

# similar to CS4618, but plays nice with cross_val_score
# for some reason, in cross_val_score, it didn't like the inheritance from Imputer class
class MissingNominalImputer:
    def __init__(self, missing_values, strategy):
        self.missing_values = missing_values
        self.strategy = strategy
        if(self.missing_values not in ['NaN','nan']):
                print("MissingNominalImputer can't deal with not NaN values")
    def fit(self, X, y=None):
        if self.strategy == "most_frequent":
            self.fills = pd.DataFrame(X).mode(axis=0).squeeze() 
            return self
        else:
            print('ERROR AT MISSING_NOMINAL_IMPUTER')
    def transform(self, X):
        if hasattr(self, "fills"):
            return pd.DataFrame(X).fillna(self.fills).values
        else:
            print('ERROR AT MISSING_NOMINAL_IMPUTER')

After computing a new feature, the number of true statements and outputing a df.describe(), I saw that the minimum of number of true statements was -1 (it is leakage, but I deleted that feature after, it was for testing purposes).

After a lot of debugging I found a particular example:
>1606	mostly-true	"Hospitals, doctors, MRIs, surgeries and so forth are more extensively used and far more expensive in this country than they are in many other countries.''	health-care	mitt-romney	Former governor	Massachusetts	republican	34	32	58	33	19	a Fox News Sunday interview  

The problem: "some text'', at closing, instead of double quotes, two single quotes were used. By default, read_csv considers a text between two double quotes as a token, and in our case another " used for matching was only some rows below.

We need not to consider a token between quotes, so the parameter quoting must be set to csv.QUOTE_NONE

In [5]:
#First, read the data
#it is a tab separated file, so sep argument is a tab
#take care at quotes, some end in '' instead of "
df = pd.read_csv("dataset_statements.tsv", sep = '\t', quoting = csv.QUOTE_NONE)

#also shuffle the data and reset index (better safe than sorry)
df = df.take(np.random.permutation(len(df)))
df.reset_index(drop=True, inplace=True)

In [6]:
df.shape

(12836, 14)

In [7]:
df.columns

Index(['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state',
       'party', 'num_barely_trues', 'num_falses', 'num_half_trues',
       'num_mostly_trues', 'num_pants_fires', 'location'],
      dtype='object')

In [8]:
df.dtypes

id                   int64
label               object
statement           object
subject             object
speaker             object
job                 object
state               object
party               object
num_barely_trues     int64
num_falses           int64
num_half_trues       int64
num_mostly_trues     int64
num_pants_fires      int64
location            object
dtype: object

In [9]:
df.head(3)

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,num_barely_trues,num_falses,num_half_trues,num_mostly_trues,num_pants_fires,location
0,1635,true,"""Just this week we received the news that for ...",social-security,marco-rubio,U.S. Senator,Florida,republican,33,24,32,35,5,a U.S. Senate primary debate on FOX News Sunday
1,7214,barely-true,"Hurricane Sandy, the most destructive Atlantic...","climate-change,environment,weather",environment-new-jersey,,New Jersey,none,1,0,0,0,0,an online post about global warming
2,10285,true,At the 50 Milwaukee schools serving at least 8...,education,alberta-darling,"State Senator, 8th District",Wisconsin,republican,1,1,2,1,1,an interview


By looking at the column meaning, it is clear that 'id' has no relation to the statement, so this column should be deleted.

The data types of the columns look ok.

In [10]:
#clear 'id' feature
df.drop('id', axis=1, inplace=True)
df.shape

(12836, 13)

In [11]:
df.describe(include = 'all')

Unnamed: 0,label,statement,subject,speaker,job,state,party,num_barely_trues,num_falses,num_half_trues,num_mostly_trues,num_pants_fires,location
count,12836,12836,12836,12836,9261,10084,12836,12836.0,12836.0,12836.0,12836.0,12836.0,12707
unique,6,12810,4547,3318,1360,85,24,,,,,,5161
top,half-true,On a cap-and-trade plan.,health-care,barack-obama,President,Texas,republican,,,,,,a news release
freq,2638,3,475,616,620,1263,5687,,,,,,310
mean,,,,,,,,11.59598,13.369975,17.218838,16.526955,6.246261,
std,,,,,,,,18.996727,24.150879,35.910604,36.225691,16.162788,
min,,,,,,,,0.0,0.0,0.0,0.0,0.0,
25%,,,,,,,,0.0,0.0,0.0,0.0,0.0,
50%,,,,,,,,2.0,2.0,3.0,3.0,1.0,
75%,,,,,,,,12.0,15.0,13.0,12.0,5.0,


We should have a look at every individual feature

 * label - no missing values, is the target class
 * statement
     * no missing values
     * has some duplicates, I don't think it is wrong
     * free form text
 * subject - no missing values, set-valued feature
 * speaker - no missing values, nominal-valued feature
 * job
     * lots of missing values
     * should not drop column, since some jobs involve more hiding of truth
     * will input mode in pipeline
     * nominal-valued feature
 * state
     * lots of missing values
     * will input mode in pipeline
     * nominal-valued feature
 * party - no missing values, nominal-valued feature
 * num_barely_trues, num_falses, num_half_trues, num_mostly_trues, num_pants_fires
     * no missing values
     * contain information about the whole dataset, that is **leakage**; so they should be dropped
     * numeric-valued features
 * location
     * few missing examples
     * should drop those examples
     * nominal-valued feature

Should check for good intentions of those who made the data, maybe instead of leaving 2 tabs for missing value they wrote ? or UNK or something

In [12]:
features = ['statement', 'subject', 'speaker', 'job', 'state', 'party', 'location']
for col in features:
    l = df[col].unique()
    for val in ['?', 'N/A', 'UNK', 'n/a', 'unk', 'NaN', 'nan', 'none']:
        if val in l:
            print(col, ' has ', val, ' for ', (df[col]==val).sum(), 'examples')

job  has  none  for  4 examples
party  has  none  for  2185 examples


Party feature could have none because they are not affiliated to any parties.  
Job also could be none, so it's not wrong

In [13]:
#drop missing location
df.dropna(subset = ['location'], inplace = True)
df.reset_index(drop = True, inplace = True)

df.shape

(12707, 13)

In [14]:
#drop columns with leakage
leak_col = ['num_barely_trues', 'num_falses', 'num_half_trues', 'num_mostly_trues', 'num_pants_fires']
for col in leak_col:
    df.drop(col, axis = 1, inplace = True)
df.shape

(12707, 8)

# Evaluation of classifiers

I've used stratified-k-fold-cross-validation because the test data is not so big (only 12 thousand examples).
Holdout does not reuse values for training and testing, so we use k-fold. Since we want the distribution of examples with respect to labels to be the same in test set and in training set, I used stratified k-fold.

When _k=10_, the number of examples in each fold is ~1200, so there are enough for testing and for training.

For comparing different classifiers, I use the following:
 * confusion matrix
 * accuracy (could also be seen out of confusion matrix)
 * recall - shows the ability of classifier to find positive examples; when used in multilabel classification and with average parameter = 'macro', it computes this score for each label and returns the mean
 
source for recall https://scikit-learn.org/stable/modules/model_evaluation.html

In [15]:
y = df["label"].values
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

label_domain = ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true']
labels_encoded_ord = encoder.transform(label_domain)
#labels_encoded_ord is used for ordering the confusion matrix

In [16]:
#prints the confusion matrix, accuracy and recall for a pipeline, evaluated using stratified k-fold
def evaluate(pipeline_name, pipeline, df = df, y_encoded = y_encoded, cv = 10):  
    y_predicted = cross_val_predict(pipeline, df, y_encoded, cv=cv)

    print(pd.DataFrame(confusion_matrix(y_encoded, y_predicted, labels = labels_encoded_ord), 
                       index = label_domain, columns = label_domain))
    print(pipeline_name, ' accuracy: ', accuracy_score(y_encoded, y_predicted))
    print(pipeline_name, ' recall:   ', recall_score(y_encoded, y_predicted, average = 'macro'))
    

First, let's build the majority-class classifier so we have a refference point

In [17]:
df['dummy'] = 1 #need a numerical feature for the dummyClassifier, we dropped all from dataset already

maj_pipeline = Pipeline([("selector", DataFrameSelector(['dummy'])), 
                         ("estimator", DummyClassifier(strategy = "most_frequent"))])

evaluate('Majority-class classifier: ', maj_pipeline)

#drop the dummy column
df.drop('dummy', axis=1, inplace=True)

             pants-fire  false  barely-true  half-true  mostly-true  true
pants-fire            0      0            0       1035            0     0
false                 0      0            0       2486            0     0
barely-true           0      0            0       2091            0     0
half-true             0      0            0       2615            0     0
mostly-true           0      0            0       2444            0     0
true                  0      0            0       2036            0     0
Majority-class classifier:   accuracy:  0.20579208310380107
Majority-class classifier:   recall:    0.16666666666666666


Now I will make a basic classifier which takes into account all features and uses a TfidfVectorizer for the 'statement'.
For encoding the 'subject', since it is a set-valued feature, a CountVectorizer will do the job but it is needed a different regex (words can be hyphenated as in "job-accomplishments")

In [18]:
# The features we want to select
nominal_features = ['speaker', 'job', 'state', 'party', 'location']

nominal_pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)), 
        ("imputer", MissingNominalImputer(missing_values="nan", strategy="most_frequent")), 
        ("binarizer", FeatureBinarizer([df[feature].dropna().unique() for feature in nominal_features]))
        ])

#need a count vectorizer for set-valued feature
sbj_pipeline = Pipeline([("selector", DataFrameSelector('subject')), 
                        ("vectorizer", CountVectorizer(token_pattern = '(?u)\\b[^,]+\\b'))
                         ])

text_pipeline = Pipeline([
        ("selector", DataFrameSelector('statement')),
        ("vectorizer", TfidfVectorizer(stop_words = 'english'))
        ])

union_step = ("union", FeatureUnion([("nominal_pipeline", nominal_pipeline),
                                     ("sbj_pipeline", sbj_pipeline),
                                     ("text_pipeline", text_pipeline)
                                    ]))

ovr_pipeline = Pipeline([union_step,            
                         ("estimator", LogisticRegression())
                        ])

cent_pipeline = Pipeline([union_step,
                          ("estimator", LogisticRegression(multi_class = "multinomial", solver = "newton-cg"))
                        ])

In [19]:
evaluate('One-vs-rest', ovr_pipeline)
evaluate('Cross-entropy', cent_pipeline)

             pants-fire  false  barely-true  half-true  mostly-true  true
pants-fire          229    322          151        172           93    68
false               162    750          385        483          395   311
barely-true          77    520          389        534          361   210
half-true            56    496          387        708          638   330
mostly-true          44    372          279        635          685   429
true                 29    361          177        451          544   474
One-vs-rest  accuracy:  0.2545840875108208
One-vs-rest  recall:    0.248802374992091
             pants-fire  false  barely-true  half-true  mostly-true  true
pants-fire          234    318          158        163           95    67
false               165    738          393        485          392   313
barely-true          82    514          401        524          357   213
half-true            61    477          403        705          631   338
mostly-true          46    

It doesn't look too promising, only for recall score an improvement is seen from the dummy classifier. No advantage is seen for using the cross-entropy function over the one-vs-rest approact.  
I will use less features and set parameters for the TfidfVectorizer, we may have the curse of dimensionality.

In [25]:
nominal_features = ['speaker']
nominal_pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)), 
        ("imputer", MissingNominalImputer(missing_values="nan", strategy="most_frequent")), 
        ("binarizer", FeatureBinarizer([df[feature].dropna().unique() for feature in nominal_features])),
        ])

text_pipeline = Pipeline([
        ("selector", DataFrameSelector('statement')),
        ("vectorizer", TfidfVectorizer(stop_words = 'english', max_df = 0.8, min_df = 0.2))
        ])

union_step = ("union", FeatureUnion([("nominal_pipeline", nominal_pipeline),
                                     ("text_pipeline", text_pipeline)
                                    ]))

ovr_pipeline = Pipeline([ union_step,            
                        ("estimator", LogisticRegression())
                        ])

evaluate('One-vs-rest', ovr_pipeline)

             pants-fire  false  barely-true  half-true  mostly-true  true
pants-fire          201    264           89        367           79    35
false                97    614          247        973          367   188
barely-true          47    484          223        848          358   131
half-true            33    453          226       1131          560   212
mostly-true          25    384          170       1033          563   269
true                 16    320          130        840          514   216
One-vs-rest  accuracy:  0.23199811127724876
One-vs-rest  recall:    0.219464793311105


Doesn't look like an improvement.  
Another idea is to add new features. From the labeled training set, we could compute for each label and each speaker the ratio of pants-fire, false, .. , true statetents with respect to total number of speaker's statenemts.  
For this, we need a step in pipeline which given the dataset will return the numpy 2d array with those ratios.

In [21]:
#the label_col_name must be last value in grouping list
class AddNumericColumnsRatios:
    def __init__(self, label_col_name = 'label', label_domain = label_domain, grouping = ['speaker', 'label']):
        self.label_domain = label_domain
        self.label_col_name = label_col_name
        self.grouping = grouping
        
    def fit(self, X, y=None):
        #ensure all labels are in the dataset by adding a dummy speaker with all possible label values
        X_au = X
        for val in self.label_domain:
            X_au = X_au.append({'speaker' : '__xzasdf__', 'label' : val}, ignore_index=True)
        
        #count number of occurences of each label for each speaker
        #next code line is adapted from https://stackoverflow.com/questions/37003100/pandas-groupby-for-zero-values, https://stackoverflow.com/questions/37077898/pandas-dataframe-how-to-add-column-with-number-of-occurrences-in-other-column
        self.cnt = X_au.groupby(self.grouping)[self.label_col_name].count().unstack(fill_value = 0).stack()

    def transform(self, X):

        res = X.apply(lambda x: self.cnt[x['speaker']] if x['speaker'] in self.cnt.index 
                      else pd.Series({'barely-true':1, 'false':1, 'half-true':1, 'mostly-true':1, 'pants-fire':1, 'true':1})
                      , axis = 1)#assume equal probability for each statement for speakers not known
        
        res['total'] = 0
        
        #make total number of statements
        for val in self.label_domain :
            res['total'] += res[val]
        
        #divide each statement by the total number of statements
        for val in self.label_domain :
            res[val] /= res['total']
        
        #erase the additional 'total' column
        res.drop('total', axis=1, inplace=True)
        
        #return np array
        return res.values
    
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X)
        return self.transform(X)

We will use a SGDClassifier with a different penalty function and bigger alpha.

In [34]:
# The features we want to select
ratios_pip = Pipeline([("ratios_numeric", AddNumericColumnsRatios())])

text_pipeline = Pipeline([
        ("selector", DataFrameSelector('statement')),
        ("vectorizer", TfidfVectorizer(stop_words = 'english'))
        ])

pipeline_ratios = Pipeline([("union", FeatureUnion([("ratios_pipeline", ratios_pip),
                                                    ("text_pipeline", text_pipeline)
                                            ])),
            ("classifier", SGDClassifier(penalty='l1', alpha=4e-3, max_iter=1000))                            
                           ])

evaluate('Ratios pipeline', pipeline_ratios)

             pants-fire  false  barely-true  half-true  mostly-true  true
pants-fire          418    232          131        140           58    56
false               618    530          364        389          307   278
barely-true         433    399          369        354          309   227
half-true           542    377          352        489          503   352
mostly-true         460    330          285        482          474   413
true                419    286          222        325          403   381
Ratios pipeline  accuracy:  0.2094121350436767
Ratios pipeline  recall:    0.22693388006938853


Maybe we ignored too many features and considered the label ratios to be more meaningful than they are. But most probably the parameters for SGDClassifier were not good.
We should now try to use a CountVectorizer instead of a TfidVecetorizer and try to change the solver for logistic regression.

In [23]:
# The features we want to select
nominal_features = ['job', 'state', 'party', 'location']

nominal_pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)), 
        ("imputer", MissingNominalImputer(missing_values="nan", strategy="most_frequent")), 
        ("binarizer", FeatureBinarizer([df[feature].dropna().unique() for feature in nominal_features]))
        ])

text_pipeline = Pipeline([
        ("selector", DataFrameSelector('statement')),
        ("vectorizer", CountVectorizer(stop_words = 'english'))
        ])

union_step = ("union", FeatureUnion([("nominal_pipeline", nominal_pipeline),
                                     ("sbj_pipeline", sbj_pipeline),
                                     ("ratios_pipeline", ratios_pip),
                                     ("text_pipeline", text_pipeline)
                                    ]))

cent_pipeline = Pipeline([union_step,
                          ("estimator", LogisticRegression(multi_class = "multinomial", solver = "lbfgs"))
                        ])

evaluate("With count vectorizer", cent_pipeline)

             pants-fire  false  barely-true  half-true  mostly-true  true
pants-fire          254    292          158        154           87    90
false               217    680          404        452          382   351
barely-true         129    476          407        468          354   257
half-true           116    482          447        618          594   358
mostly-true          76    404          325        578          602   459
true                 77    373          251        394          468   473
With count vectorizer  accuracy:  0.2387660344691902
With count vectorizer  recall:    0.23809179540110023


In [24]:
# The features we want to select
nominal_features = ['speaker', 'job', 'state']

nominal_pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)), 
        ("imputer", MissingNominalImputer(missing_values="nan", strategy="most_frequent")), 
        ("binarizer", FeatureBinarizer([df[feature].dropna().unique() for feature in nominal_features]))
        ])

text_pipeline = Pipeline([
        ("selector", DataFrameSelector('statement')),
        ("vectorizer", CountVectorizer(min_df = 0.3))
        ])

union_step = ("union", FeatureUnion([("nominal_pipeline", nominal_pipeline),
                                     ("text_pipeline", text_pipeline)
                                    ]))

cent_pipeline = Pipeline([union_step,
                          ("classifier", SGDClassifier(loss = "log", alpha = 0.01, max_iter = 1000))])

evaluate("With count vectorizer", cent_pipeline)

             pants-fire  false  barely-true  half-true  mostly-true  true
pants-fire           33    483           29        350          112    28
false                 8    982           77        851          481    87
barely-true           6    709           81        812          422    61
half-true             6    676           78       1086          689    80
mostly-true           4    555           53        950          768   114
true                  0    465           48        795          612   116
With count vectorizer  accuracy:  0.24128433147084286
With count vectorizer  recall:    0.2086905585203386


# Conclusions
The best classifiers were the first ones, which took into account all features.  
Computing ratio of each label for speakers did not help very much. Neither changing the solvers and TfIdfVectorizer with CountVectorizer did much difference.