# Predictor with complexity and stylometric/lexical features

1. Load model
2. Extract features
3. Predictions

## 1. Load the model

In [1]:
import pickle

# NOTE(security): unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the file handle as soon as the model has been read.
with open('../predictors/fake_news_predictorv2.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=1, nthread=1,
              num_parallel_tree=1, random_state=43, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=0.9775967413441956, seed=43, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## 2. Extract features

We are going to use the latest feature extractor located in /feature_extraction, in this case version v4. This extractor does the following:

- Clean the text from the headline and the news content
- Extract complexity features from headline
- Extract complexity features from news text content
- Extract stylometric features from news text content

In [2]:
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import spacy
from lexical_diversity import lex_div as ld
from nltk import FreqDist

def _clean_text(raw_text):
    """Strip URLs and @-mentions, unwrap single line breaks and lower-case ``raw_text``."""
    # ``str.replace`` matches its first argument literally, so the regular-expression
    # patterns have to go through ``re.sub`` to have any effect.
    cleaned = re.sub(r"http\S+", "", raw_text)
    cleaned = cleaned.replace("http", "")
    cleaned = re.sub(r"@\S+", "", cleaned)
    # A lone newline becomes a space; runs of two or more newlines are left alone.
    cleaned = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned)
    return cleaned.lower()


def _token_stats(doc):
    """Walk ``doc`` sentence by sentence; return (sentence count, token texts, POS labels, tags)."""
    n_sents = 0
    texts, pos_labels, tags = [], [], []
    for sentence in doc.sents:
        n_sents += 1
        for token in sentence:
            texts.append(token.text)
            pos_labels.append(token.pos_)
            tags.append(token.tag_)
    return n_sents, texts, pos_labels, tags


def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    return (part / whole) * 100 if whole else 0.0


def _mean_length(tokens):
    """Return the average character length of ``tokens``; 0.0 for an empty list."""
    return sum(len(tok) for tok in tokens) / len(tokens) if tokens else 0.0


def get_news_features(headline, text, nlp=None):
    """Build the one-row feature frame for a news item.

    Both strings are cleaned (URLs and @-mentions removed, single line breaks
    unwrapped, lower-cased) and tokenised with spaCy. Complexity features are
    computed for the headline and the body; POS/tag ratios are computed for
    the body only.

    NOTE(review): the cleaning patterns are now applied as regular expressions.
    Confirm the extractor used at training time did the same, so the features
    stay aligned with the pickled model.

    Parameters
    ----------
    headline : str
        Headline of the news item.
    text : str
        Body of the news item.
    nlp : callable, optional
        An already loaded spaCy pipeline. When omitted, ``es_core_news_lg`` is
        loaded on every call, which is slow; pass a shared instance to avoid it.

    Returns
    -------
    pandas.DataFrame
        A single row with 22 columns, from ``n_sents`` to ``sym_ratio``, in a
        fixed order. Averages, ratios and diversity scores are 0.0 when the
        corresponding input yields no tokens.
    """
    if nlp is None:
        nlp = spacy.load('es_core_news_lg')

    ## headline ##
    _, list_tokens_h, _, _ = _token_stats(nlp(_clean_text(headline)))
    fdist_h = FreqDist(list_tokens_h)

    # headline complexity features
    # NOTE(review): a headline TTR is not among the output columns; confirm
    # against the trained model's feature list before adding one.
    n_words_h = len(list_tokens_h)
    word_size_h = _mean_length(list_tokens_h)
    unique_words_h = _percentage(len(fdist_h.hapaxes()), n_words_h)
    mltd_h = ld.mtld(list_tokens_h) if list_tokens_h else 0.0

    ## text content ##
    n_sents, list_tokens, list_pos, list_tag = _token_stats(nlp(_clean_text(text)))
    n_pos = Counter(list_pos)
    n_tag = Counter(list_tag)
    fdist = FreqDist(list_tokens)

    # complexity features
    n_words = len(list_tokens)
    avg_word_sentences = float(n_words) / n_sents if n_sents else 0.0
    word_size = _mean_length(list_tokens)
    unique_words = _percentage(len(fdist.hapaxes()), n_words)
    ttr = ld.ttr(list_tokens) * 100 if list_tokens else 0.0
    mltd = ld.mtld(list_tokens) if list_tokens else 0.0

    # lexical features
    n_quotes = n_tag['PUNCT__PunctType=Quot']
    quotes_ratio = _percentage(n_quotes, n_words)
    propn_ratio = _percentage(n_pos['PROPN'], n_words)
    noun_ratio = _percentage(n_pos['NOUN'], n_words)
    adp_ratio = _percentage(n_pos['ADP'], n_words)
    det_ratio = _percentage(n_pos['DET'], n_words)
    punct_ratio = _percentage(n_pos['PUNCT'], n_words)
    pron_ratio = _percentage(n_pos['PRON'], n_words)
    verb_ratio = _percentage(n_pos['VERB'], n_words)
    adv_ratio = _percentage(n_pos['ADV'], n_words)
    # NOTE(review): SYM is counted from the fine-grained tags, unlike the other
    # ratios which use the coarse POS labels; kept as-is, confirm it is intended.
    sym_ratio = _percentage(n_tag['SYM'], n_words)

    # create df (column names and order are part of the contract with the model)
    df_features = pd.DataFrame({
        'n_sents': [n_sents], 'n_words': [n_words], 'avg_words_sents': [avg_word_sentences],
        'word_size': [word_size], 'unique_words': [unique_words], 'ttr': [ttr], 'mltd': [mltd],
        'n_words_h': [n_words_h], 'word_size_h': [word_size_h], 'unique_words_h': [unique_words_h],
        'mltd_h': [mltd_h], 'n_quotes': [n_quotes], 'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio], 'noun_ratio': [noun_ratio], 'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio], 'punct_ratio': [punct_ratio], 'pron_ratio': [pron_ratio],
        'verb_ratio': [verb_ratio], 'adv_ratio': [adv_ratio], 'sym_ratio': [sym_ratio],
    })

    return df_features

## 3. Predictions

In [17]:
# NOTE(review): this cell expects `model` and `df_features` to already exist in the kernel.
X_predict = df_features

# Label 0 is reported as fake, anything else as legitimate; the percentage comes from
# predict_proba column 0 in the first case and column 1 in the second.
predicted_label = model.predict(X_predict)[0]
if predicted_label == 0:
    fake_pct = (model.predict_proba(X_predict)[0][0]) * 100
    print('This is a fake new \nWith a probability: %.0f' % fake_pct, '%')
else:
    legit_pct = (model.predict_proba(X_predict)[0][1]) * 100
    print('This is a legitimate new\nWith a probability: %.0f' % legit_pct, '%')

This is a legitimate new
With a probability: 98 %


## Full program

In [4]:
%%time

import pandas as pd
import numpy as np

import nltk
import spacy
from nltk import FreqDist
from lexical_diversity import lex_div as ld
import pickle

# NOTE(security): unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the file handle as soon as the model has been read.
with open('../predictors/fake_news_predictorv2.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# NOTE(review): this pipeline object is not passed to get_news_features in the call below.
nlp = spacy.load('es_core_news_lg')

# Both pieces of the news item are read interactively from the user.
headline = input("New's headline text: ")
text = input("New's content text: ")

df_features = get_news_features(headline, text)

########## PREDICTIONS ##########
X_predict = df_features

# Label 0 is reported as fake, anything else as legitimate; the percentage comes from
# predict_proba column 0 in the first case and column 1 in the second.
if (model.predict(X_predict)[0]) == 0:
    print('This is a fake new! \nWith a probability: %.0f' % ((model.predict_proba(X_predict)[0][0])*100), '%')
else:
    print('This is a legitimate! new\nWith a probability: %.0f' % ((model.predict_proba(X_predict)[0][1])*100), '%')

New's headline text: Los estudiantes convocan huelga contra Isabel Celaá e instan a los ministros de Podemos a apoyar el paro "por coherencia"
New's content text: En este otoño caliente para la escuela por rebrotes de coronavirus habrá padres que no lleven a sus hijos a clase porque tienen miedo, profesores que se ausentarán de sus tareas para protestar contra las medidas de autonomías como Madrid, Andalucía o Murcia y estudiantes que se plantan para denunciar "la inacción" y "la desidia" de la ministra Isabel Celaá. La comunidad educativa considera que las medidas que este jueves cerrarán el Gobierno y las CCAA "llegan tarde".  El Sindicato de Estudiantes ha convocado este miércoles cuatro días de huelga en toda España en las etapas de ESO, Bachillerato y FP. A diferencia de los sindicatos de docentes, apuntan directamente al Gobierno central porque "ha echado balones fuera y ha descargado toda la tarea sobre las consejerías": "La última responsabilidad la tiene el Ministerio", dicen.