## Importando Dataframe de Notícias

##### Alex

In [64]:
import pandas as pd

In [65]:
df = pd.read_csv('fakenews1.csv', sep=',')

In [66]:
df2 = pd.read_csv('fakenews2.csv', sep=',')

In [67]:
df_filter = df[['title', 'real']]

In [68]:
df2 = df2.dropna().reset_index()
df2 = df2.rename(columns={'label': 'real'})


In [69]:
df2_filter = df2[['title', 'real']]


In [70]:
df2_filter

Unnamed: 0,title,real
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
2,"Bobby Jindal, raised Hindu, uses story of Chri...",0
3,SATAN 2: Russia unvelis an image of its terrif...,1
4,About Time! Christian Group Sues Amazon and SP...,1
...,...,...
71532,Russians steal research on Trump in hack of U....,0
71533,WATCH: Giuliani Demands That Democrats Apolog...,1
71534,Migrants Refuse To Leave Train At Refugee Camp...,0
71535,Trump tussle gives unpopular Mexican leader mu...,0


### Juntando os DataFrames

In [71]:
noticias = pd.concat([df_filter, df2_filter], axis=0)

In [72]:
noticias

Unnamed: 0,title,real
0,Kandi Burruss Explodes Over Rape Accusation on...,1
1,People's Choice Awards 2018: The best red carp...,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,1
3,Colombian singer Maluma sparks rumours of inap...,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,1
...,...,...
71532,Russians steal research on Trump in hack of U....,0
71533,WATCH: Giuliani Demands That Democrats Apolog...,1
71534,Migrants Refuse To Leave Train At Refugee Camp...,0
71535,Trump tussle gives unpopular Mexican leader mu...,0


In [73]:
noticias.to_csv('noticias.csv', sep=';')

### Processamento com NLTK

In [74]:
import re, string, nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

#nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
tb = TreebankWordTokenizer()

def preprocess(text):
    if not isinstance(text, str):
        text = '' if text is None else str(text)

    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)   # URLs
    text = re.sub(r'<.*?>', ' ', text)              # HTML
    tokens = tb.tokenize(text)

    # limpa tokens: tira pontuação e stopwords, mantém só letras com len>2
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]

    # stemming
    tokens = [stemmer.stem(t) for t in tokens]
    return tokens


In [75]:
print(preprocess("Breaking News: This shocking claim is 100% true!!!"))


['break', 'news', 'shock', 'claim', 'true']


In [76]:
def tokens_to_features(tokens):
    return {word: True for word in tokens}

In [77]:
dataset = [(tokens_to_features(preprocess(text)), label) 
           for text, label in zip(noticias['title'], df['real'])]

## Modelo de classificação usando NLTK

In [78]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(dataset, test_size=0.3, random_state=42)


In [None]:
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

classifier = NaiveBayesClassifier.train(train_set)

print("Acurácia:", accuracy(classifier, test_set))
classifier.show_most_informative_features(10)


Acurácia: 0.7932174162954447
Most Informative Features
                    neri = True                0 : 1      =     36.9 : 1.0
                   oxman = True                0 : 1      =     32.9 : 1.0
                 caitlyn = True                0 : 1      =     28.2 : 1.0
                    pitt = True                0 : 1      =     27.0 : 1.0
                  debunk = True                0 : 1      =     25.7 : 1.0
                    brad = True                0 : 1      =     23.5 : 1.0
              scientolog = True                0 : 1      =     21.0 : 1.0
                    foxx = True                0 : 1      =     18.8 : 1.0
                    dump = True                0 : 1      =     18.3 : 1.0
                     beg = True                0 : 1      =     18.2 : 1.0


In [80]:
exemplo = "Breaking news: Scientists found proof that aliens built the pyramids!"
tokens = preprocess(exemplo)
features = tokens_to_features(tokens)
print("Classificação:", classifier.classify(features))


Classificação: 0


## Modelo de classificação com regressão

##### Arthur