In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Algorytmy do wykrywania fake news
Porównanie działania dwóch modeli na podstawie algorytmów regresji logistycznej i Passive Aggressive Classifier. Wiadomości zebrane w bazie danych pobranej ze strony kaggle.com zostały przygotowane używając reprezentacji TF-IDF. Po stworzeniu modeli sprawdzono działanie każdego algorytmu używając accuracy_score i confusion matrix.

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


In [3]:
news = train_data["text"].fillna("")
news.head()

0    House Dem Aide: We Didn’t Even See Comey’s Let...
1    Ever get the feeling your life circles the rou...
2    Why the Truth Might Get You Fired October 29, ...
3    Videos 15 Civilians Killed In Single US Airstr...
4    Print \nAn Iranian woman has been sentenced to...
Name: text, dtype: object

In [4]:
labels = train_data["label"]
labels.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(news, labels, test_size=0.33, random_state=44)

# Passive Aggressive Classifier

In [48]:
#tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)
vocab = vectorizer.get_feature_names()

In [37]:
from sklearn.linear_model import PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter = 50)
pac.fit(tfidf_train, y_train)
y_pred = pac.predict(tfidf_test)

In [38]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9672202797202797

In [39]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred, labels=[0, 1])
#[true positives, false positives,
#false negatives, true negatives]

array([[3341,  111],
       [ 114, 3298]], dtype=int64)

# Test PAC na danych testowych

In [10]:
test_news = test_data["text"].fillna('')

In [46]:
test_data.head(10)

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
5,20805,Trump is USA's antique hero. Clinton will be n...,,Trump is USA's antique hero. Clinton will be n...
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori..."
7,20807,Weekly Featured Profile – Randy Shannon,Trevor Loudon,You are here: Home / *Articles of the Bound* /...
8,20808,Urban Population Booms Will Make Climate Chang...,,Urban Population Booms Will Make Climate Chang...
9,20809,,cognitive dissident,don't we have the receipt?


In [40]:
test_matrix = vectorizer.transform(test_news)

In [41]:
y_pred_test = pac.predict(test_matrix)

In [42]:
y_pred_test

array([0, 1, 1, ..., 0, 1, 0], dtype=int64)

In [43]:
answers = pd.read_csv('submit.csv')
test_labels = answers["label"].values
test_labels


array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [44]:
test_acc = accuracy_score(test_labels, y_pred_test)
test_acc

0.6332692307692308

In [45]:
confusion_matrix(test_labels, y_pred_test, labels=[0,1])
#[true positives, false positives,
#false negatives, true negatives]

array([[1527,  812],
       [1095, 1766]], dtype=int64)

# Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(tfidf_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
y_pred_lr = lr.predict(tfidf_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(accuracy_lr)

0.9484265734265734


In [19]:
confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
#[true positives, false positives,
#false negatives, true negatives]

array([[3274,  178],
       [ 176, 3236]], dtype=int64)

# Test LR na danych testowych

In [20]:
y_pred_test_lr = lr.predict(test_matrix)
y_pred_test_lr

array([0, 1, 1, ..., 0, 1, 0], dtype=int64)

In [21]:
test_acc_lr = accuracy_score(test_labels, y_pred_test_lr)
test_acc_lr

0.64

In [22]:
confusion_matrix(test_labels, y_pred_test_lr, labels=[0,1])
#[true positives, false positives,
#false negatives, true negatives]

array([[1536,  803],
       [1069, 1792]], dtype=int64)