In [23]:
import pandas as pd

# Only the article body and its subject are kept from each CSV.
columns = ['text', 'subject']

# Rows from Fake.csv are labelled 0, rows from True.csv are labelled 1.
fake = pd.read_csv('./Dataset/Fake.csv')[columns].assign(label=0)
true = pd.read_csv('./Dataset/True.csv')[columns].assign(label=1)

In [24]:
# Stack both sources row-wise and renumber the rows 0..n-1 in one step.
df = pd.concat([fake, true], axis=0, ignore_index=True)

# Class balance check; displayed as the cell output.
df['label'].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

[CV 1/5] END log_reg__C=10.0, log_reg__penalty=l2, tfidf__ngram_range=(1, 1), tfidf__tokenizer=<function tokenizer at 0x12bdb5000>;, score=0.995 total time=   8.6s
[CV 4/5] END log_reg__C=10.0, log_reg__penalty=l2, tfidf__ngram_range=(1, 1), tfidf__tokenizer=<function tokenizer at 0x12df05000>;, score=0.994 total time=   8.6s
[CV 3/5] END log_reg__C=10.0, log_reg__penalty=l2, tfidf__ngram_range=(1, 1), tfidf__tokenizer=<function tokenizer at 0x12d9d1000>;, score=0.994 total time=   8.6s
[CV 5/5] END log_reg__C=10.0, log_reg__penalty=l2, tfidf__ngram_range=(1, 1), tfidf__tokenizer=<function tokenizer at 0x128fd5000>;, score=0.996 total time=   8.7s
[CV 2/5] END log_reg__C=10.0, log_reg__penalty=l2, tfidf__ngram_range=(1, 1), tfidf__tokenizer=<function tokenizer at 0x12b4d1000>;, score=0.996 total time=   8.9s


In [22]:
# Append the subject to the article text so both feed a single text feature,
# then drop the separate subject column.
# NOTE(review): subject-derived tokens such as 'politicsnews' and 'worldnews'
# carry some of the largest coefficients in the feature listing further down;
# check whether the subject column leaks the label.
df = (
    df
    .assign(text=df['text'] + ' ' + df['subject'])
    .drop(columns='subject')
)

In [4]:
import re
from nltk.stem.porter import PorterStemmer

# def preprocessor(text):
#     clean_text = re.sub(r'[^а-яА-Яa-zA-Z0-9\s]', '', text)
#     return clean_text

def preprocessor(text):
    """Clean a raw document for bag-of-words vectorization.

    Steps, in order:
      1. remove HTML-like tags (``<...>``);
      2. collect emoticons such as ``:)``, ``;-(`` or ``=D`` from the tag-free text;
      3. lower-case the text and collapse every run of non-word characters
         into a single space;
      4. append the collected emoticons (with any ``-`` nose removed) at the end.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned text. When emoticons were found they follow the cleaned
        text, separated from it and from each other by single spaces.
    """
    without_tags = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', without_tags)
    cleaned = re.sub(r'[\W]+', ' ', without_tags.lower())

    if not emoticons:
        return cleaned

    # Without an explicit separator the last word and the first emoticon are
    # fused into one token (e.g. "movie:)") whenever the text ends in a word.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + ' '.join(emoticons).replace('-', '')

def tokenizer(text):
    """Split ``text`` into whitespace-separated tokens.

    Splitting on runs of whitespace (rather than on each single space) keeps
    empty strings out of the result when the text has leading, trailing or
    repeated spaces, so no empty token ends up in the vocabulary.

    Args:
        text: The document to tokenize.

    Returns:
        A list of non-empty token strings; an empty list for blank input.
    """
    return text.split()

# Single stemmer instance shared by every call below.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Tokenize ``text`` with ``tokenizer`` and Porter-stem every token."""
    return list(map(porter.stem, tokenizer(text)))

In [5]:
# Clean every document element-wise with the preprocessor defined above.
df['text'] = df['text'].map(preprocessor)

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np

# df = df.reindex(np.random.permutation(df.index))
# Features are every column except the label; targets are the label column.
x = df.drop(columns=['label']).to_numpy()
y = df['label'].to_numpy()

# 80/20 split, stratified on the label, with a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=0
)

# The feature arrays are 2-D; collapse them to 1-D sequences of documents.
# Assumes a single remaining feature column (the text) — TODO confirm.
x_train, x_test = x_train.reshape(-1), x_test.reshape(-1)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# The vectorizer's own lower-casing, accent stripping and token pattern are
# switched off; the tokenizer is supplied through the parameter grid below.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    token_pattern=None,
)

# TF-IDF features feeding a liblinear logistic regression.
lr_tfidf = Pipeline(steps=[
    ('tfidf', tfidf),
    ('log_reg', LogisticRegression(solver='liblinear')),
])

# A single candidate: unigrams, plain tokenizer, L2 penalty with C=10.
param_grid = [
    {
        'tfidf__ngram_range': [(1, 1)],
        'tfidf__tokenizer': [tokenizer],
        'log_reg__penalty': ['l2'],
        'log_reg__C': [10.0],
    },
]

# 5-fold cross-validated search scored on accuracy, using all CPU cores;
# any failing fit raises instead of being scored.
gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=5,
    error_score='raise',
)

In [15]:
# Run the cross-validated search on the training split only.
gs_lr_tfidf.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [16]:
# Best pipeline found by the search (refit on the training data it was given).
clf = gs_lr_tfidf.best_estimator_

# Cross-validation summary followed by accuracy on the held-out test split.
print(
    f'Best parameters: {gs_lr_tfidf.best_params_}',
    f'Best score: {gs_lr_tfidf.best_score_:.3f}',
    f'Test accuracy: {clf.score(x_test, y_test):.3f}',
    sep='\n',
)

Best parameters: {'log_reg__C': 10.0, 'log_reg__penalty': 'l2', 'tfidf__ngram_range': (1, 1), 'tfidf__tokenizer': <function tokenizer at 0x12a06b2e0>}
Best score: 0.995
Test accuracy: 0.997


In [17]:
# Pair every vocabulary entry with its logistic-regression coefficient.
vectorizer = clf.named_steps['tfidf']
feature_names = vectorizer.get_feature_names_out()
coefficients = clf.named_steps['log_reg'].coef_.flatten()
sorted_indices = np.argsort(coefficients)  # ascending: most negative first
top_n = 20

# Describe the features by the n-gram range the fitted vectorizer actually
# used, so the heading cannot disagree with the model (the previous hard-coded
# "2-grams" did not match the (1, 1) range in the grid).
min_n, max_n = vectorizer.ngram_range
ngram_label = f'{min_n}-grams' if min_n == max_n else f'{min_n}- to {max_n}-grams'

# Largest positive coefficients push towards label 1 (TRUE).
print(f'Top {top_n} important {ngram_label} for TRUE news:\n')
for i in sorted_indices[-top_n:]:
    print(f'{feature_names[i]}: {coefficients[i]:.2f}')

# Most negative coefficients push towards label 0 (FAKE).
print(f'\n\nTop {top_n} important {ngram_label} for FAKE news:\n')
for i in sorted_indices[:top_n]:
    print(f'{feature_names[i]}: {coefficients[i]:.2f}')

Top 20 important 2-grams for TRUE news:

i: 4.69
a: 4.96
comment: 5.13
some: 5.16
tuesday: 5.20
minister: 5.55
wednesday: 5.56
saying: 5.75
had: 5.98
in: 6.23
edt: 6.38
nov: 6.49
republican: 7.25
washington: 7.34
u: 11.04
on: 12.60
worldnews: 22.73
said: 24.89
politicsnews: 32.03
reuters: 40.89


Top 20 important 2-grams for FAKE news:

via: -20.15
politics: -19.12
news: -18.70
read: -10.81
left: -10.33
t: -9.96
us: -9.91
this: -8.74
image: -8.50
featured: -8.41
just: -8.36
gop: -7.99
mr: -7.81
com: -7.69
hillary: -7.51
watch: -6.59
that: -6.35
sen: -6.32
america: -6.30
obama: -6.18


In [198]:
# Predict on the held-out split and keep only the rows the model got wrong.
pred = gs_lr_tfidf.predict(x_test)
mask = pred != y_test
x_misclassified, y_misclasified = x_test[mask], pred[mask]

# Show the second misclassified document together with the label the model
# predicted for it (not the true label).
print(x_misclassified[1], y_misclasified[1])

remember when the us had a president who took the lead in the war on terror russia has carried out a series of deadly airstrikes against the terrorist group and vladimir putin has now sent the country s most elite special forces team into the war zone and speculation is heightening that offensive will be bolstered by the china s people s liberation army following a number of reports of military movements in the region backed up by strong words from a senior government member at a united nations meeting reports emanating from the middle east last week said china was planning on joining the fight against isis in the coming weeks according to a syrian army official while beijing insists it will abide by the united nations un in the region hints of an action were backed up when it spoke strongly about a coordinated response to the rising terrorist threat speaking of the syrian crisis china s foreign minister wang yi said at the un security council session in new york the world cannot affor