In [47]:
import re
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegressionCV

In [48]:
df = pd.read_csv('corona_fake.csv')

In [49]:
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1164 entries, 0 to 1163
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   1082 non-null   object
 1   text    1154 non-null   object
 2   source  1144 non-null   object
 3   label   1159 non-null   object
dtypes: object(4)
memory usage: 36.5+ KB


In [51]:
df.title.fillna('missing', inplace=True)
df.source.fillna('missing', inplace=True)
df.text.fillna(df.title, inplace=True)

In [52]:
df.label.value_counts()

TRUE    584
Fake    345
fake    230
Name: label, dtype: int64

In [53]:
df.loc[~df['source'].str.contains('^http',flags = re.I, regex=True)].source.value_counts()

Facebook                                    49
missing                                     20
YouTube                                     13
facebook                                    12
twitter                                      8
HealingOracle.ch                             5
Youtube                                      5
Tin woodman                                  2
JoanneWrightForCongress                      2
Instagram                                    2
infowars.com                                 2
gurunanda.com                                1
www.purevitalsilver.com                      1
coronavirusmedicalkit.com                    1
cdc.gov                                      1
herbalamy.com                                1
canada.ca                                    1
To Vaccinate Or Not To Vaccinate             1
vivifyholistic.ca                            1
strategic-culture.org                        1
jimbakkershow.com                            1
utro.ru      

In [54]:
df.source = df.source.apply(lambda x: x.lower())

In [55]:
df.loc[~df['source'].str.contains('^http',flags = re.I, regex=True)].source.value_counts().head()

facebook            61
missing             20
youtube             18
twitter              8
healingoracle.ch     5
Name: source, dtype: int64

In [56]:
df.loc[5]

title     CORONA UNMASKED: Chinese Intelligence Officer ...
text      CORONA UNMASKED: Chinese Intelligence Officer ...
source                                              missing
label                                                   NaN
Name: 5, dtype: object

In [57]:
df.loc[5]['label'] = 'FAKE'
df.loc[15]['label'] = 'TRUE'
df.loc[43]['label'] = 'FAKE'
df.loc[131]['label'] = 'TRUE'
df.loc[242]['label'] = 'FAKE'

In [58]:
df.label = df.label.apply(lambda x: x.lower())

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1164 entries, 0 to 1163
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   1164 non-null   object
 1   text    1164 non-null   object
 2   source  1164 non-null   object
 3   label   1164 non-null   object
dtypes: object(4)
memory usage: 36.5+ KB


In [60]:
df['title_text'] = df['title'] + ' ' + df['text']

In [61]:
df = df.sample(frac=1).reset_index(drop=True)
df['title_text'][10]

'VACCINES DON’T HEAL; THEIR PRODUCTION IS PART OF THE AGENDA FOR A NEW WORLD ORDER Cooperation instead of competition, doesn’t occur in the west. It’s all profit-driven. With a number of different vaccines from different pharma giants coming on the market, who will tell the patient which one is the best, most suitable for the patient’s condition? It smells like an utter chaotic scam.The real question is – are vaccines – or a vaccine – even necessary?  Maybe – maybe not. The production of vaccines is pushed for profit motives and for an important political agenda for a New World Order – that has been planned to change human life as we know it, or thought we knew it.Vaccines don’t heal, they may prevent the virus from hitting as hard as it might otherwise do, or not at all, depending on the age, physical and health condition of a person. Worldwide statistics show that usually a person up to the age of 40 or 50, who is infected by the COVID-19, has none or only slight symptoms, nothing to

In [62]:
def preprocessor(text):
    
    text = re.sub('<[^>]*>', '', text)
    text = re.sub(r'[^\w\s]','', text)
    text = text.lower()

    return text

In [63]:
df['title_text'] = df['title_text'].apply(preprocessor)

In [65]:
df['title_text'][10]

'vaccines dont heal their production is part of the agenda for a new world order cooperation instead of competition doesnt occur in the west its all profitdriven with a number of different vaccines from different pharma giants coming on the market who will tell the patient which one is the best most suitable for the patients condition it smells like an utter chaotic scamthe real question is  are vaccines  or a vaccine  even necessary  maybe  maybe not the production of vaccines is pushed for profit motives and for an important political agenda for a new world order  that has been planned to change human life as we know it or thought we knew itvaccines dont heal they may prevent the virus from hitting as hard as it might otherwise do or not at all depending on the age physical and health condition of a person worldwide statistics show that usually a person up to the age of 40 or 50 who is infected by the covid19 has none or only slight symptoms nothing to worry about'

In [66]:
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [67]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        tokenizer=tokenizer_porter,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)
X = tfidf.fit_transform(df['title_text'])
y = df.label.values

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.5, shuffle=False)

clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, verbose=3, max_iter=300).fit(X_train, y_train)

fake_news_model = open('fake_news_model.sav', 'wb')
pickle.dump(clf, fake_news_model)
fake_news_model.close()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   17.1s remaining:   25.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.5s finished


In [95]:
filename = 'fake_news_model.sav'
saved_clf = pickle.load(open(filename, 'rb'))

saved_clf.score(X_test, y_test)

0.9243986254295533

In [96]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = clf.predict(X_test)
print("---Test Set Results---")
print("Accuracy with logreg: {}".format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

---Test Set Results---
Accuracy with logreg: 0.9243986254295533
              precision    recall  f1-score   support

        fake       0.91      0.93      0.92       278
        true       0.94      0.92      0.93       304

    accuracy                           0.92       582
   macro avg       0.92      0.92      0.92       582
weighted avg       0.92      0.92      0.92       582

