### Imports

In [1]:
import gc
import itertools
import time
from collections import Counter
from multiprocessing import Pool

import nltk
import numpy as np
import pandas as pd
import pyarrow.feather as feather
import regex as re
from cleantext import clean
from nltk.corpus import stopwords
from nltk.corpus import words
# NOTE(review): keep this star import after `import regex as re`, as in the
# original ordering, so the names it may rebind stay the same.
from nltk.stem.porter import *

from sklearn.model_selection import train_test_split    # splitting the data 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

### Loading the dataset

In [2]:
# NOTE(review): these three imports repeat the imports cell. The star import
# `from nltk.stem.porter import *` in that cell may rebind `re` (if that module
# exposes a name `re`), so confirm `re` still refers to `regex` before dropping
# the re-import here.
import pandas as pd
import regex as re
from cleantext import clean
# Load the preprocessed sample; the first CSV column is used as the row index.
raw_data = pd.read_csv("data/sample_preprocessed_ver_2023-03-21-1679398852.csv", index_col=0)

In [3]:
# convert the strings to lists
pattern = re.compile(r"\w+")
def string_to_list(s):
    """Split a serialized token string into a list of tokens.

    Every maximal run of word characters in ``s`` becomes one token.

    Parameters
    ----------
    s : str | list | Any
        The value of one 'content' cell.

    Returns
    -------
    list
        The tokens found in ``s``. A value that is already a list is returned
        unchanged, so applying the conversion twice is harmless. Any other
        non-string value (for example the NaN that pandas uses for an empty
        CSV cell) yields an empty list instead of raising ``TypeError``.
    """
    if isinstance(s, list):
        return s
    if not isinstance(s, str):
        return []
    return pattern.findall(s)
# Replace the 'content' column with the parsed token lists.
raw_data['content'] = raw_data['content'].apply(string_to_list)

In [4]:
# Number of rows in the loaded sample.
len(raw_data)

3109

In [5]:
def fix_labels(df):
    """Collapse the fine-grained 'type' labels into 'fake' / 'reliable'.

    'political' is mapped to 'reliable'; 'junksci', 'bias', 'satire',
    'conspiracy', 'rumor', 'unreliable', 'clickbait' and 'hate' are mapped to
    'fake'. Labels not listed in the mapping are kept as they are.

    Rows whose label is missing, 'unknown', or the literal string 'type' are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'type' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the relabeled 'type' column and only the usable rows;
        the original index labels and column order are preserved.
    """
    label_map = {
        'political': 'reliable',
        'junksci': 'fake',
        'bias': 'fake',
        'satire': 'fake',
        'conspiracy': 'fake',
        'rumor': 'fake',
        'unreliable': 'fake',
        'clickbait': 'fake',
        'hate': 'fake',
    }
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    relabeled = df.assign(type=df['type'].replace(label_map))
    labels = relabeled['type']
    usable = labels.notnull() & ~labels.isin(['unknown', 'type'])
    return relabeled[usable]

In [6]:
def labelperc(df):
    """Return the percentage of rows carrying each label in ``df.type``.

    The result is a plain dict mapping each distinct label to its share of
    ``len(df)`` expressed in percent, with keys in order of first appearance.
    """
    total = len(df)
    counts = Counter(df.type)
    return {label: count / total * 100 for label, count in counts.items()}

In [7]:
# Map the fine-grained labels onto 'fake'/'reliable' and drop rows whose label is unknown or missing.
data = fix_labels(raw_data)

In [8]:
# Percentage of rows per label, to see how balanced the classes are.
print(labelperc(data))

{'fake': 52.94306851077517, 'reliable': 47.05693148922483}


In [9]:
# Preview the first rows of the relabeled frame.
data.head()

Unnamed: 0,level_0,index,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0,7024,8011,beforeitsnews.com,fake,http://beforeitsnews.com/tea-party/2017/12/ame...,"[american, told, surrend, gun, reader, think, ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Americans Are Being Told to Surrender Their Guns,Freedom Bunker,,[''],,,,
1,1,7539,8580,canadafreepress.com,fake,http://canadafreepress.com/article/why-arent-t...,"[obama, rate, continu, deterior, even, within,...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Why Aren’t the Republicans Running Away with t...,"A.J. Cameron, Because Without America, There I...",,[''],,,,
2,2,1715,13230,beforeitsnews.com,fake,http://beforeitsnews.com/tea-party/2018/01/eco...,"[economi, beat, forecast, jobless, claim, num,...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Economy Beats Forecasters Again: Jobless Claim...,,,[''],,,,
3,3,1779,13301,beforeitsnews.com,fake,http://beforeitsnews.com/tea-party/2018/01/wap...,"[wapo, editori, war, russia, reader, think, st...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,WaPo’s Editorial War on Russia,Freedom Bunker,,[''],,,,
4,4,2299,13888,washingtonexaminer.com,reliable,http://www.washingtonexaminer.com/in-wake-of-m...,"[matt, lauer, year, alleg, sexual, harass, wen...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"In wake of Matt Lauer's firing, NBC reportedly...",Emily Jashinsky,,"['Matt Lauer', 'Emily Jashinsky', 'Sexual Assa...",The network's painfully specific guidelines on...,,,


In [10]:
# Show the token list of the first remaining article. Positional access is used
# because fix_labels drops rows, so index label 0 is not guaranteed to exist.
data['content'].iloc[0]

['american',
 'told',
 'surrend',
 'gun',
 'reader',
 'think',
 'stori',
 'fact',
 'add',
 'two',
 'cent',
 'headlin',
 'bitcoin',
 'blockchain',
 'search',
 'exceed',
 'trump',
 'blockchain',
 'stock',
 'next',
 'dear',
 'black',
 'bag',
 'confidenti',
 'reader',
 'week',
 'must',
 'read',
 'articl',
 'touch',
 'mani',
 'topic',
 'close',
 'heart',
 'second',
 'amend',
 'right',
 'better',
 'home',
 'secur',
 'privaci',
 'govern',
 'prepar',
 'global',
 'warfar',
 'strive',
 'self',
 'suffici',
 'lot',
 'great',
 'inform',
 'pack',
 'rundown',
 'wast',
 'time',
 'let',
 'get',
 'busi',
 'num',
 'hawaii',
 'polic',
 'order',
 'peopl',
 'medic',
 'pot',
 'surrend',
 'gun',
 'state',
 'wherev',
 'stand',
 'issu',
 'medic',
 'marijuana',
 'absolut',
 'alarm',
 'unit',
 'state',
 'law',
 'enforc',
 'entiti',
 'tri',
 'coerc',
 'american',
 'citizen',
 'give',
 'second',
 'amend',
 'right',
 'back',
 'februari',
 'ran',
 'articl',
 'discuss',
 'whether',
 'convict',
 'felon',
 'right',
 'be

### Simple Model - Logistic Regression

In [11]:
# First hold out 20% of the articles ...
X_train, X_val, y_train, y_val = train_test_split(
    data['content'],
    data['type'],
    test_size=0.2,
    random_state=0,
)
# ... then cut that holdout in half to obtain the validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=0.5,
    random_state=0,
)

In [12]:
# Each document is already a list of tokens, so the analyzer simply passes it
# through; the vocabulary is limited to 4000 features.
vectorizer = CountVectorizer(
    analyzer=lambda tokens: tokens,
    max_features=4000,
)
vectorizer.fit(X_train)

In [13]:
# Turn every split into a count matrix. The token-list splits are overwritten,
# so re-run the split cell before running this cell again.
X_train, X_val, X_test = (
    vectorizer.transform(split) for split in (X_train, X_val, X_test)
)

In [14]:
# Sanity check: (number of training documents, vocabulary size).
X_train.shape

(2487, 4000)

In [15]:
# Fit a logistic regression on the training counts.
# NOTE(review): max_iter=10000 is presumably there to avoid convergence
# warnings on the unscaled count features — confirm.
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

In [16]:
# Predict labels for the validation split.
pred = model.predict(X_val)

In [17]:
# Fraction of validation articles whose label was predicted correctly.
accuracy_score(y_val, pred)

0.7459807073954984

### Simple Model - Naive Bayes

In [18]:
# Rebuild the token-list splits (same seed and proportions as for the logistic
# regression), because the earlier X_* names now hold count matrices.
X_train, X_val, y_train, y_val = train_test_split(
    data['content'],
    data['type'],
    test_size=0.2,
    random_state=0,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=0.5,
    random_state=0,
)

In [19]:
# CountVectorizer is already imported in the imports cell, so it is not
# re-imported here. Same pass-through analyzer as for the logistic regression,
# but with a larger vocabulary limit (10000 instead of 4000).
vectorizer = CountVectorizer(analyzer=lambda x : x, max_features=10000)
vectorizer.fit(X_train)

In [20]:
# Turn every split into a count matrix. The token-list splits are overwritten,
# so re-run the split cell before running this cell again.
X_train, X_val, X_test = (
    vectorizer.transform(split) for split in (X_train, X_val, X_test)
)

In [21]:
# Inspect the vocabulary learned by the vectorizer.
vectorizer.get_feature_names_out()

array(['___', 'aaron', 'abandon', ..., 'я', 'является', 'яндекса'],
      dtype=object)

In [22]:
# Fit a multinomial Naive Bayes classifier on the training counts.
# MultinomialNB is imported in the imports cell at the top of the notebook.
model = MultinomialNB()
model.fit(X_train, y_train)

In [23]:
# Predict labels for the validation split.
pred_val = model.predict(X_val)

In [24]:
# accuracy_score is already imported in the imports cell, so it is not
# re-imported here. Fraction of validation articles labelled correctly.
accuracy_score(y_val, pred_val)

0.7588424437299035