### Imports

In [583]:
import pandas as pd
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import nltk
from nltk.corpus import words
import itertools
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time
from collections import Counter

from sklearn.model_selection import train_test_split    # splitting the data 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [584]:
def word_freq(dataframe):
    # get all words appearing in reliable articles
    reliable_words = [word for list in list(itertools.chain(dataframe.loc[dataframe['type'] == 'reliable'].content)) for word in list]
    # get all words appearing in fake articles
    fake_words = [word for list in list(itertools.chain(dataframe.loc[dataframe['type'] == 'fake'].content)) for word in list]
    # count words in both lists
    count_reliable = Counter(reliable_words)
    count_fake = Counter(fake_words)
    return pd.DataFrame({'reliable': pd.Series(count_reliable), 'fake': pd.Series(count_fake)})

### Loading the dataset

In [585]:
import pandas as pd
import regex as re
from cleantext import clean
data = pd.read_csv("data/sample_preprocessed_ver_2023-03-21-1679410790.csv", index_col=0)

In [586]:
# convert the strings to lists
pattern = re.compile(r"\w+")
def string_to_list(s):
    return pattern.findall(s)
data['content'] = data['content'].apply(string_to_list)

In [587]:
len(data)

59170

In [589]:
data.head()

Unnamed: 0,level_0,index,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0.0,0,63.0,79,awm.com,fake,http://awm.com/heres-the-video-causing-people-...,"[staff, australian, reptil, park, realiz, spid...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Here’s The Video Causing People To Gag And Run...,Alexander Smith,,[''],,,,
1.0,1,79.0,97,betootaadvocate.com,fake,http://www.betootaadvocate.com/advocate-in-foc...,"[loui, burk, cultur, contact, begin, privat, c...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Turnbull Celebrates Just What It Is To Be Aust...,,,[''],,,,
2.0,2,296.0,348,barenakedislam.com,fake,http://barenakedislam.com/2018/01/24/michigani...,"[isnt, best, kind, muslim, student, organ, thi...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,MICHIGANISTAN: Designated terrorist group CAIR...,"Kab Ashraf, Don Spilman, Kristi Ann, Linda Riv...",,[''],,,,
3.0,3,356.0,419,familysecuritymatters.org,fake,http://www.familysecuritymatters.org/publicati...,"[islam, nation, domin, list, countri, danger, ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Family Security Matters,,,[''],,"Islam, Religion, TAXES, SANCTUARY CITIES, Immi...",,
4.0,4,612.0,694,familysecuritymatters.org,fake,http://www.familysecuritymatters.org/publicati...,"[profess, christ, becom, hate, crime, west, da...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Family Security Matters,,,[''],,"Islam, Religion, TAXES, SANCTUARY CITIES, Immi...",,


### Simple Model - Logistic Regression

In [590]:
X_train, X_val, y_train, y_val = train_test_split(data['content'], data['type'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [591]:
vectorizer = CountVectorizer(analyzer=lambda x : x, max_features=170000)
vectorizer.fit(iter(X_train))

In [592]:
X_train.shape

(47336,)

In [593]:
vectorizer.get_feature_names_out()

array(['a', 'aa', 'aaa', ..., '️tat', '️tom', '️url'], dtype=object)

In [594]:
X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

In [595]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

In [596]:
pred = model.predict(X_val)

In [597]:
accuracy_score(y_val, pred)

0.8044617204664526

In [598]:
from sklearn.metrics import f1_score

# y_val er pandas series, så omdanner også til ndarray
# positive = reliable
f1_score(pd.Series.to_numpy(y_val), pred_val, pos_label='reliable')

0.789561170212766

### Advanced Model - Naive Bayes

In [599]:
X_train, X_val, y_train, y_val = train_test_split(data['content'], data['type'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [600]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer=lambda x : x, max_features=170000)
vectorizer.fit(X_train)

In [601]:
X_train.shape

(47336,)

In [602]:
X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

In [603]:
vectorizer.get_feature_names_out()

array(['a', 'aa', 'aaa', ..., '️tat', '️tom', '️url'], dtype=object)

In [604]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

In [605]:
pred_val = model.predict(X_val)

In [606]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, pred_val)

0.7860402230860233

In [607]:
from sklearn.metrics import f1_score

# y_val er pandas series, så omdanner også til ndarray
# positive = reliable
f1_score(pd.Series.to_numpy(y_val), pred_val, pos_label='reliable')

0.789561170212766