### Imports

In [338]:
import pandas as pd
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import nltk
from nltk.corpus import words
import itertools
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time
from collections import Counter

from sklearn.model_selection import train_test_split    # splitting the data 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

### Loading the dataset

In [339]:
import pandas as pd
import regex as re
from cleantext import clean
raw_data = pd.read_csv("data/sample_preprocessed_ver_2023-03-21-1679408688.csv", index_col=0)

In [340]:
# convert the strings to lists
pattern = re.compile(r"\w+")
def string_to_list(s):
    return pattern.findall(s)
raw_data['content'] = raw_data['content'].apply(string_to_list)

In [341]:
len(raw_data)

29731

In [342]:
def fix_labels(df): 
    df.type = df.type.replace({'political': 'reliable', 'junksci': 'fake', 'bias' : 'fake', 'satire': 'fake', 'conspiracy': 'fake', 'rumor': 'fake', 'unreliable' : 'fake', 'clickbait': 'fake', 'hate': 'fake'})
    df = df[df.type != 'unknown']
    df = df[df.type != 'type']
    df = df[df.type.notnull()]
    return df

In [343]:
def labelperc(df): 
    labeldict = {}
    for i in df.type: 
        if i in labeldict: 
            labeldict[i] +=1
        else: 
            labeldict[i] = 1
    for i in labeldict: 
        labeldict[i] = labeldict[i]/len(df)*100
    return labeldict

In [344]:
data = fix_labels(raw_data)

In [345]:
print(labelperc(data))

{'fake': 51.82145379932053, 'reliable': 48.17854620067947}


In [346]:
data.head()

Unnamed: 0,level_0,index,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0.0,0,79,97,betootaadvocate.com,fake,http://www.betootaadvocate.com/advocate-in-foc...,"[loui, burk, cultur, contact, begin, privat, c...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Turnbull Celebrates Just What It Is To Be Aust...,,,[''],,,,
1.0,1,296,348,barenakedislam.com,fake,http://barenakedislam.com/2018/01/24/michigani...,"[isnt, best, kind, muslim, student, organ, thi...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,MICHIGANISTAN: Designated terrorist group CAIR...,"Kab Ashraf, Don Spilman, Kristi Ann, Linda Riv...",,[''],,,,
2.0,2,356,419,familysecuritymatters.org,fake,http://www.familysecuritymatters.org/publicati...,"[islam, nation, domin, list, countri, danger, ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Family Security Matters,,,[''],,"Islam, Religion, TAXES, SANCTUARY CITIES, Immi...",,
3.0,3,665,756,theshovel.com.au,fake,http://www.theshovel.com.au/2018/01/25/slow-ca...,"[travel, num, km, per, hour, speed, limit, pas...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Slow Car In Front To Increase Speed For Overta...,The Shovel,,[''],,"satire, comedy",,
4.0,4,736,837,collectivelyconscious.net,fake,http://collectivelyconscious.net/articles-arch...,"[sourc, url, origin, post, date, date, num, al...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Sociological,,,[''],Hive Mind for the Awakened,"Sociological, Joe Rogan, Magic Mushrooms, Enli...",,


### Simple Model - Logistic Regression

In [348]:
X_train, X_val, y_train, y_val = train_test_split(data['content'], data['type'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [349]:
vectorizer = CountVectorizer(analyzer=lambda x : x, max_features=4000)
vectorizer.fit(X_train)

In [350]:
X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

In [351]:
X_train.shape

(23783, 4000)

In [352]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

In [353]:
pred = model.predict(X_val)

In [354]:
accuracy_score(y_val, pred)

0.7847292297342752

### Advanced Model - Naive Bayes

In [355]:
data = pd.read_csv("data/sample_structured.csv")

In [356]:
X_train, X_val, y_train, y_val = train_test_split(data['content'], data['type'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [357]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer=lambda x : x)
vectorizer.fit(X_train)

In [358]:
X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

In [359]:
vectorizer.get_feature_names_out()

array(['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
       ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
       'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
       'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
       '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
       'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
       'z', '{', '|', '}', '~', '\x7f', '\x80', '\x81', '\x83', '\x84',
       '\x86', '\x8a', '\x8e', '\x8f', '\x90', '\x91', '\x92', '\x93',
       '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9c', '\x9d',
       '\x9f', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬',
       '\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¹', 'º',
       '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Ç', 'È',
       'É', 'Ë', 'Í', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Ö', '

In [360]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

In [361]:
pred_val = model.predict(X_val)

In [362]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, pred_val)

0.6488395560040363

In [363]:
from sklearn.metrics import f1_score

# y_val er pandas series, så omdanner også til ndarray
# positive = reliable
f1_score(pd.Series.to_numpy(y_val), pred_val, pos_label='reliable')

0.7035775127768313