### Imports

In [128]:
import pandas as pd
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import nltk
from nltk.corpus import words
import itertools
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time
from collections import Counter

from sklearn.model_selection import train_test_split    # splitting the data 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

### Loading the dataset

In [129]:
import pandas as pd
import regex as re
from cleantext import clean
data = pd.read_csv("data/sample_preprocessed_ver_2023-03-23-153030.csv", index_col=0)

In [130]:
# convert the strings to lists
pattern = re.compile(r"\w+")
def string_to_list(s):
    return pattern.findall(s)
data['content'] = data['content'].apply(string_to_list)

In [131]:
len(data)

29731

In [132]:
data.head()

Unnamed: 0,level_0,index,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0.0,0,79,97,betootaadvocate.com,fake,http://www.betootaadvocate.com/advocate-in-foc...,"[loui, burk, cultur, contact, begin, privat, c...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Turnbull Celebrates Just What It Is To Be Aust...,,,[''],,,,
1.0,1,296,348,barenakedislam.com,fake,http://barenakedislam.com/2018/01/24/michigani...,"[best, kind, muslim, student, organ, think, en...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,MICHIGANISTAN: Designated terrorist group CAIR...,"Kab Ashraf, Don Spilman, Kristi Ann, Linda Riv...",,[''],,,,
2.0,2,356,419,familysecuritymatters.org,fake,http://www.familysecuritymatters.org/publicati...,"[islam, nation, domin, list, countri, danger, ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Family Security Matters,,,[''],,"Islam, Religion, TAXES, SANCTUARY CITIES, Immi...",,
3.0,3,665,756,theshovel.com.au,fake,http://www.theshovel.com.au/2018/01/25/slow-ca...,"[travel, num, km, per, hour, speed, limit, pas...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Slow Car In Front To Increase Speed For Overta...,The Shovel,,[''],,"satire, comedy",,
4.0,4,736,837,collectivelyconscious.net,fake,http://collectivelyconscious.net/articles-arch...,"[sourc, url, origin, post, date, date, num, al...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Sociological,,,[''],Hive Mind for the Awakened,"Sociological, Joe Rogan, Magic Mushrooms, Enli...",,


### Simple Model - Logistic Regression

In [133]:
X_train, X_val, y_train, y_val = train_test_split(data['content'], data['type'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [134]:
vectorizer = CountVectorizer(analyzer=lambda x : x, max_features=4000)
vectorizer.fit(X_train)

In [135]:
X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

In [136]:
X_train.shape

(23784, 4000)

In [137]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

In [138]:
pred = model.predict(X_val)

In [193]:
accuracy_score(y_val, pred)

ValueError: Found input variables with inconsistent numbers of samples: [14662, 2973]

### Advanced Model - Naive Bayes

In [213]:
data = pd.read_csv("data/sample_preprocessed_ver_2023-03-23-155618.csv", index_col=0)

In [214]:
# convert the strings to lists
pattern = re.compile(r"\w+")
def string_to_list(s):
    return pattern.findall(s)
data['content'] = data['content'].apply(string_to_list)

In [227]:
X_train, X_val, y_train, y_val = train_test_split(data['content'], data['type'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [228]:
len(X_train)

117298

In [229]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer=lambda x : x)
vectorizer.fit(X_train)

In [230]:
X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

In [231]:
vectorizer.get_feature_names_out()

array(['a', 'aa', 'aaa', ..., 'zzzzz', 'zzzzzz', 'zzzzzzzzz'],
      dtype=object)

In [232]:
len(vectorizer.get_feature_names_out())

259378

In [233]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

In [234]:
pred_val = model.predict(X_val)

In [235]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, pred_val)

0.7985268039830855

In [240]:
pred_val

array(['reliable', 'reliable', 'fake', ..., 'reliable', 'reliable',
       'reliable'], dtype='<U8')

In [239]:
y_val

142530.0    reliable
63685.0         fake
136708.0    reliable
124171.0    reliable
86261.0     reliable
              ...   
96699.0         fake
47882.0     reliable
31654.0     reliable
118188.0    reliable
130259.0    reliable
Name: type, Length: 14662, dtype: object

In [238]:
from sklearn.metrics import f1_score

# y_val er pandas series, så omdanner også til ndarray
# positive = reliable
f1_score(y_val, pred_val, pos_label='reliable', average='binary')

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].