# A small-scale version of our pipeline

### Imports

In [1]:
import pandas as pd
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import nltk
import itertools
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time
from collections import Counter

from sklearn.model_selection import train_test_split    # splitting the data 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

### Loading the dataset

In [2]:
# pandas (pd), regex (re) and cleantext.clean are already imported in the
# imports cell at the top of the notebook, so they are not imported again here.
#
# Load the pre-processed sample of the corpus into a DataFrame.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like a pandas warning was raised during loading (possibly a mixed-dtype
# warning) - confirm and consider passing explicit dtypes.
raw_data = pd.read_csv("data/sample_preprocessed_ver_2023-03-19-1679240881")

  raw_data = pd.read_csv("data/sample_preprocessed_ver_2023-03-19-1679240881")


In [3]:
# Keep a reproducible 10% random subset of the rows (fixed seed).
# NOTE: this cell rebinds raw_data to the sample, so running it a second time
# without reloading would sample the already-sampled frame.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 0
raw_data = raw_data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

In [4]:
# convert the strings to lists
# Matches one run of word characters; used to pull the individual tokens out
# of the serialized content string.
pattern = re.compile(r"\w+")
def string_to_list(s):
    """Return the word tokens contained in ``s`` as a list of strings.

    Parameters
    ----------
    s : str or list
        The serialized content of one article. A value that is already a
        list is returned unchanged, so applying this function to a column
        that has been converted before is a no-op instead of a TypeError.

    Returns
    -------
    list
        Every non-overlapping ``\\w+`` match in ``s``, in order of appearance
        (an empty list for an empty string).
    """
    if isinstance(s, list):
        # Already tokenized, e.g. because the conversion cell was re-run.
        return s
    return pattern.findall(s)
# Replace every article's content string with its list of tokens.
# NOTE: this overwrites the 'content' column of raw_data in place.
raw_data['content'] = raw_data['content'].apply(string_to_list)

In [5]:
# Sanity check: number of rows left after down-sampling.
len(raw_data)

108260

In [6]:
def fix_labels(df):
    """Collapse the fine-grained article labels into 'reliable' / 'fake'.

    'political' is mapped to 'reliable'; 'junksci', 'bias', 'satire',
    'conspiracy', 'rumor', 'unreliable', 'clickbait' and 'hate' are mapped to
    'fake'. Labels that are not listed in the mapping are kept as they are.
    Rows whose label is missing, 'unknown', or the literal string 'type'
    (presumably stray header rows - verify against the raw data) are dropped.

    The input frame is left untouched: relabeling happens on a new frame, so
    the caller's DataFrame keeps its original labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'type' column holding the source labels.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, containing only the rows that were
        kept (their original index labels are preserved).
    """
    label_map = {
        'political': 'reliable',
        'junksci': 'fake',
        'bias': 'fake',
        'satire': 'fake',
        'conspiracy': 'fake',
        'rumor': 'fake',
        'unreliable': 'fake',
        'clickbait': 'fake',
        'hate': 'fake',
    }
    # assign() returns a new frame, so the caller's 'type' column is not modified.
    relabeled = df.assign(type=df['type'].replace(label_map))
    labels = relabeled['type']
    # One combined mask instead of three successive filtered copies.
    keep = labels.notnull() & ~labels.isin(['unknown', 'type'])
    return relabeled[keep]

In [7]:
def labelperc(df):
    """Return the share of each label in ``df.type`` as a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'type' column.

    Returns
    -------
    dict
        Maps each distinct label (in order of first appearance) to
        ``count / len(df) * 100``. An empty frame yields an empty dict.
    """
    occurrences = {}
    for label in df.type:
        occurrences[label] = occurrences.get(label, 0) + 1
    total_rows = len(df)
    return {label: seen / total_rows * 100 for label, seen in occurrences.items()}

In [8]:
# Reduce the labels to reliable/fake and drop rows without a usable label
# (see fix_labels).
data = fix_labels(raw_data)

In [9]:
# Class balance: percentage of rows per label after relabeling.
print(labelperc(data))

{'reliable': 48.63466725787976, 'fake': 51.36533274212024}


In [10]:
# Peek at the first rows of the relabeled frame.
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,level_0,index,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
989008,988910.0,988910,1376175,805,9319867,nytimes.com,reliable,https://www.nytimes.com/2010/04/03/opinion/03s...,"[week, donor, confer, haiti, unit, nation, str...",2018-02-11 00:44:13.682142,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Promises for and From Haiti,,,"['Haiti', 'Foreign Aid', 'United Nations', 'Ea...",The United Nations donor conference for Haiti ...,,,nytimes
890417,890328.0,890328,1273941,5697,8724756,nytimes.com,reliable,https://www.nytimes.com/2004/02/29/weekinrevie...,"[john, kerri, bare, open, mouth, tuesday, some...",2018-02-11 00:38:28.151933,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,The Nation; For Kerry and Edwards . . .,David M. Halbfinger,,"['KERRY JOHN', 'SPEECHES AND STATEMENTS', 'SL...",Article on John Kerry's favorite slogans and t...,,,nytimes
743885,743811.0,743811,1116529,4731,7764327,rinf.com,reliable,http://rinf.com/alt-news/newswire/us-threatens...,"[us, led, coalit, said, may, bomb, convoy, isl...",2017-12-09T22:10:09.846576,2018-02-08 19:18:34.468038,2018-02-08 19:18:34.468066,US threatens ISIS convoy that agreed to leave ...,,,[''],"Published time: 30 Aug, 2017 16:51 The US-led ...",,,
994748,994649.0,994649,1382128,5235,9354297,nytimes.com,reliable,https://www.nytimes.com/2010/08/27/science/spa...,"[kepler, team, also, observ, definit, two, gia...",2018-02-11 00:44:35.742188,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Telescope Detects Possible Earth-Size Planet,Kenneth Chang,,"['Planets', 'Stars and Galaxies', 'Astronomy a...","NASA scientists said the body, 1.5 times the d...",,,nytimes
339162,339129.0,339129,509943,3380,3421320,us.blastingnews.com,fake,http://us.blastingnews.com/showbiz-tv/2017/01/...,"[back, februari, comic, book, adapt, deadpool,...",2017-11-18T20:01:27.400599,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,And the Oscar goes to 'Deadpool'?,"Blasting News, Katherine Stinson, Sergida Dolo...",,[''],The comic book movie adaptation 'Deadpool' has...,"The Voice, The Walking Dead, CW, Movies, Celeb...",,


### Simple Model - Logistic Regression

In [12]:
# 80/10/10 split: first hold out 20% of the rows, then cut that hold-out in
# half to obtain the validation and test sets. Both splits use a fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    data['content'], data['type'], test_size=0.2, random_state=0
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0
)

In [13]:
def _identity_analyzer(tokens):
    """Return the document as-is; each document is already a list of tokens."""
    return tokens

# Bag-of-words counts over the pre-tokenized documents. The vocabulary is
# learned from the training split only.
vectorizer = CountVectorizer(analyzer=_identity_analyzer)
vectorizer.fit(X_train)

In [14]:
# Turn each split's token lists into count matrices using the fitted vocabulary.
# NOTE: the three names are rebound from token lists to the transformed matrices.
X_train, X_val, X_test = [
    vectorizer.transform(split) for split in (X_train, X_val, X_test)
]

In [18]:
# Upper bound on solver iterations for the logistic-regression baseline.
# NOTE(review): presumably set this high so the solver converges on the
# bag-of-words features - confirm.
MAX_ITER = 10_000
model = LogisticRegression(max_iter=MAX_ITER)
model.fit(X_train, y_train)

In [19]:
# Predict labels for the validation split (the test split is not used here).
pred = model.predict(X_val)

In [20]:
# Validation accuracy of the baseline: true labels first, predictions second.
accuracy_score(y_val, pred)

0.8128406466512702