### Get subset of full dataset

First we import the necessary libraries

In [1]:
import csv
import pandas as pd
import random
import pyarrow.feather as feather
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time

We define a function to load a subset of the articles

In [2]:
# Loads a reproducible random sample of rows from a large CSV by telling
# pandas which rows to skip, so only the sampled rows are parsed into memory.
def getSample(csvstring: str, sample_size: int, n_records: int = 11000000, seed: int = 0):
    """Return a random sample of rows from the CSV at ``csvstring``.

    Parameters
    ----------
    csvstring : str
        Path to the CSV file. Its first column is read as the index and then
        turned back into an ordinary column by ``reset_index``.
    sample_size : int
        Number of rows to keep out of ``n_records``.
    n_records : int, default 11000000
        Assumed number of data rows in the file (header excluded). Rows to keep
        are chosen among row numbers ``1..n_records``; if this overestimates the
        real row count, some of the chosen rows do not exist and fewer than
        ``sample_size`` rows come back.
    seed : int, default 0
        Seed for the sampling. NOTE: this reseeds the global ``random`` module,
        exactly as the original implementation did.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..k-1 index. Malformed lines are dropped
        (``on_bad_lines='skip'``), which can also reduce the row count.

    Raises
    ------
    ValueError
        If ``sample_size`` is negative or larger than ``n_records``.
    """
    if not 0 <= sample_size <= n_records:
        raise ValueError(
            "sample_size must be between 0 and n_records (" + str(n_records)
            + "), got " + str(sample_size)
        )
    random.seed(seed)
    # Sample the rows to SKIP (everything except the rows we keep); row 0 is
    # the header and is never a candidate.
    skip = sorted(random.sample(range(1, n_records + 1), n_records - sample_size))
    # Read the CSV file, skipping the randomly selected rows
    sampled_data = pd.read_csv(csvstring, on_bad_lines='skip', skiprows=skip, index_col=0)
    return sampled_data.reset_index()

In [3]:
# Request a 5000-row random sample; getSample seeds the RNG, so the draw is repeatable.
data = getSample("data/news_cleaned_2018_02_13-1.csv", 5000)

In [4]:
# Rows actually loaded. The recorded run returned 3905, fewer than the 5000 requested.
len(data)

3905

### Inspecting The Sample

In [5]:
def get_type_freq(dataframe):
    """Return the share of each article type in ``dataframe`` as a percentage.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'type'`` column.

    Returns
    -------
    dict
        Maps ``str(label)`` to its percentage of all rows in ``dataframe``.
        Missing labels are counted under the key ``'nan'`` because every label
        is converted with ``str``. Keys appear in order of first occurrence.
    """
    # Use the frame that was passed in, not a notebook-level global, so the
    # percentages are correct for any DataFrame.
    total = len(dataframe)
    counts = {}
    for label in dataframe['type']:
        key = str(label)
        counts[key] = counts.get(key, 0) + 1
    # Build a new dictionary that shows the counts as percentages.
    return {key: (count / total) * 100 for key, count in counts.items()}

In [6]:
# Label distribution of the loaded sample, in percent.
get_type_freq(data)

{'fake': 10.345710627400768,
 'conspiracy': 9.807938540332907,
 'political': 18.56594110115237,
 'bias': 13.4955185659411,
 'clickbait': 2.9193341869398206,
 'nan': 5.172855313700384,
 'unreliable': 3.6619718309859155,
 'rumor': 5.761843790012804,
 'junksci': 1.3060179257362354,
 'unknown': 4.12291933418694,
 'reliable': 22.458386683738794,
 'hate': 1.1011523687580027,
 'satire': 1.2804097311139564}

In [7]:
# Number of articles of each type in the full dataset, taken from the dataset's README.md.
# Keys use the same lowercase spelling as the 'type' column so the two
# distributions can be compared key-for-key.
README_TYPE_COUNTS = {
    "reliable": 1920139,
    "political": 2435471,
    "bias": 1300444,
    "satire": 146080,
    "fake": 928083,
    "conspiracy": 905981,
    "unreliable": 319830,
    "clickbait": 292201,
    "junksci": 144939,
    "hate": 117374,
}
allArticlesCount = sum(README_TYPE_COUNTS.values())
# Present the README counts as percentages of all README-listed articles.
realtypeperc = {label: (count/allArticlesCount)*100 for label, count in README_TYPE_COUNTS.items()}
realtypeperc
# On comparison, Mikkel's shortened dataset does not have markedly different
# proportions of article types than the full dataset.
# The biggest difference is that new article types, such as rumor, have been
# added to the dataset since the README was written.
# We can use this to reassure ourselves that our shortened dataset is
# representative of the full dataset.
# NOTE(review): the sample percentages also count 'nan', 'unknown' and 'rumor'
# rows in their denominator, while these README percentages do not, so the two
# sets of numbers are not exactly like-for-like.

{'reliable': 22.561888537768805,
 'political': 28.617108052577617,
 'bias': 15.280389897611691,
 'Satire': 1.716459421738357,
 'fake': 10.905098641191126,
 'conspiracy': 10.645397202669349,
 'unreliable': 3.7580450222794273,
 'clickbait': 3.4334005989277774,
 'junksci': 1.703052520039264,
 'hate': 1.3791601051965903}

### Structure Sample For Preprocessing

Before preprocessing, we remove articles we cannot use: those with a junk or missing label, those with missing
content, and those whose content is a duplicate. We also collapse the labels so that every article is either 'reliable' or 'fake'.

In [8]:
def structure_dataframe(df):
    """Collapse article types to 'reliable'/'fake' and drop unusable rows.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'type'`` column and a ``'content'`` column.

    Returns
    -------
    pandas.DataFrame
        Rows whose mapped type is 'reliable' or 'fake', with non-null and
        de-duplicated content. The index is reset, and the previous index is
        kept as an extra leading column (pandas names it ``'index'``, or
        ``'level_0'`` if ``'index'`` is already taken).
    """
    # Set all labels to either 'reliable' or 'fake' using the rules below.
    label_map = {
        'political': 'reliable',
        'junksci': 'fake',
        'bias': 'fake',
        'satire': 'fake',
        'conspiracy': 'fake',
        'rumor': 'fake',
        'unreliable': 'fake',
        'clickbait': 'fake',
        'hate': 'fake',
    }
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    structured = df.assign(type=df['type'].replace(label_map))

    # Keep only the two target labels. This also removes junk labels caused by
    # bad formatting and missing (NaN) labels, since neither matches.
    has_valid_label = structured['type'].isin(['reliable', 'fake'])
    # Remove rows with missing content.
    has_content = structured['content'].notnull()
    structured = structured[has_valid_label & has_content]

    # Remove duplicates. Some entries share the same content, like a notification
    # from Google Plus or a message saying that the TOR browser needs to be
    # installed in order to view the article.
    structured = structured.drop_duplicates(subset='content', keep='last')

    return structured.reset_index()

In [9]:
# Rebinds `data` to the structured frame; the raw sample is no longer reachable under this name.
data = structure_dataframe(data)

In [10]:
# Rows remaining after relabelling, filtering and de-duplication.
len(data)

3109

In [11]:
def labelperc(df):
    """Return the percentage of rows in ``df`` carrying each ``type`` label.

    The result is a dict keyed by the raw label values, in order of first
    occurrence, with each value equal to ``count / len(df) * 100``.
    """
    tally = {}
    for label in df.type:
        # Start at zero for unseen labels, then count this occurrence.
        tally[label] = tally.get(label, 0) + 1
    row_total = len(df)
    return {label: seen/row_total*100 for label, seen in tally.items()}

In [12]:
# Share of each of the two remaining labels in the structured sample, in percent.
labelperc(data)

{'fake': 52.94306851077517, 'reliable': 47.05693148922483}

In [13]:
# Persist the structured sample. to_csv writes the DataFrame index as the first
# CSV column by default, and the preprocessing cell reads it back with index_col=0.
data.to_csv("data/sample_structured.csv")

### Preprocessing the larger sample

Then, we preprocess the sample

In [14]:
from datetime import datetime
import preprocessing_functions
import cleaning_functions
import pandas as pd
import csv
import time

# Build a timestamped output name: date, then Unix epoch seconds. The epoch part
# is computed with timestamp() rather than strftime('%s'), because '%s' is a
# platform-specific directive that is not available everywhere.
run_started = datetime.today()
file_name = ("data/sample_preprocessed_ver_" + run_started.strftime('%Y-%m-%d')
             + "-" + str(int(run_started.timestamp())) + ".csv")

# Stream the structured sample in chunks of 10000 rows so it never has to be
# held in memory in full.
for chunk_number, chunck in enumerate(pd.read_csv("data/sample_structured.csv", chunksize=10000, index_col=0)):
    # Return values are discarded, so these helpers presumably modify the chunk
    # in place -- TODO confirm against cleaning_functions / preprocessing_functions.
    cleaning_functions.clean_dataframe(chunck)
    preprocessing_functions.preprocess(chunck)
    start = time.time()
    # Append every chunk to the same file, but write the header row only once.
    # Otherwise each chunk would add another header line in the middle of the CSV.
    chunck.to_csv(file_name, mode='a', header=(chunk_number == 0))
    end = time.time()
    print("writing to csv took " + str(end - start) + " seconds")
# NOTE(review): the recorded output of this cell is
# "NameError: name 'functions' is not defined", yet nothing in this cell refers
# to `functions`. It presumably comes from an earlier version of the cell or
# from inside the imported helper modules -- re-run and confirm.

NameError: name 'functions' is not defined