### Get subset of full dataset

First, we import the necessary libraries.

In [154]:
import csv
import pandas as pd
import random
import pyarrow.feather as feather
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time

We define a function to load a subset of the articles

In [155]:
# Draws a reproducible random sample of rows from a large CSV without loading the whole file.
def getSample(csvstring: str, sample_size: int, n_records: int = 11000000, seed: int = 0):
    """Return a seeded random sample of rows from the CSV file at ``csvstring``.

    The rows to leave out are chosen up front and handed to ``pd.read_csv`` via
    ``skiprows``, so only the sampled rows are parsed into the DataFrame.

    Parameters
    ----------
    csvstring : str
        Path to the CSV file. Its first line is expected to be the header.
    sample_size : int
        Number of rows to keep out of the first ``n_records`` rows.
    n_records : int, default 11000000
        Number of data rows (header excluded) the skip list is drawn from.
        Row numbers above ``n_records`` are never in the skip list, so if the
        file holds more rows than this, all of the surplus rows are read too.
    seed : int, default 0
        Seed for the random row selection.

    Returns
    -------
    pandas.DataFrame
        The sampled rows. The first CSV column is used as the index while
        reading and is then turned back into a regular column.

    Raises
    ------
    ValueError
        If ``sample_size`` is negative or larger than ``n_records``.
    """
    if not 0 <= sample_size <= n_records:
        raise ValueError(
            "sample_size must be between 0 and n_records (%d), got %d"
            % (n_records, sample_size)
        )
    # Seeds the module-level RNG (as the original did) so every run picks the same rows.
    random.seed(seed)
    # Row 0 is the header, so the candidate data rows are numbered 1..n_records.
    skip = sorted(random.sample(range(1, n_records + 1), n_records - sample_size))
    # Read the CSV file, skipping the randomly selected rows; malformed lines are dropped.
    sampled_data = pd.read_csv(csvstring, on_bad_lines='skip', skiprows=skip, index_col=0)
    return sampled_data.reset_index()

In [156]:
# Load the working sample from the full corpus CSV, requesting 5000 rows.
# NOTE(review): the recorded run of this notebook reports len(data) == 77430, not 5000 —
# presumably the file holds more rows than the record count hard-coded in getSample; verify.
data = getSample("data/news_cleaned_2018_02_13-1.csv", 5000)

  sampled_data = pd.read_csv(csvstring, on_bad_lines='skip', skiprows=skip, index_col=0)


In [157]:
# Number of rows that were actually loaded into the sample.
len(data)

77430

### Inspecting The Sample

In [158]:
def get_type_freq(dataframe):
    """Return the percentage share of each article type in ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``type`` column.

    Returns
    -------
    dict
        Maps ``str(type)`` to the percentage of rows carrying that type, in
        order of first appearance. Because every value goes through ``str``,
        missing types are reported under the key ``'nan'``.
    """
    type_counts = {}
    for label in dataframe['type']:
        key = str(label)
        type_counts[key] = type_counts.get(key, 0) + 1
    # Divide by the size of the frame that was passed in, not by a notebook-level global.
    total = len(dataframe)
    # Build a new dict that expresses the counts as percentages.
    return {key: (count / total) * 100 for key, count in type_counts.items()}

In [159]:
# Percentage share of each article type in the loaded sample.
get_type_freq(data)

{'unreliable': 3.4844375565026473,
 'satire': 1.312152912307891,
 'hate': 0.9414955443626501,
 'conspiracy': 9.816608549657756,
 'political': 19.45886607258169,
 'junksci': 1.3780188557406692,
 'fake': 10.515304145679968,
 'bias': 13.462482242025056,
 'reliable': 22.384088854449182,
 'clickbait': 2.6940462353093118,
 'rumor': 5.453958414051401,
 'nan': 4.68035645098799,
 'unknown': 4.416892677256877,
 '2018-02-10 13:43:39.521661': 0.0012914890869172157}

In [160]:
# Article counts per type in the full dataset, taken from README.md.
# NOTE(review): "Satire" is capitalised here while the sample uses 'satire' — confirm before
# comparing the two dictionaries key by key.
readme_type_counts = {
    "reliable": 1920139,
    "political": 2435471,
    "bias": 1300444,
    "Satire": 146080,
    "fake": 928083,
    "conspiracy": 905981,
    "unreliable": 319830,
    "clickbait": 292201,
    "junksci": 144939,
    "hate": 117374,
}
allArticlesCount = sum(readme_type_counts.values())
# Present each type as a percentage of all articles counted in the README.
# Comparing with the sample shows that Mikkel's reduced dataset does not have markedly different
# article-type proportions than the full dataset.
# The biggest difference is that new article types, such as rumor, have been added to the dataset
# since the README was written.
# We can use this to reassure ourselves that our reduced dataset is representative of the full dataset.
realtypeperc = {
    label: (count / allArticlesCount) * 100
    for label, count in readme_type_counts.items()
}
realtypeperc

{'reliable': 22.561888537768805,
 'political': 28.617108052577617,
 'bias': 15.280389897611691,
 'Satire': 1.716459421738357,
 'fake': 10.905098641191126,
 'conspiracy': 10.645397202669349,
 'unreliable': 3.7580450222794273,
 'clickbait': 3.4334005989277774,
 'junksci': 1.703052520039264,
 'hate': 1.3791601051965903}

### Structure Sample For Preprocessing

Before preprocessing, we change the labels so that every article is either 'reliable' or 'fake',
and then remove the articles we cannot use from the data set.

In [161]:
# Classifies everything as reliable/fake.
def binary_labels(df):
    """Collapse the fine-grained article types into 'reliable' or 'fake'.

    'political' becomes 'reliable'; 'junksci', 'bias', 'satire', 'conspiracy',
    'rumor', 'unreliable', 'clickbait' and 'hate' become 'fake'. Any other
    value in the ``type`` column is left as it is.

    The ``type`` column of ``df`` is overwritten in place, and the same frame
    is returned.
    """
    fake_like = ('junksci', 'bias', 'satire', 'conspiracy', 'rumor', 'unreliable', 'clickbait', 'hate')
    relabel = {'political': 'reliable'}
    relabel.update(dict.fromkeys(fake_like, 'fake'))
    df.type = df.type.replace(relabel)
    return df

In [162]:
# Map the article types in the sample onto the binary reliable/fake labels.
data = binary_labels(data)

In [163]:
# Removes all the articles we will not use. NB: MUST be called on a dataframe that has already been through binary_labels.
def remove_bad_articles(df):
    """Return a filtered copy of ``df`` holding only usable articles.

    Rows are dropped when they
    * have a ``type`` other than 'reliable' or 'fake' (this includes 'unknown'
      and missing types),
    * have no ``content``,
    * duplicate the ``content`` of a later row (the last occurrence is kept), or
    * have ``content`` without a single Latin letter, upper or lower case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``type`` and ``content`` columns.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, renumbered from 0. The previous index is kept as an
        extra ``index`` column.
    """
    # A missing type never equals either label, so this one filter also removes unlabelled rows.
    df = df[df['type'].isin(['reliable', 'fake'])]
    # Remove articles without content.
    df = df[df['content'].notnull()]
    # Remove duplicated articles, keeping a single copy.
    df = df.drop_duplicates(subset='content', keep='last')
    # Keep only articles with at least one Latin letter. Upper case counts too, and
    # na=False treats non-string content as "no letter" instead of producing a NaN mask.
    has_letter = df['content'].str.contains('[A-Za-z]', na=False)
    df = df[has_letter]
    # Renumber the rows so that removing article [2] moves article [3] down to index [2], and so on.
    df = df.reset_index()
    return df

In [164]:
# Drop the rows that cannot be used (see remove_bad_articles for the exact criteria).
data = remove_bad_articles(data)

In [165]:
# Number of rows left after filtering.
len(data)

59165

In [166]:
def labelperc(df):
    """Return the percentage of rows in ``df`` for each value of its ``type`` column.

    The result is a dict mapping each label to its share of ``len(df)`` in
    percent, in order of first appearance. An empty frame gives an empty dict.
    """
    counts = {}
    for label in df.type:
        counts[label] = counts.get(label, 0) + 1
    return {label: count / len(df) * 100 for label, count in counts.items()}

In [167]:
# Class balance (in percent) of the filtered sample.
labelperc(data)

{'fake': 51.54398715456774, 'reliable': 48.45601284543227}

In [168]:
# Save the structured sample to disk; the preprocessing step below reads this file back in chunks.
data.to_csv("data/sample_structured.csv")

### Preprocessing the larger sample

Then, we preprocess the sample

In [169]:
from datetime import datetime
import preprocessing_functions
import cleaning_functions
import pandas as pd
import csv
import time

In [170]:
# Name the output after the run's date and epoch seconds, as before.
# The "%s" strftime directive is platform-specific, so the epoch part is built with
# timestamp() instead to get the same file name format everywhere.
run_started = datetime.today()
file_name = (
    "data/sample_preprocessed_ver_"
    + run_started.strftime('%Y-%m-%d')
    + "-"
    + str(int(run_started.timestamp()))
    + ".csv"
)
# Stream the structured sample in chunks of 10000 rows so it never has to fit in memory at once.
for chunk_number, chunck in enumerate(pd.read_csv("data/sample_structured.csv", chunksize=10000, index_col=0)):
    # Return values are ignored, so both helpers presumably modify the chunk in place — TODO confirm.
    cleaning_functions.clean_dataframe(chunck)
    preprocessing_functions.preprocess(chunck)
    start = time.time()
    # Append every chunk to the same file, but write the header row only once; otherwise
    # a header line would be repeated in the middle of the CSV for every chunk.
    chunck.to_csv(file_name, mode='a', header=(chunk_number == 0))
    end = time.time()
    print("writing to csv took " + str(end - start) + " seconds")

cleaning took 39.1953649520874 seconds
Tokenizing took 8.940755844116211 seconds
Removing stopwords took 0.03564906120300293 seconds
Stemming took 0.28874802589416504 seconds
Converting to list took 31.445418119430542 seconds
writing to csv took 0.5369808673858643 seconds
cleaning took 40.923473834991455 seconds
Tokenizing took 9.286044836044312 seconds
Removing stopwords took 0.01805400848388672 seconds
Stemming took 0.0022308826446533203 seconds
Converting to list took 32.17430830001831 seconds
writing to csv took 0.5546891689300537 seconds
cleaning took 36.35886788368225 seconds
Tokenizing took 8.207499027252197 seconds
Removing stopwords took 0.18811607360839844 seconds
Stemming took 0.0019729137420654297 seconds
Converting to list took 28.507335901260376 seconds
writing to csv took 0.4858560562133789 seconds
cleaning took 32.03664016723633 seconds
Tokenizing took 7.440240144729614 seconds
Removing stopwords took 0.16337108612060547 seconds
Stemming took 0.0022242069244384766 secon