### Get subset of full dataset

First, we import the necessary libraries.

In [19]:
import csv
import pandas as pd
import random
import pyarrow.feather as feather
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time

Next, we define a function that loads a random subset of the articles.

In [20]:
# Loads a reproducible random sample of articles without first reading the whole file into memory
def getSample(csvstring: str, sample_size: int, n_records: int = 11000000, seed: int = 0) -> pd.DataFrame:
    """Read a random subset of the rows of a CSV file.

    Rows are selected by drawing the line numbers to skip, so pandas only
    parses the rows that are kept.

    Parameters
    ----------
    csvstring : str
        Path of the CSV file. Line 0 is treated as the header.
    sample_size : int
        Number of line numbers in ``1..n_records`` that are NOT skipped.
    n_records : int, default 11000000
        Number of records in the file, excluding the header. If the file holds
        fewer records than this, fewer than ``sample_size`` rows are returned,
        because part of the kept line numbers lie past the end of the file.
    seed : int, default 0
        Seed passed to ``random.seed`` so that the same rows are drawn on
        every run.

    Returns
    -------
    pandas.DataFrame
        The sampled rows. The file's first column is read as the index and
        then turned back into an ordinary column by ``reset_index``.

    Raises
    ------
    ValueError
        If ``sample_size`` is negative or larger than ``n_records``.
    """
    if not 0 <= sample_size <= n_records:
        raise ValueError(
            f"sample_size must be between 0 and n_records ({n_records}), got {sample_size}"
        )
    # Seeds the module-level generator, exactly as the previous version did
    random.seed(seed)
    # Line numbers start at 1 because line 0 is the header and must be kept
    skip = sorted(random.sample(range(1, n_records + 1), n_records - sample_size))
    # Read the CSV file, skipping the randomly selected rows
    sampled_data = pd.read_csv(csvstring, on_bad_lines='skip', skiprows=skip, index_col=0)
    return sampled_data.reset_index()

In [21]:
# Load the sample from the cleaned news CSV, requesting 1,900,000 rows
data = getSample("data/news_cleaned_2018_02_13-1.csv",1900000)

In [22]:
# Number of rows actually loaded into the sample.
# NOTE(review): the recorded output is far below the 1,900,000 rows requested —
# presumably the file holds fewer records than getSample assumes; confirm.
len(data)

7748

### Inspecting The Sample

In [23]:
def get_type_freq(dataframe):
    """Return the share of each article type in ``dataframe`` as a percentage.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'type'`` column.

    Returns
    -------
    dict
        Maps ``str(type)`` to its percentage of the rows of ``dataframe``, in
        order of first appearance. Missing types are counted under the key
        their string form produces (``'nan'`` for a float NaN). An empty frame
        gives an empty dict.
    """
    type_counts = {}
    for label in dataframe['type']:
        key = str(label)
        type_counts[key] = type_counts.get(key, 0) + 1
    # Divide by the size of the frame that was passed in, not by a notebook global
    total = len(dataframe)
    # Build a new dictionary that expresses the counts as percentages
    return {key: (count / total) * 100 for key, count in type_counts.items()}

In [24]:
# Percentage of each article type in the loaded sample
get_type_freq(data)

{'fake': 10.441404233350541,
 'junksci': 1.2906556530717606,
 'political': 18.856479091378418,
 'conspiracy': 9.537945276200311,
 'nan': 4.981930820856995,
 'bias': 13.461538461538462,
 'clickbait': 2.8523489932885906,
 'unreliable': 3.523489932885906,
 'rumor': 5.846670108415076,
 'unknown': 4.168817759421787,
 'reliable': 22.676819824470833,
 'hate': 1.0712441920495612,
 'satire': 1.2906556530717606}

In [25]:
# Number of articles of each type in the full dataset, taken from README.md
# NOTE(review): "Satire" is capitalised here while the sample uses the label 'satire' —
# confirm before comparing the two dictionaries key by key.
_readme_type_counts = {
    "reliable": 1920139,
    "political": 2435471,
    "bias": 1300444,
    "Satire": 146080,
    "fake": 928083,
    "conspiracy": 905981,
    "unreliable": 319830,
    "clickbait": 292201,
    "junksci": 144939,
    "hate": 117374,
}
allArticlesCount = sum(_readme_type_counts.values())
# Present each type as a percentage of the full dataset.
# On comparison it is clear that the shortened dataset does not have markedly
# different proportions of article types than the full dataset.
# The biggest difference is that new article types, such as rumor, have been
# added to the dataset since the README was written.
# We can use this to reassure ourselves that the shortened dataset is
# representative of the full dataset.
realtypeperc = {
    label: (count/allArticlesCount)*100
    for label, count in _readme_type_counts.items()
}
realtypeperc

{'reliable': 22.561888537768805,
 'political': 28.617108052577617,
 'bias': 15.280389897611691,
 'Satire': 1.716459421738357,
 'fake': 10.905098641191126,
 'conspiracy': 10.645397202669349,
 'unreliable': 3.7580450222794273,
 'clickbait': 3.4334005989277774,
 'junksci': 1.703052520039264,
 'hate': 1.3791601051965903}

### Structure Sample For Preprocessing

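Before we preprocess, we change the labels so that they are either 'reliable' or 'fake'. We then remove the
articles we cannot use: articles with a missing or unrecognised type, articles without content, duplicated articles, and articles that contain no Latin letters.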

In [26]:
# Classifies everything as reliable/fake
def binary_labels(df):
    """Collapse the article types we use into the labels 'reliable' and 'fake'.

    'political' becomes 'reliable'; 'junksci', 'bias', 'satire', 'conspiracy',
    'rumor', 'unreliable', 'clickbait' and 'hate' become 'fake'. Every other
    value is left untouched. The ``type`` column of ``df`` is overwritten and
    the same frame is returned.
    """
    relabelled_as_fake = (
        'junksci', 'bias', 'satire', 'conspiracy',
        'rumor', 'unreliable', 'clickbait', 'hate',
    )
    label_map = {'political': 'reliable'}
    for old_label in relabelled_as_fake:
        label_map[old_label] = 'fake'
    df.type = df.type.replace(label_map)
    return df

In [27]:
# Collapse the article types of the sample into 'reliable' / 'fake'
data = binary_labels(data)

In [28]:
# Removes all the articles we will not use. NB: MUST be called on a dataframe that has already been through binary_labels
def remove_bad_articles(df):
    """Drop the articles that cannot be used for training.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``type`` and ``content``, with ``type``
        already reduced to 'reliable' / 'fake' for the articles to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only rows that are labelled 'reliable' or 'fake',
        have content, are not duplicates of a later row and contain at least
        one Latin letter. The previous index is kept as a column named
        ``index``.
    """
    # keep only the two binary labels; this also removes rows with a missing
    # type and rows with any other type (including 'unknown')
    df = df[(df.type == 'reliable') | (df.type == 'fake')]
    # remove articles without content
    df = df[df.content.notnull()]
    # remove duplicated articles, keeping a single copy (the last one)
    df = df.drop_duplicates(subset='content', keep='last')
    # remove the articles that do not contain at least one Latin letter;
    # both cases are matched so that an all-upper-case article is not discarded
    df = df[df.content.str.contains('[A-Za-z]', regex=True)]
    # renumber the rows from 0 so that removing article [2] moves article [3] to index [2], and so on
    # NOTE(review): without drop=True the old index is kept as an 'index' column — confirm that is wanted
    df = df.reset_index()
    return df

In [29]:
# Drop the articles that cannot be used from the relabelled sample
data = remove_bad_articles(data)

In [30]:
# Number of articles left after filtering
len(data)

6071

In [31]:
def labelperc(df):
    """Return the percentage of rows of ``df`` carrying each value of ``type``.

    The result is a dict keyed by the label values themselves, in order of
    first appearance.
    """
    counts = {}
    for label in df.type:
        counts[label] = counts.get(label, 0) + 1
    total = len(df)
    return {label: count/total*100 for label, count in counts.items()}

In [32]:
# Percentage split between the labels in the filtered sample
labelperc(data)

{'fake': 52.18250700049415, 'reliable': 47.817492999505845}

In [33]:
# Path of the intermediate CSV file that holds the structured sample
temp = "data/sample_structured.csv"

In [34]:
# Write the structured sample to the intermediate CSV file
data.to_csv(temp)

### Preprocessing the larger sample

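Then, we preprocess the structured sample in chunks: each chunk is cleaned, tokenized, stripped of stopwords and stemmed before it is appended to the output file.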

In [35]:
from datetime import datetime
from ipywidgets import IntProgress
from IPython.display import display
import preprocessing_functions
import cleaning_functions
import pandas as pd
import csv
import time
import itertools

In [36]:
# Read the clock once so both output paths carry the same stamp. The epoch seconds
# are computed in Python because the '%s' strftime directive is platform-specific.
run_started = datetime.today()
run_stamp = run_started.strftime('%Y-%m-%d') + "-" + str(int(run_started.timestamp()))
file_name = "data/sample_preprocessed_ver_" + run_stamp + ".csv"
# Path reserved for per-chunk vocabulary statistics; nothing in this cell writes to it
meta_data = "meta_data/preprocess_info_" + run_stamp + ".csv"

# Progress bar sized to the number of articles in the in-memory sample
f = IntProgress(min=0, max=len(data))
display(f)

CHUNK_SIZE = 10000
# Stream the structured sample from disk so only CHUNK_SIZE articles are processed at a time
for chunk_number, chunk in enumerate(pd.read_csv(temp, chunksize=CHUNK_SIZE, index_col=0)):
    # The return value is not used, so this presumably cleans the chunk in place — TODO confirm
    cleaning_functions.clean_dataframe(chunk) # clean chunk
    # Each helper is called and its result handed to apply, so it presumably
    # returns the per-article function — TODO confirm against preprocessing_functions
    chunk['content'] = chunk['content'].apply(preprocessing_functions.tokenizer()) # tokenize
    chunk['content'] = chunk['content'].apply(preprocessing_functions.stopwords_remover()) # remove stopwords
    chunk['content'] = chunk['content'].apply(preprocessing_functions.token_stemmer()) # stem tokens
    # Append to the output file; the header is written with the first chunk only,
    # otherwise every later chunk would add another header row to the data
    chunk.to_csv(file_name, mode="a", header=(chunk_number == 0))

    # Advance by the rows actually processed; the last chunk can be shorter than CHUNK_SIZE
    f.value += len(chunk)


IntProgress(value=0, max=6071)