# Preprocessing of data

### Preprocess 'news_sample.csv'

In [None]:
import pandas as pd
import lib.process_a as process_a

# Location of the FakeNewsCorpus sample file in the upstream git repository.
sample_url = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'

# Download the sample (first CSV column becomes the index) and hand it
# directly to the process_a pipeline. Per the notebook's own description the
# pipeline does cleanup -> stopword removal -> stemming and also derives word
# frequencies and reduction rates; see lib.process_a for the details.
df_sample = process_a.preprocess(pd.read_csv(sample_url, index_col=0))

# Keep a copy of the preprocessed frame on disk.
df_sample.to_csv("data/news_sample_cleaned.csv")

### Preprocess '995,000_rows.csv' dataset

In [None]:
import pandas as pd
import lib.process_b as process_b

# Input (raw) and output (cleaned) locations for the 995k-row dataset.
src, dst = 'data/995,000_rows.csv', 'data/995,000_rows_cleaned.csv'

# Read the raw rows into memory.
raw_data = pd.read_csv(src)

# Run the process_b pipeline (described in this notebook as
# cleanup -> stopword removal -> stemming).
# NB: this step was measured at roughly 1.5h on an M1 MacBook Pro.
clean_data = process_b.preprocess(raw_data)

# Write the processed frame to disk for the later cells.
clean_data.to_csv(dst)

### Process word frequencies for data exploration

In [None]:
import pandas as pd
import lib.process_methods as pm
import swifter

# Cleaned dataset to analyse. For a quicker run the original author kept an
# alternative path around: 'data/995,000_rows_cleaned_SAMPLE.csv'.
src = 'data/995,000_rows_cleaned.csv'
clean_data = pd.read_csv(src)

# Result frame used for data exploration; starts with the article type.
word_freq_data = pd.DataFrame()
word_freq_data['type'] = clean_data['type']

# One word-frequency column per preprocessing stage.
# (output column, source text column) -- order defines the CSV column order.
for freq_col, text_col in (
    ('content_word_freq', 'content_clean'),
    ('stop_word_freq', 'content_stopword'),
    ('stem_word_freq', 'content_stem'),
):
    word_freq_data[freq_col] = clean_data[text_col].swifter.apply(pm.get_word_freq)

# Reduction rate of each later stage, always measured against the cleaned
# content's word frequencies.
for rate_col, reduced_col in (
    ('stop_reduction_rate', 'stop_word_freq'),
    ('stem_reduction_rate', 'stem_word_freq'),
):
    word_freq_data[rate_col] = pm.reduction_rate(word_freq_data, 'content_word_freq', reduced_col)

# Persist the exploration data.
word_freq_data.to_csv('data/word_freq.csv')

### Split 995k dataset into training, validation, and test sets

In [None]:
import pandas as pd
import lib.process_methods as pm

# Read the cleaned 995k-row dataset back from disk.
src = 'data/995,000_rows_cleaned.csv'
split_data = pd.read_csv(filepath_or_buffer=src)

# Delegate the split to lib.process_methods; nothing is returned to this cell.
# NOTE(review): presumably this writes the training/validation/test CSVs that
# the grouping cell reads from 'data/' -- confirm in pm.train_valid_test.
pm.train_valid_test(split_data)

### Group 'fake' and 'reliable' types

In [None]:
import pandas as pd
import lib.process_methods as pm

# --- label sets -----------------------------------------------------------

# Types that get reassigned to the 'fake' label.
fake_types = {'fake', 'satire', 'conspiracy'}

# Types considered 'reliable'. Note: this set is defined for reference only;
# it is not passed to pm.group_data below.
reliable_types = {'reliable'}

# Types whose rows are omitted from the grouped data.
omitted_types = {
    'political',
    'bias',
    'rumor',
    'unknown',
    'unreliable',
    'clickbait',
    'junksci',
    'hate',
    # NOTE(review): looks like a malformed row where a timestamp ended up in
    # the 'type' column -- confirm against the raw data.
    '2018-02-10 13:43:39.521661',
}

# --- load the three splits --------------------------------------------------

src_train = 'data/training_data.csv'
src_valid = 'data/validation_data.csv'
src_test = 'data/test_data.csv'

# Files are read in the order train -> validation -> test.
train_data, valid_data, test_data = (
    pd.read_csv(path) for path in (src_train, src_valid, src_test)
)

# --- group each split and write it out --------------------------------------
# Each split is grouped and saved before the next one is touched.

# training
dst = 'data/training_data_grouped.csv'
train_data_group = pm.group_data(train_data, omitted_types, fake_types)
train_data_group.to_csv(path_or_buf=dst)

# validation
dst = 'data/validation_data_grouped.csv'
valid_data_group = pm.group_data(valid_data, omitted_types, fake_types)
valid_data_group.to_csv(path_or_buf=dst)

# test
dst = 'data/test_data_grouped.csv'
test_data_group = pm.group_data(test_data, omitted_types, fake_types)
test_data_group.to_csv(path_or_buf=dst)

In [None]:
# Print the distribution of 'type' labels, in percent, for each grouped split.
# value_counts(normalize=True) yields relative frequencies; * 100 turns them
# into percentages. The grouped frames come from the grouping cell above.
for heading, grouped in (
    ("TRAINING DATA:", train_data_group),
    ("VALIDATION DATA:", valid_data_group),
    ("TEST DATA:", test_data_group),
):
    type_dist = grouped['type'].value_counts(normalize=True) * 100
    print(heading)
    print(type_dist)