In [1]:
import re
from collections import Counter
from functools import lru_cache

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from cleantext import clean
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words('english'))


def clean_data(input_text, regex_filter):
    """Clean, tokenize, stop-word-filter and stem a single document.

    Steps, in order:
      1. Replace whitespace-delimited tokens matched by the two domain
         patterns below with ``<url>``.
      2. Normalize the text with ``cleantext.clean`` using the options below.
      3. Delete every match of each pattern in ``regex_filter`` and collapse
         runs of spaces after each deletion.
      4. Tokenize with NLTK, drop English stop words and Porter-stem the rest.

    Parameters
    ----------
    input_text : str
        Raw document text. Any non-string value (for example ``None``, ``NaN``
        or ``pd.NA`` for a missing article) yields an empty list instead of
        raising ``TypeError``.
    regex_filter : iterable of str
        Regular-expression patterns whose matches are removed from the
        cleaned text (the placeholder tokens, in the caller's case).

    Returns
    -------
    list of str
        The stemmed tokens that are not English stop words.
    """
    if not isinstance(input_text, str):
        return []

    # NOTE(review): `m*` / `t*` quantify only the final letter, so these
    # patterns match any token containing ".co" / ".ne" followed by at least
    # one more character (e.g. "a.count"), not just ".com" / ".net" hosts.
    # Left as-is because tightening them changes which tokens are masked.
    cleaned_text = re.sub(r'(\S+\.com*\S+)', '<url>', input_text)
    cleaned_text = re.sub(r'(\S+\.net*\S+)', '<url>', cleaned_text)
    cleaned_text = clean(cleaned_text,  # does not remove special characters such as < , ^ etc.
        normalize_whitespace=True,
        fix_unicode=True,  # fix various unicode errors
        to_ascii=True,  # transliterate to closest ASCII representation
        lower=True,  # lowercase text
        no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
        no_urls=True,  # replace all URLs with a special token
        no_emails=True,  # replace all email addresses with a special token
        no_phone_numbers=True,  # replace all phone numbers with a special token
        no_numbers=True,  # replace all numbers with a special token
        no_digits=True,  # replace all digits with a special token
        no_currency_symbols=True,  # replace all currency symbols with a special token
        no_punct=True,  # remove punctuations
        no_emoji=True,
        replace_with_punct="",  # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="<DIGIT>",
        replace_with_currency_symbol="<CUR>",
        lang="en")

    # Strip each filter pattern, collapsing spaces after every removal so a
    # later pattern sees the same text it did before this rewrite.
    for pattern in regex_filter:
        cleaned_text = re.sub(pattern, '', cleaned_text)
        cleaned_text = re.sub(' +', ' ', cleaned_text)

    stop_words = _english_stop_words()  # cached: not re-read from disk per document
    ps = PorterStemmer()

    return [ps.stem(w) for w in word_tokenize(cleaned_text) if w not in stop_words]

DO NOT RUN THE CODE BELOW AGAIN! IT ONLY NEEDS TO BE RUN ONCE, AND THAT HAS ALREADY BEEN DONE.

In [None]:
# Method to only clean the data and keep the types intact
# THIS CODE SHOULD ONLY BE RUN ONCE, AS IT TAKES A LONG TIME
end_result = []

def clean_and_store(row):
    """Clean one row's article text and append ``[cleaned_text, type]`` to ``end_result``.

    Rows whose ``content`` is missing are skipped, so they neither crash the
    cleaner nor end up in the output as empty articles.
    """
    if pd.isna(row['content']):
        return
    data_new = clean_data(row['content'], ['<url>', '<email>', '<phone>', '<number>', '<digit>', '<cur>'])
    end_result.append([' '.join(data_new), row['type']])

# Read the file in chunks so it is never held in memory all at once.
for chunk in pd.read_csv('news_cleaned_2018_02_13.csv', encoding='utf8', nrows=2_500_000, chunksize=100_000, lineterminator='\n', dtype={'content':'string', 'type':'string'}):
    # clean_and_store works through its side effect on end_result; the
    # Series of None that apply returns carries no information.
    chunk.apply(clean_and_store, axis=1)

# Save the end result without empty articles.
df_processed_end_results = pd.DataFrame(end_result, columns=['articles', 'type'])
# ' '.join(...) always produces a str, so an article with no surviving tokens
# is '' rather than NaN and dropna alone would keep it; an empty field like
# that is read back from the CSV as a missing value.
has_text = df_processed_end_results['articles'].str.strip().ne('')
df_processed_end_results = df_processed_end_results[has_text].dropna(subset=['articles'])
df_processed_end_results.to_csv('new_processed.csv', index = False)

In [None]:
#Also changes the type
# The file written by the cleaning step has the columns 'articles' and 'type'
# (there is no 'content' column here), so those are the dtypes to pin.
df_cleaned = pd.read_csv('new_processed.csv', encoding='utf8', lineterminator='\n', dtype={'articles':'string', 'type':'string'})
df_cleaned.columns = df_cleaned.columns.str.strip() #Remove unnecessary \r from the header
df_cleaned['type'] = df_cleaned['type'].replace('\r', '', regex=True) #Remove unnecessary \r from types

# Every label listed here is collapsed into the single class 'fake'; labels
# that are not listed are left unchanged.
type_change = {'unreliable' : 'fake', 'bias' : 'fake', 'clickbait' : 'fake', 'junksci' : 'fake', 'political' : 'fake', 'conspiracy' : 'fake', 'hate' : 'fake', 'rumor' : 'fake', 'satire' : 'fake'}
df_cleaned.loc[df_cleaned['type'].isin(type_change.keys()), 'type'] = df_cleaned['type'].map(type_change) #Maps the dict and changes values

# A missing label is read back as a missing value (or as '' once a trailing
# '\r' has been stripped), never as the literal string 'nan', so test for
# those explicitly in addition to the 'nan' / 'unknown' labels.
is_unlabelled = df_cleaned['type'].isna() | df_cleaned['type'].isin(['', 'nan', 'unknown'])
df_cleaned = df_cleaned[~is_unlabelled] #Removes these rows directly

df_cleaned.to_csv('new_processed_type.csv', index = False)

In [None]:
# Load the relabelled data set. Everything below refers only to the frame read
# in this cell, so the cell also works on a fresh kernel where the "run once"
# cells above were skipped and `df_cleaned` does not exist.
df_cleaned_and_changed = pd.read_csv('new_processed_type.csv', encoding='utf8', lineterminator='\n', dtype={'articles':'string', 'type':'string'})
df_cleaned_and_changed.columns = df_cleaned_and_changed.columns.str.strip() #Remove unnecessary \r from the header
df_cleaned_and_changed['type'] = df_cleaned_and_changed['type'].replace('\r', '', regex=True) #Remove unnecessary \r from types

# Empty fields in the CSV are read back as missing values; drop those rows so
# the vectorizer and label encoder only ever see real strings.
df_cleaned_and_changed = df_cleaned_and_changed.dropna(subset=['articles', 'type']).reset_index(drop=True)

In [None]:
# Bag-of-words features: token counts for every cleaned article.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_cleaned_and_changed['articles'])

# Integer-encode the labels. LabelEncoder numbers the classes in sorted order,
# so 'fake' -> 0 and 'reliable' -> 1 when those are the only labels present.
encoder = LabelEncoder()
y = encoder.fit_transform(df_cleaned_and_changed['type'])

# 80 % of the rows go to training; the remaining 20 % is then halved into a
# test set and a validation set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0
)

In [None]:
#Baseline models
# The tree is seeded so repeated runs give the same model: scikit-learn
# permutes the candidate features at every split, even with splitter='best'.
DecisionTree = DecisionTreeClassifier(random_state=0)
LogisticReg = LogisticRegression(max_iter=1000)

DecisionTree.fit(X_train, y_train)
LogisticReg.fit(X_train, y_train)

# Score on the test split only; the validation split is not used in this cell.
y_pred_decision = DecisionTree.predict(X_test)
y_pred_logistic = LogisticReg.predict(X_test)

acc_decision = accuracy_score(y_test, y_pred_decision)
acc_logistic = accuracy_score(y_test, y_pred_logistic)

# Label the numbers so the output can be read without looking at the code.
print(f'Decision tree test accuracy: {acc_decision}')
print(f'Logistic regression test accuracy: {acc_logistic}')