In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from cleantext import clean
from collections import Counter
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import swifter

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [8]:
# Load the raw corpus (article text and label only, first 1.6M rows) and drop rows
# that have no content at all.
end_result = pd.read_csv(
    'news_cleaned_2018_02_13.csv',
    encoding='utf8',
    nrows=1_600_000,
    dtype={'content': 'string', 'type': 'string'},
    usecols=['content', 'type'],
    lineterminator='\n',
    skip_blank_lines=True,
).dropna(subset=['content'])
print("Content without empty: ", len(end_result.index))

# read_csv turns empty / "nan" cells into missing values (<NA>), not into the literal
# string 'nan', so isin(['nan', ...]) alone never removes unlabeled rows; notna() does.
has_usable_type = end_result['type'].notna() & ~end_result['type'].isin(['nan', 'unknown'])
end_result = end_result[has_usable_type]
print("Content without Nan and unknown: ", len(end_result.index))

# Keep only the first occurrence of each distinct article text.
end_result = end_result.drop_duplicates(subset=['content'])
print("Content without duplicates: ", len(end_result.index))

# The index is written as an unnamed first column; later cells rely on that layout.
end_result.to_csv('content_type_data.csv', columns=['content', 'type'])

Content without empty:  1600000
Content without Nan and unknown:  1546834
Content without duplicates:  1077683


In [2]:
def clean_data(input_text):
    cleaned_text = re.sub(r'(\S+\.com*\S+)', '<url>', input_text)
    cleaned_text = re.sub(r'(\S+\.net*\S+)', '<url>', cleaned_text)
    cleaned_text = re.sub(r'\-', ' ', cleaned_text)
    cleaned_text = re.sub(r'\|', ' ', cleaned_text)
    cleaned_text = clean(cleaned_text,  # does not remove special characters such as < , ^ etc.
        normalize_whitespace=True,
        fix_unicode=True,  # fix various unicode errors
        to_ascii=True,  # transliterate to closest ASCII representation
        lower=True,  # lowercase text
        no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
        no_urls=True,  # replace all URLs with a special token
        no_emails=True,  # replace all email addresses with a special token
        no_phone_numbers=True,  # replace all phone numbers with a special token
        no_numbers=True,  # replace all numbers with a special token
        no_digits=True,  # replace all digits with a special token
        no_currency_symbols=True,  # replace all currency symbols with a special token
        no_punct=True,  # remove punctuations
        no_emoji=True,
        replace_with_punct="",  # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="<DIGIT>",
        replace_with_currency_symbol="<CUR>",
        lang="en")

    return cleaned_text

In [3]:
# Run clean_data over every article and write the result to cleaned_text.csv.
#
# The data is streamed in chunks: the previous version loaded the whole corpus, built a
# one-row DataFrame per article and re-opened the output file for every single row.
# The output layout is unchanged: header ",content,type", then one "index,content,type"
# line per article.
pd.DataFrame(columns=['content', 'type']).to_csv("cleaned_text.csv")  # header row only

for chunk in pd.read_csv('content_type_data.csv', encoding='utf8', usecols=['content', 'type'],
                         dtype={'content': 'string', 'type': 'string'}, chunksize=100_000):
    chunk['content'] = chunk['content'].map(clean_data)
    # Row labels keep counting from one chunk to the next, so the written index
    # column still runs 0..n-1 over the whole file.
    chunk.to_csv('cleaned_text.csv', columns=['content', 'type'], mode='a', header=False)

In [4]:
# Collapse the fine-grained source labels into the two classes 'fake' and 'reliable'.
df_cleaned = pd.read_csv('cleaned_text.csv', encoding='utf8', dtype={'content':'string', 'type':'string'})

# Strip stray carriage returns BEFORE anything looks at the labels: a header such as
# 'type\r' or a value such as 'bias\r' would otherwise slip past the mapping below.
df_cleaned.columns = df_cleaned.columns.str.strip()
df_cleaned = df_cleaned.dropna(subset=['content'])
df_cleaned['type'] = df_cleaned['type'].astype('string').str.replace('\r', '', regex=False)

df_cleaned['type'] = df_cleaned['type'].replace(['unreliable', 'bias', 'junksci', 'conspiracy', 'hate', 'rumor', 'satire', 'state'], 'fake')
df_cleaned['type'] = df_cleaned['type'].replace(['political', 'clickbait'], 'reliable')

df_cleaned.to_csv('cleaned_changed_types.csv', columns=['content', 'type'])

In [2]:
def remove_placeholder_words(input_text):
    """Drop ``<word>`` placeholder tokens and squeeze runs of spaces.

    Each ``<...>`` token consisting only of word characters is replaced by a single
    space; afterwards every run of spaces is collapsed to one space. A leading or
    trailing space left behind by a removed token is kept.
    """
    placeholder_pattern = re.compile(r'\<\w+\>')
    space_run_pattern = re.compile(' +')

    without_placeholders = placeholder_pattern.sub(' ', input_text)
    return space_run_pattern.sub(' ', without_placeholders)

In [5]:
# Strip the placeholder tokens from every article and write the result to
# cleaned_removed_placeholder.csv.
#
# The data is streamed in chunks: the previous version loaded the whole corpus, built a
# one-row DataFrame per article and re-opened the output file for every single row.
# The output layout is unchanged: header ",content,type", then one "index,content,type"
# line per article.
pd.DataFrame(columns=['content', 'type']).to_csv("cleaned_removed_placeholder.csv")  # header row only

for chunk in pd.read_csv('cleaned_changed_types.csv', encoding='utf8', usecols=['content', 'type'],
                         dtype={'content': 'string', 'type': 'string'}, chunksize=100_000):
    chunk['content'] = chunk['content'].map(remove_placeholder_words)
    # Row labels keep counting from one chunk to the next, so the written index
    # column still runs 0..n-1 over the whole file.
    chunk.to_csv('cleaned_removed_placeholder.csv', columns=['content', 'type'], mode='a', header=False)

In [7]:
# Re-read the placeholder-free corpus, drop rows whose content ended up empty, and
# save it back. Row counts before and after the drop are printed for comparison.
df = pd.read_csv('cleaned_removed_placeholder.csv', encoding='utf8', dtype={'content':'string', 'type':'string'})
print(len(df.index))
# dropna returns a new frame; without the assignment no row is actually removed.
df = df.dropna(subset=['content'])
print(len(df.index))
df.to_csv('cleaned_removed_placeholder.csv', columns=['content', 'type'])

1077038


In [2]:
# Filled on first use so the stop-word list is fetched from NLTK once, not once per article.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, building it only on the first call."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def remove_stop_words(input_text):
    """Remove English stop words from a text.

    Args:
        input_text: Text to filter; it is split with ``word_tokenize``.

    Returns:
        The tokens that are not in NLTK's English stop-word list, joined by single
        spaces. The comparison is case-sensitive, exactly as before.
    """
    stop_words = _english_stop_words()
    return ' '.join(token for token in word_tokenize(input_text) if token not in stop_words)

In [5]:
# Remove stop words from every article and write the result to
# cleaned_removed_stop_words.csv.
#
# The data is streamed in chunks: the previous version loaded the whole corpus, built a
# one-row DataFrame per article and re-opened the output file for every single row.
# The output layout is unchanged: header ",content,type", then one "index,content,type"
# line per article.
pd.DataFrame(columns=['content', 'type']).to_csv("cleaned_removed_stop_words.csv")  # header row only

for chunk in pd.read_csv('cleaned_removed_placeholder.csv', encoding='utf8', usecols=['content', 'type'],
                         dtype={'content': 'string', 'type': 'string'}, chunksize=100_000):
    chunk['content'] = chunk['content'].map(remove_stop_words)
    # Row labels keep counting from one chunk to the next, so the written index
    # column still runs 0..n-1 over the whole file.
    chunk.to_csv('cleaned_removed_stop_words.csv', columns=['content', 'type'], mode='a', header=False)

In [11]:
# Re-read the stop-word-filtered corpus, discard rows whose content is missing,
# report how many rows remain and overwrite the file with the reduced set.
df = pd.read_csv(
    'cleaned_removed_stop_words.csv',
    encoding='utf8',
    dtype={'content': 'string', 'type': 'string'},
)
df = df.dropna(subset=['content'])
print(df.shape[0])
df.to_csv('cleaned_removed_stop_words.csv', columns=['content', 'type'])

1075189
1075189


In [2]:
def stemming_words(input_text):
    """Porter-stem every token of ``input_text``.

    The text is split with ``word_tokenize``; each token is stemmed and the stems are
    joined back together with single spaces.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in word_tokenize(input_text))

In [3]:
# Stem every article and write the result to cleaned_stemmed.csv.
#
# The data is streamed in chunks: the previous version loaded the whole corpus, built a
# one-row DataFrame per article and re-opened the output file for every single row.
# The output layout is unchanged: header ",content,type", then one "index,content,type"
# line per article.
pd.DataFrame(columns=['content', 'type']).to_csv("cleaned_stemmed.csv")  # header row only

for chunk in pd.read_csv('cleaned_removed_stop_words.csv', encoding='utf8', usecols=['content', 'type'],
                         dtype={'content': 'string', 'type': 'string'}, chunksize=100_000):
    chunk['content'] = chunk['content'].map(stemming_words)
    # Row labels keep counting from one chunk to the next, so the written index
    # column still runs 0..n-1 over the whole file.
    chunk.to_csv('cleaned_stemmed.csv', columns=['content', 'type'], mode='a', header=False)

In [4]:
# Load the stemmed corpus and drop rows whose content is missing, printing the row
# count before and after the drop.
df = pd.read_csv('cleaned_stemmed.csv', encoding='utf8', dtype={'content':'string', 'type':'string'})
print(len(df.index))
# dropna returns a new frame; without the assignment no row is actually removed.
df = df.dropna(subset=['content'])
print(len(df.index))

1075189
1075189


Everything works up to here.

In [None]:
def placeholder_word_counter(input_text, input_list):
    """Count how often each placeholder token occurs across a collection of rows.

    Args:
        input_text: Iterable of rows whose first element (``row[0]``) is the text to
            scan. Rows whose text is not a ``str`` are skipped.
        input_list: Placeholder tokens to count. When empty or ``None`` the six
            default tokens (``<url>``, ``<email>``, ``<phone>``, ``<number>``,
            ``<digit>``, ``<cur>``) are used.

    Returns:
        A list of ``[placeholder, count]`` pairs, in the order of the placeholders.
    """
    placeholders = list(input_list) if input_list else ['<url>', '<email>', '<phone>', '<number>', '<digit>', '<cur>']
    placeholder_counter = [[placeholder, 0] for placeholder in placeholders]

    for row in input_text:
        text = row[0]
        if not isinstance(text, str):
            continue
        # Every entry is visited; the previous range(0, len - 1) never reached the last
        # placeholder, so '<cur>' was always reported as 0.
        for entry in placeholder_counter:
            entry[1] += text.count(entry[0])

    return placeholder_counter

In [3]:
def word_counter(input_text):
    """Tally token frequencies over the first element of every row.

    ``input_text`` is an iterable of rows; ``row[0]`` is tokenised with
    ``word_tokenize`` and its tokens are added to one shared ``Counter``,
    which is returned.
    """
    totals = Counter()
    for row in input_text:
        totals.update(word_tokenize(row[0]))
    return totals

In [None]:
# Top-100 word frequencies over the first 100k rows of the label-collapsed corpus.
df = pd.read_csv('cleaned_changed_types.csv', encoding='utf8', nrows=100000, dtype={'content':'string', 'type':'string'})

# Words are tallied one document at a time. Calling value_counts() directly on the
# result of .str.split() operates on whole token lists instead of individual words.
word_frequencies = Counter()
for text in df['content'].dropna():
    word_frequencies.update(text.split())

# Series indexed by word, most frequent first.
s = pd.Series(dict(word_frequencies.most_common(100)), dtype='int64')
print(s)

In [None]:
# Remove the placeholder tokens from cleaned_text_new.csv and save the result as
# filtered_placeholder_words.csv.
# NOTE: cleaned_text_new.csv is written by the cell that calls clean_data on
# content_type_data.csv, so that cell has to run before this one.
end_result = []


def clean_and_store(row):
    """Append ``[content without placeholder tokens, type]`` for ``row`` to ``end_result``."""
    end_result.append([remove_placeholder_words(row['content']), row['type']])


# chunksize alone streams the file to its end; reading the whole file once more just to
# pass its length as nrows is unnecessary.
for chunk in pd.read_csv('cleaned_text_new.csv', encoding='utf8', chunksize=100000, dtype={'content':'string', 'type':'string'}):
    # Called only for its side effect on end_result.
    chunk.apply(clean_and_store, axis=1)

word_counter_results = word_counter(end_result)

df_processed_end_results = pd.DataFrame(end_result, columns=['content', 'type'])
df_processed_end_results.to_csv('filtered_placeholder_words.csv')

In [None]:
# Run clean_data over content_type_data.csv, gather word and placeholder statistics,
# and save the cleaned rows as cleaned_text_new.csv.
end_result = []


def clean_and_store(row):
    """Append ``[cleaned content, type]`` for ``row`` to ``end_result``."""
    end_result.append([clean_data(row['content']), row['type']])


# chunksize alone streams the file to its end; reading the whole file once more just to
# pass its length as nrows is unnecessary.
for chunk in pd.read_csv('content_type_data.csv', encoding='utf8', chunksize=100000, dtype={'content':'string', 'type':'string'}):
    # Called only for its side effect on end_result.
    chunk.apply(clean_and_store, axis=1)


word_counter_result = word_counter(end_result)
#print(word_counter_result)
placeholder_word_counter_result = placeholder_word_counter(end_result, ['<url>', '<email>', '<phone>', '<number>', '<digit>', '<cur>'])
#print(placeholder_word_counter_result)

df_processed_end_results = pd.DataFrame(end_result, columns=['content', 'type'])
df_processed_end_results.to_csv('cleaned_text_new.csv')

In [None]:
# Stem filtered_placeholder_words.csv and save the result as
# filtered_and_stemmed_words.csv.
end_result = []


def clean_and_store(row):
    """Append ``[stemmed content, type]`` for ``row`` to ``end_result``."""
    # stemming_words already returns one joined string; joining it again with ' '
    # would put a space between every single character.
    end_result.append([stemming_words(row['content']), row['type']])


# chunksize alone streams the file to its end; reading the whole file once more just to
# pass its length as nrows is unnecessary.
for chunk in pd.read_csv('filtered_placeholder_words.csv', encoding='utf8', chunksize=10000, dtype={'content':'string', 'type':'string'}):
    # Called only for its side effect on end_result.
    chunk.apply(clean_and_store, axis=1)

print(word_counter(end_result))

df_processed_end_results = pd.DataFrame(end_result, columns=['content', 'type'])
df_processed_end_results.to_csv('filtered_and_stemmed_words.csv')

In [None]:
# Collapse the source labels of the first 10k rows of filtered_placeholder_words.csv
# into 'fake' / 'reliable' and save them as removed_.csv.
#
# The set matches the mapping used when cleaned_changed_types.csv is written, plus the
# 'fake' label itself: before, 'fake' was missing from the list, so articles labelled
# 'fake' ended up as 'reliable'.
# NOTE(review): 'clickbait' now counts as reliable and 'bias' as fake, to agree with that
# earlier mapping - confirm this is the intended grouping.
fake_types = frozenset(['fake', 'unreliable', 'bias', 'junksci', 'conspiracy', 'hate', 'rumor', 'satire', 'state'])

end_result = []


def clean_and_store(row):
    """Append ``[content, 'fake' | 'reliable']`` for ``row`` to ``end_result``.

    Rows without a label are skipped: a missing value cannot be tested with ``in``
    and an unlabeled article should not be counted as reliable.
    """
    if pd.isna(row['type']):
        return
    type_name = 'fake' if row['type'] in fake_types else 'reliable'
    end_result.append([row['content'], type_name])


for chunk in pd.read_csv('filtered_placeholder_words.csv', encoding='utf8', nrows=10000, chunksize=1000, dtype={'content':'string', 'type':'string'}):
    # Called only for its side effect on end_result.
    chunk.apply(clean_and_store, axis=1)


df_processed_end_results = pd.DataFrame(end_result, columns=['content', 'type'])
df_processed_end_results.to_csv('removed_.csv')

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
# Baseline classifiers on bag-of-words features.
# Rows without text or label cannot be vectorised / encoded, so they are dropped.
df_processed_end_results = pd.read_csv('removed_.csv').dropna(subset=['content', 'type'])

encoder = LabelEncoder() #Good for binary use, and sets fake as 0 and reliable as 1
y = encoder.fit_transform(df_processed_end_results['type'])
texts = df_processed_end_results['content']

# 80 % train, 10 % test, 10 % validation. The raw texts are split first so that the
# vocabulary below is learned from the training set only.
texts_train, texts_rest, y_train, y_rest = train_test_split(texts, y, train_size=0.8, test_size=0.2, random_state=0)
texts_test, texts_val, y_test, y_val = train_test_split(texts_rest, y_rest, test_size=0.5, random_state=0)

vectorizer = CountVectorizer() #Counts and vectorizes
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
X_val = vectorizer.transform(texts_val)  # held out; not used for the scores below

#Baseline models
# Instance names differ from the imported class names, so re-running this cell does
# not try to call an already-fitted model as if it were the class.
decision_tree_model = DecisionTreeClassifier(random_state=0)
logistic_model = LogisticRegression(max_iter=1000)

decision_tree_model.fit(X_train, y_train)
logistic_model.fit(X_train, y_train)

# Both models are scored on the same test split; previously the logistic model
# predicted on X_val while its accuracy was computed against y_test.
y_pred_decision = decision_tree_model.predict(X_test)
y_pred_logistic = logistic_model.predict(X_test)

acc_decision = accuracy_score(y_test, y_pred_decision)
acc_logistic = accuracy_score(y_test, y_pred_logistic)

print(acc_decision)
print(acc_logistic)