# 1. Import Dependencies and Load Dataset

In [1]:
import numpy as np
import pandas as pd
import string
import re

def read_tsv_rows(path):
    """Read a tab-separated text file into a list of rows.

    Every line is split on tab characters and each field is stripped of
    surrounding whitespace (which also drops the trailing newline). Lines
    that are empty after stripping are skipped, so a stray blank line cannot
    turn into a row of missing values.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    list of list of str
        One list of fields per non-blank line, in file order. The first
        entry is whatever the first line of the file contains.
    """
    with open(path, 'r', encoding="utf-8") as tsv_file:
        return [
            [field.strip() for field in line.split('\t')]
            for line in tsv_file
            if line.strip()
        ]


#load training dataset from file into pandas dataframe
#(the first row supplies the column names, the remaining rows are the data)
rows_list = read_tsv_rows('mediaeval-2015-trainingset.txt')
training = pd.DataFrame(rows_list[1:], columns = rows_list[0])

#import test data from file, turn into DataFrame
trows = read_tsv_rows('mediaeval-2015-testset.txt')
testing = pd.DataFrame(trows[1:], columns = trows[0])

# 2. Preprocessing

To improve classifier performance, the training dataset must go through some preprocessing.

A regular expression is used to remove URLs and remove punctuation marks from the tweet text, which is then converted to lower case.

We also convert all 'humor' labels to 'fake' labels since humor tweets are to be considered fake when evaluating the dataset, and it was found that better results were achieved when the two classes were merged in training instead of keeping them separate in training, and then converting 'humor' predictions into 'fake' predictions in testing.

We also apply the same tweet preprocessing steps to the tweets in the test dataset.

In [2]:
#pattern of everything stripped from a tweet; written as a raw string so the
#regex escapes are not (mis)interpreted as Python string escapes
_TWEET_NOISE_RE = re.compile(
    r"https?://t\.co/[0-9a-zA-Z]*"  #t.co short links; the dot is escaped so only a literal '.' matches
    r"|\n"                          #newline characters
                                    #NOTE(review): this matches a real newline, not the two
                                    #characters backslash + n -- confirm which was intended
    r"|&amp;|&gt;|&lt;"             #HTML entities
    r"|[#\"<>()'*\-_=+%]"           #individual punctuation characters
)

#processes a tweet's text to remove certain punctuation, remove urls and trim to lower case
def processTweet(tweet):
    """Return the tweet text with noise removed and converted to lower case.

    Removes t.co links (http or https), newline characters, the HTML
    entities ``&amp;``, ``&gt;`` and ``&lt;``, and the characters
    ``# " < > ( ) ' * - _ = + %``. Everything else, including whitespace
    around removed pieces, is kept as is.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    return _TWEET_NOISE_RE.sub('', tweet).lower()

#process each tweet, replace tweet series with the processed text
#(Series.map applies processTweet to every element, so no intermediate list is needed)
training['tweetText'] = training['tweetText'].map(processTweet)

#change humour label to fake label in training
#(only values exactly equal to 'humor' are swapped; every other label is kept unchanged)
training['label'] = training['label'].replace('humor', 'fake')

#preprocess test set in the same way
#(Series.map applies processTweet to every element, so no intermediate list is needed)
testing['tweetText'] = testing['tweetText'].map(processTweet)


# 3. Create algorithm pipeline, make predictions and evaluate classifier performance

We build our classifier in the form of a Pipeline to make the process of building and training the classifier more concise.
First, a CountVectorizer tokenises tweets, converts them into n-grams and adds them into a bag of words.
Next, a TfidfTransformer applies Term Frequency–Inverse Document Frequency (TF-IDF) weighting to the n-grams in each tweet, which identifies how frequently each phrase appears in a tweet compared to the rest of the bag of words.
Finally, a MultinomialNB is trained on the tf-idf feature vectors of each tweet.

After training the classifier, we pass in the preprocessed test dataset and obtain class predictions for each tweet. We then evaluate the classifier by comparing the predictions to the ground truths provided, and calculate an F1 score for the classifier.

In [3]:
#create algorithm pipeline
#first tokenise text using CountVectorizer, removal of stopwords and creation of ngrams occurs here
#then calculate tf-idf of tokens
#then train MultinomialNB classifier using tf-idf
#this pipeline contains the best parameters found for chosen algorithm design, use this to check f1 scores
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

#the three named stages: n-gram counts -> tf-idf weighting -> naive Bayes
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1,7), max_features=17500)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
tweet_clf = Pipeline(pipeline_steps)

#train on the preprocessed training tweets, then label the preprocessed test tweets
#(fit returns the fitted pipeline itself, so predict can be chained onto it)
predicted = tweet_clf.fit(training['tweetText'], training['label']).predict(testing['tweetText'])

#get metrics: micro-averaged F1 first, then the per-class report
micro_f1 = metrics.f1_score(testing['label'], predicted, average='micro')
print(micro_f1)
print(metrics.classification_report(testing['label'], predicted))

0.8614123247818037
              precision    recall  f1-score   support

        fake       0.87      0.93      0.90      2564
        real       0.84      0.71      0.77      1217

    accuracy                           0.86      3781
   macro avg       0.85      0.82      0.83      3781
weighted avg       0.86      0.86      0.86      3781



# 4. Parameter tuning

Different combinations of n-gram ranges and max features lead to different F1 scores. Because of this, we want to find the combination of the two which yields the best score.

We do this here by iterating through different classifier configurations and obtaining their F1 score.

In [102]:
#evaluate different configurations of algorithm design
#adjust list values for different intervals
feature_intervals = [2000, 3000, 5000, 7500, 10000, 15000, 20000, 30000, 40000]  #coarse grid; not read by the loop below
intervals = list(range(10000, 15001, 500))  #10000, 10500, ..., 15000

#adjust ranges to try different n-gram ranges
ngram_ranges = [(low, high) for low in range(1,2) for high in range(3,5)]

#NOTE(review): every configuration is fitted on `training` and scored on `testing`,
#so the parameters picked from this sweep are tuned against the test set itself
for min_ngrams, max_ngrams in ngram_ranges:
    for features in intervals:
        vectorizer = CountVectorizer(ngram_range=(min_ngrams,max_ngrams), max_features = features)
        config_clf = Pipeline([
            ('vect', vectorizer),
            ('tfidf', TfidfTransformer()),
            ('clf', MultinomialNB()),
        ])
        #fit returns the fitted pipeline, so predict is chained directly
        config_predict = config_clf.fit(training['tweetText'], training['label']).predict(testing['tweetText'])
        config_score = metrics.f1_score(testing['label'], config_predict, average='micro')
        #one line per configuration: "(min,max), max_features: score"
        print(f"({min_ngrams},{max_ngrams}), {features}: {config_score!s}")

(1,3), 10000: 0.7751917482147579
(1,3), 10500: 0.8328484527902671
(1,3), 11000: 0.7741338270298864
(1,3), 11500: 0.8328484527902671
(1,3), 12000: 0.7746627876223221
(1,3), 12500: 0.775456228510976
(1,3), 13000: 0.7751917482147579
(1,3), 13500: 0.7746627876223221
(1,3), 14000: 0.775456228510976
(1,3), 14500: 0.7667283787357841
(1,3), 15000: 0.7667283787357841
(1,4), 10000: 0.6612007405448294
(1,4), 10500: 0.7889447236180903
(1,4), 11000: 0.7900026448029621
(1,4), 11500: 0.7929119280613595
(1,4), 12000: 0.7913250462840519
(1,4), 12500: 0.7915895265802699
(1,4), 13000: 0.7923829674689234
(1,4), 13500: 0.7913250462840519
(1,4), 14000: 0.7918540068764877
(1,4), 14500: 0.7931764083575774
(1,4), 15000: 0.7929119280613595
