# Setup

In [2]:
import pandas as pd

In [3]:
train_df = pd.read_csv('./dataset/mediaeval-2015-trainingset.txt', sep='\t', index_col=None)
test_df = pd.read_csv('./dataset/mediaeval-2015-testset.txt', sep='\t', index_col=None)

# Preprocessing

In [4]:
from nltk import TweetTokenizer, WordNetLemmatizer
import re

tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
lemmatizer = WordNetLemmatizer()
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
emoji_regex = re.compile("["
                         u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                         "]+", flags=re.UNICODE)

def count_emojis(text: str) -> int:
    return len(re.findall(emoji_regex, text))

def count_urls(text: str) -> int:
    return len(re.findall(url_regex, text))

def count_hashtags(text: str) -> int:
    return len([word for word in tokenizer.tokenize(text) if word[0] == '#'])

In [5]:
def filter_text(text: str) -> str:
    text = re.sub(emoji_regex, '', text)
    tokens = tokenizer.tokenize(text)
    tokens = [lemmatize_word(word) for word in tokens]

    return ' '.join(tokens)

def lemmatize_word(word: str) -> str:
    if word[0] == '#':
        return '#' + lemmatizer.lemmatize(word[1:])
    else:
        return lemmatizer.lemmatize(word)


## Feature Extraction

In [6]:
train_df['emojis'] = train_df['tweetText'].apply(lambda text: count_emojis(text))
train_df['hashtags'] = train_df['tweetText'].apply(lambda text: count_hashtags(text))
train_df['filteredTweetText'] = train_df['tweetText'].apply(lambda text: filter_text(text))

In [7]:
test_df['emojis'] = test_df['tweetText'].apply(lambda text: count_emojis(text))
test_df['hashtags'] = test_df['tweetText'].apply(lambda text: count_hashtags(text))
test_df['filteredTweetText'] = test_df['tweetText'].apply(lambda text: filter_text(text))

In [8]:
train_df[['tweetId', 'tweetText', 'emojis', 'hashtags', 'filteredTweetText']].head(25)

Unnamed: 0,tweetId,tweetText,emojis,hashtags,filteredTweetText
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,0,1,¿ se acuerdan de la película : “ el día despué...
1,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,0,0,@milenagimon : miren a sandy en ny ! tremenda ...
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",0,2,"buena la foto del huracán sandy , me recuerda ..."
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,0,2,scary shit #hurricane #ny http://t.co/e4JLBUfH
4,263018881839411200,My fave place in the world #nyc #hurricane #sa...,1,4,my fave place in the world #nyc #hurricane #sa...
5,263364439582060545,42nd #time #square #NYC #subway #hurricane htt...,0,5,42nd #time #square #nyc #subway #hurricane htt...
6,262927032705490944,Just in time for #halloween a photo of #hurric...,0,4,just in time for #halloween a photo of #hurric...
7,263321078884077568,Crazy pic of #Hurricane #Sandy prayers go out ...,0,2,crazy pic of #hurricane #sandy prayer go out t...
8,263111677485142017,#sandy #newyork #hurricane #statueofliberty #U...,0,5,#sandy #newyork #hurricane #statueofliberty #u...
9,262977091983785985,#nyc #hurricane http://t.co/Gv3QxZlq,0,2,#nyc #hurricane http://t.co/Gv3QxZlq


## Label Changes

In [9]:
train_df['label'] = train_df['label'].apply(lambda label: 'fake' if label == 'humor' else label)
test_df['label'] = test_df['label'].apply(lambda label: 'fake' if label == 'humor' else label)

## Training set Changes

In [10]:
train_df.drop_duplicates(subset=['filteredTweetText'], keep='first', inplace=True, ignore_index=False)

# Prediction on Test Set

In [11]:
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## With Feature Extraction

In [12]:
tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(5, 5))
emojis_scalar = MinMaxScaler()
hashtags_scalar = MinMaxScaler()
model = MultinomialNB()

X_train = hstack([
            tfidf.fit_transform(train_df['filteredTweetText']),
            emojis_scalar.fit_transform(train_df['emojis'].values.reshape(-1, 1)),
            hashtags_scalar.fit_transform(train_df['hashtags'].values.reshape(-1, 1))
        ])
y_train = train_df['label']

model.fit(X_train, y_train)

X_test = hstack([
            tfidf.transform(test_df['filteredTweetText']),
            emojis_scalar.transform(test_df['emojis'].values.reshape(-1, 1)),
            hashtags_scalar.transform(test_df['hashtags'].values.reshape(-1, 1))
        ])
y_test = test_df['label']

y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9051930758988016

## Without Feature Extraction

In [13]:
tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(5, 5))
model = MultinomialNB()

X_train = tfidf.fit_transform(train_df['filteredTweetText'])
y_train = train_df['label']

model.fit(X_train, y_train)

X_test = tfidf.transform(test_df['filteredTweetText'])
y_test = test_df['label']

y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9051930758988016