In [33]:
import pandas as pd

In [34]:
# Load the MediaEval 2015 training and test splits; both are tab-separated text files.
train_df = pd.read_csv(
    "./dataset/mediaeval-2015-trainingset.txt",
    sep="\t",
    index_col=None,
)
test_df = pd.read_csv(
    "./dataset/mediaeval-2015-testset.txt",
    sep="\t",
    index_col=None,
)

# Exploration

In [35]:
# Number of rows whose raw tweet text repeats an earlier row.
train_df["tweetText"].duplicated().sum()

1901

In [36]:
from nltk import WordNetLemmatizer, TweetTokenizer

# Shared NLTK helpers used by lemmatize_word / filter_text below.
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)

# Baseline duplicate count on the raw text, for comparison with the filtered text.
print(f'Number of duplicate tweets before filter: {train_df["tweetText"].duplicated().sum()}')

def lemmatize_word(word: str) -> str:
    """Lemmatize a single token, keeping a leading ``#`` in place.

    For a hashtag token the ``#`` is stripped, the remainder is passed to the
    module-level ``lemmatizer``, and the ``#`` is put back in front of the
    result. Any other token is lemmatized as-is.

    Args:
        word: One token. May be empty.

    Returns:
        The lemmatized token; an empty token is returned unchanged.
    """
    if not word:
        # Guard: indexing ``word[0]`` on an empty string would raise IndexError.
        return word
    if word.startswith('#'):
        return '#' + lemmatizer.lemmatize(word[1:])
    return lemmatizer.lemmatize(word)

def filter_text(text: str) -> str:
    """Tokenize ``text``, lemmatize every token, and join the result with spaces.

    Tokenization is done by the module-level ``tokenizer``; each token is then
    passed through ``lemmatize_word``.
    """
    # Stage 1 (before tokenization): no raw-text clean-up is applied yet.
    raw_tokens = tokenizer.tokenize(text)

    # Stage 2 (after tokenization): per-token lemmatization.
    return ' '.join(map(lemmatize_word, raw_tokens))

# Store the normalised text next to the raw text, then re-count duplicates on it.
train_df['filteredTweetText'] = train_df['tweetText'].apply(filter_text)
print(f'Number of duplicate tweets after filter: {train_df["filteredTweetText"].duplicated().sum()}')

Number of duplicate tweets before filter: 1901
Number of duplicate tweets after filter: 1902


In [37]:
# Preview the first 15 raw tweets.
train_df["tweetText"].head(n=15)

0     ¿Se acuerdan de la película: “El día después d...
1     @milenagimon: Miren a Sandy en NY!  Tremenda i...
2     Buena la foto del Huracán Sandy, me recuerda a...
3        Scary shit #hurricane #NY http://t.co/e4JLBUfH
4     My fave place in the world #nyc #hurricane #sa...
5     42nd #time #square #NYC #subway #hurricane htt...
6     Just in time for #halloween a photo of #hurric...
7     Crazy pic of #Hurricane #Sandy prayers go out ...
8     #sandy #newyork #hurricane #statueofliberty #U...
9                  #nyc #hurricane http://t.co/Gv3QxZlq
10    robertosalibaba  god be with u brother #sandy ...
11        #Crazy #Hurricane #Sandy http://t.co/0zrMsgvs
12    #shark #newjersey #swim #sandy #hurricane  ...
13    Good luck #ny #newyork #usa #hurricane #sandy ...
14    Wow.... Fishing anyone? #hurricane #sandy http...
Name: tweetText, dtype: object

In [38]:
import re

# Compiled once at definition time rather than on every call.
#
# The first alternative matches a pair of regional-indicator symbols so that a
# flag (two code points) is counted as one emoji. The second alternative
# matches exactly one code point from the listed blocks; there is deliberately
# no `+` quantifier, so adjacent emoji are counted individually.
#
# The listed blocks are a pragmatic subset, not the full Unicode emoji set.
# They intentionally do NOT use a single U+24C2..U+1F251 range, because that
# span also contains CJK, Hangul and many other non-emoji characters.
_EMOJI_PATTERN = re.compile(
    "[\U0001F1E6-\U0001F1FF]{2}"  # one flag = two regional indicators
    "|["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, regional indicators
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "]",
    flags=re.UNICODE,
)


def count_emojis(text: str) -> int:
    """Count the emoji occurring in ``text``.

    Each emoji code point is counted separately, so three consecutive emoji
    yield 3. Two adjacent regional-indicator symbols are counted as a single
    flag. Text in non-Latin scripts (e.g. CJK, Hangul) is not counted.

    Args:
        text: The string to scan.

    Returns:
        The number of emoji matches found; 0 for an empty string.
    """
    return len(_EMOJI_PATTERN.findall(text))

In [39]:
def count_hashtags(text: str) -> int:
    """Count the whitespace-separated words in ``text`` that start with ``#``.

    Args:
        text: The string to scan.

    Returns:
        The number of words beginning with ``#``; 0 for empty or
        whitespace-only text.
    """
    # str.split() with no separator collapses runs of whitespace and never
    # yields empty strings, so consecutive/leading/trailing spaces are safe.
    return sum(1 for word in text.split() if word.startswith('#'))

# Model

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [41]:
# TF-IDF features over character n-grams of length exactly 5 ('char_wb' analyzer).
tfidf = TfidfVectorizer(
    ngram_range=(5, 5),
    analyzer="char_wb",
)

In [42]:
# Multinomial Naive Bayes estimator, constructed with the library's default arguments.
model = MultinomialNB()