In [42]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
import words_repeated_char
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [17]:
def read_csv(file_path, delimiter = ' '):
    """Load a delimited text file into a pandas.DataFrame of strings.

    We had some issues loading the data using pandas.read_csv, so we built
    our own loader. The first line supplies the column names; every following
    line is split on ``delimiter`` and its fields are appended, in order, to
    the matching columns. No quoting, escaping or type conversion is applied.

    Parameters
    ----------
    file_path : path of the file to read.
    delimiter : field separator (a single space by default).

    Returns
    -------
    pandas.DataFrame with one column per header field; all values are str.

    Raises
    ------
    IndexError
        If a row contains more fields than the header line.
    ValueError
        Raised by pandas when rows with fewer fields than the header leave
        the columns with unequal lengths.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(file_path) as fp:
        # Strip only the line terminator: slicing off the last character
        # unconditionally would eat real data when the file does not end
        # with a newline.
        labels = fp.readline().rstrip('\n').split(delimiter)
        data_dict = {label: [] for label in labels}
        for line in fp:
            for i, field in enumerate(line.rstrip('\n').split(delimiter)):
                data_dict[labels[i]].append(field)
    return pd.DataFrame(data_dict)

In [22]:
def _normalize_text(tweet, removal_table):
    """Lower-case one raw tweet and canonicalise it ahead of tokenisation.

    URLs, hashtags, mentions, e-mail addresses, dates, times and emojis are
    replaced by upper-case placeholder words, non-ASCII characters are
    dropped, repeated punctuation is collapsed, and every character listed in
    ``removal_table`` (a ``str.translate`` table) is deleted.
    Returns the cleaned string.
    """
    # lower the tweet and restore the characters that twitter has translated to
    # escape sequences / html entities to their original value
    tmp = (tweet.lower().replace("\\n",' ').replace('\\xa0', ' ').replace('\\r', ' ').replace("\\'","'")
           .replace("&lt;",'<').replace("&gt;",'>').replace(" &amp; ", " and ").replace("&amp;", "&"))
    # E-mail addresses have to be handled first: the URL pattern would otherwise
    # swallow the domain and the mention pattern the "@word" part, so the
    # EMAIL placeholder could never be produced.
    tmp = re.sub(r"\w+@\w+\.[a-z]{2,3}", "EMAIL", tmp)
    tmp = re.sub(r"(http(s)?://)?(www\.)?([a-zA-Z0-9])+\.[a-z]{1,3}(/\S*)?",'URL', tmp)
    tmp = re.sub(r"#\w+", 'HASHTAG', tmp)
    tmp = re.sub(r"@\w+", 'USER', tmp)
    tmp = re.sub(r"[0-9]{1,2}/[0-9]{1,2}/([0-9]{4}|[0-9]{2})|([0-9]{4}|[0-9]{2})/[0-9]{1,2}/[0-9]{1,2}|[0-9]{2}/[0-9]{2}", "DATE", tmp)
    tmp = re.sub(r"[0-9]{2}(:[0-9]{2}){1,2}( ?(am|pm))?", "TIME", tmp)
    tmp = re.sub("(([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF]))+", ' EMOJI ', tmp)
    # get rid of the non ascii characters
    tmp = tmp.encode(encoding='ascii', errors='ignore').decode()
    # collapse runs of punctuation
    tmp = re.sub(r"\.{2,}", "...", tmp)
    for c in ['!', '-', ',']:
        tmp = re.sub("{}+".format(c), c, tmp)
    tmp = re.sub(r"\?+", "?", tmp)
    tmp = re.sub("`+", "'", tmp)
    tmp = re.sub("'{2,}", "'", tmp)
    # removing all the other special characters
    return tmp.translate(removal_table)


def preprocess(tweets):
    """Clean, tokenise and lemmatise a collection of tweets.

    Parameters
    ----------
    tweets : iterable of str, one raw tweet per element.

    Returns
    -------
    list of list of str: for each tweet, in input order, its lemmatised tokens.
    """
    lemmatizer = WordNetLemmatizer()
    # The deletion table is the same for every tweet, so build it once.
    removal_table = str.maketrans(dict.fromkeys('#*+/<=>@[\\]^_`{|}~'))
    X = []
    for tweet in tweets:
        tmp = _normalize_text(tweet, removal_table)
        # tokenizing using nltk.word_tokenize. However, it transforms '"' into '``' or "''",
        # a behaviour we do not want, so we make sure the '"' are changed back to their original form
        tokens = [t if t not in ["''", "``"] else '"' for t in word_tokenize(tmp) ]
        for i, token in enumerate(tokens):
            if re.search(r"([a-z])\1{2,}", token):
                # cleaning the words containing a letter repeated 3 times or more, using the list of the English words
                tokens[i] = words_repeated_char.clean(token)
            tokens[i] = lemmatizer.lemmatize(tokens[i])
        X.append(tokens)
    return X

In [38]:
# Load the tab-separated hydrated tweets with the notebook's own read_csv loader;
# the cell below reads the "tweet_content" and "label" columns of this frame.
data = read_csv("data/hydrated/hateful_tweets_filtered.csv", delimiter='\t')
# Load the second, pickled dataset; the cell below reads its "tweet" and "class" columns.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
data2 = pd.read_pickle("data/originals/labeled_data.p")

In [None]:
# Integer id assigned to each textual class found in the "label" column of `data`.
label2int = {"hateful": 0, "abusive": 1, "normal": 2, "spam": 3}

# First dataset: tokenised tweets plus their labels mapped to integer ids.
X = preprocess(data["tweet_content"].values)
y = list(map(label2int.__getitem__, data["label"].values))

# Second dataset: its "class" values are appended as they are
# (presumably already ids compatible with label2int - TODO confirm).
X2 = preprocess(data2["tweet"].values)
X = [*X, *X2]
y = [*y, *data2["class"].values]

# Every tweet must have exactly one label.
assert len(y) == len(X)

In [50]:
# Bundle the token lists (X) and their labels (y) into a single frame,
# one row per tweet, with columns "tweet" and "label".
data3 = pd.DataFrame({"tweet" : X, "label" : y})

In [52]:
# Persist the preprocessed frame to disk as a pickle file.
data3.to_pickle("data/preprocess.pkl")