## Part 1: Data enrichment
Compute the character length of each text and the total length of the laugh expressions (e.g. `kkkk`, `hahaha`) found in it.

In [1]:
import pandas as pd
import re

In [2]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated path only worked on Windows.
full_dataset = pd.read_csv('datasets/full_dataset.csv')

In [3]:
# Preview the dataset right after loading it from the CSV file.
full_dataset

Unnamed: 0,data,Hate.speech
0,"""não come mel, morde marimbondo""",0
1,"não tem pinto, tem orgulho !",0
2,Não vê essa merda de Crepúsculo! Pra isso temo...,0
3,"não da tapa na bundinha, da cotovelada nas cos...",0
4,o diminutivo INHO não acompanha a trajetória d...,1
...,...,...
2759,'LIXO DO CRL',1
2760,'ela esta fazendo isso para aparecer Nao e gay...,1
2761,'Volta pra jaula sua macaca vou pegar meu xico...,1
2762,'Achei que a macaca vivia apenas na floresta o...,1


In [4]:
# Regex for common Portuguese online laughs such as "kkkkk", "hahaha", "hehe"
# and "rsrs". It lives under its own name because a later cell rebinds the
# global `laughs` to the per-row counts; sharing that name made count_laughs
# fail as soon as any cell calling it was run a second time.
# IGNORECASE is needed because the counts are computed on the raw text, before
# it is lower-cased, so "KKKK" or "HAHAHA" must be recognised too.
LAUGH_REGEX = re.compile(r"k{2,}|a*ha+h[ha]*|e*he+h[he]*|s*rs+r[rs]*", re.IGNORECASE)


def count_laughs(text):
    """Return how many characters of `text` belong to laugh expressions.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    int
        Sum of the lengths of all non-overlapping laugh matches; 0 when the
        text contains no laugh.
    """
    # The pattern has no capture groups, so findall yields the matched strings.
    return sum(len(match) for match in LAUGH_REGEX.findall(text))

In [5]:
# Features taken from the text as it currently stands in the `data` column:
# number of characters and number of laugh characters per row.
raw_texts = full_dataset['data']
length = raw_texts.map(len)
laughs = raw_texts.map(count_laughs)

## Part 2: Basic text processing
Put everything in lower case and remove URL links, punctuation and duplicated letters.

In [6]:
from spellchecker import SpellChecker
# Module-level Portuguese spell checker; is_unknown() reads this `spell`
# object to decide whether a word is unknown to the dictionary.
spell = SpellChecker(language='pt') # spell checker in Portuguese

In [7]:
def lower(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

In [8]:
def remove_break_line(text):
    """Replace every line break in `text` with a single space.

    The previous implementation looked for '\\n' among the results of
    `text.split()`, which never contain whitespace, so it always returned the
    text unchanged (and its other branch called `.replace` on a list).

    Parameters
    ----------
    text : str
        Text that may contain line breaks.

    Returns
    -------
    str
        The text with each "\\r\\n", "\\n" or "\\r" replaced by one space.
    """
    # Handle Windows line endings first so "\r\n" becomes one space, not two.
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')

In [9]:
def remove_links(text):
    """Drop every space-separated token that looks like a link.

    A token is treated as a link when it contains 'http' or '.com'. The
    previous version removed items from the list it was iterating over, which
    skipped the token following each removed one, so consecutive links
    survived.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without link tokens. If removing them would leave an empty
        string, the original text is returned unchanged.
    """
    kept = [
        word
        for word in text.split(' ')
        if 'http' not in word and '.com' not in word
    ]
    new_text = ' '.join(kept)
    # Keep the original text rather than returning an empty document.
    return new_text if new_text != '' else text

In [10]:
def remove_punctuation(text):
    """Return `text` with the punctuation characters listed in the pattern removed.

    Characters outside the character class (for example '#', '@', '[' and
    ']') are left untouched.
    """
    punctuation = re.compile(r'["!$%&\'()*+,-./:;<=>?\\^_`{|}~]')
    return punctuation.sub('', text)

In [11]:
def is_unknown(word):
    """Return True when the module-level spell checker reports `word` as unknown."""
    unrecognized = spell.unknown([word])
    return unrecognized != set()

def remove_duplicated_letters(text):
    """Collapse runs of repeated letters in words the spell checker does not know.

    The text is split on single spaces. Each non-empty word flagged by
    is_unknown() keeps only the first character of every run of identical
    consecutive characters; all other words are kept as they are. The words
    are then re-joined with single spaces.
    """
    cleaned_words = []
    for token in text.split(' '):
        if is_unknown(token) and token != '':
            # Keep a character only if it differs from the one right before it.
            collapsed = ''.join(
                char
                for position, char in enumerate(token)
                if position == 0 or char != token[position - 1]
            )
        else:
            collapsed = token
        cleaned_words.append(collapsed)
    return ' '.join(cleaned_words)

In [12]:
def process_text1(text):
    """
    First cleaning pass over a raw text.

    Applies, in this order: lower(), remove_break_line(), remove_links(),
    remove_punctuation() and remove_duplicated_letters(), feeding the result
    of each step into the next, and returns the final string.
    """
    pipeline = (
        lower,
        remove_break_line,
        remove_links,
        remove_punctuation,
        remove_duplicated_letters,
    )
    for step in pipeline:
        text = step(text)
    return text

In [13]:
# Overwrite the `data` column with the result of the first cleaning pass.
full_dataset['data'] = full_dataset['data'].map(process_text1)

In [14]:
# Inspect the dataset after the first cleaning pass.
full_dataset

Unnamed: 0,data,Hate.speech
0,não come mel morde marimbondo,0
1,não tem pinto tem orgulho,0
2,não vê essa merda de crepúsculo pra isso temos...,0
3,não da tapa na bundinha da cotovelada nas costas,0
4,o diminutivo inho não acompanha a trajetória d...,1
...,...,...
2759,lixo do crl,1
2760,ela esta fazendo isso para aparecer nao e gay ...,1
2761,volta pra jaula sua macaca vou pegar meu xicot...,1
2762,achei que a macaca vivia apenas na floresta ou...,1


## Part 3: more enrichment

In [15]:
# Load the bad-word blocklist, one entry per line.
# `with` closes the file even if reading fails, and the forward-slash path
# resolves on Windows, Linux and macOS (the backslash form was Windows-only).
with open("datasets/lista-palavroes-bloqueio.txt", "r", encoding="utf8") as f:
    # Blank lines are skipped: an empty entry is a prefix of every word and
    # would make count_bad_words report a match for every text.
    list_bad_words = [line.strip() for line in f if line.strip()]


def count_bad_words(text):
    """Count the distinct blocklist entries that start at least one word of `text`.

    Each entry of the module-level `list_bad_words` is lower-cased and used
    as a regular expression anchored at the beginning of a word, so an entry
    also matches longer words that begin with it.

    Parameters
    ----------
    text : str
        Text to scan; it is split on single spaces.

    Returns
    -------
    int
        Number of unique matches found.
    """
    found = set()
    words = text.split(' ')
    for bad_word in list_bad_words:
        try:
            pattern = re.compile('^' + bad_word.lower())
        except re.error:
            # Entry is not a valid regular expression: skip it, as before,
            # but without hiding unrelated errors behind a bare `except`.
            continue
        for word in words:
            hits = pattern.findall(word)
            if hits:
                # Only the first matching word matters for this entry.
                found.add(hits[0])
                break
    return len(found)

In [16]:
def count_misspell(text):
    """Count the words of `text` that is_unknown() flags as unknown.

    Words containing '#' or '@' and the word 'rt' are never counted.
    """
    return sum(
        1
        for token in text.split()
        if '#' not in token and '@' not in token and token != 'rt'
        and is_unknown(token) and token != ''
    )

In [17]:
# Features computed on the `data` column as it stands at this point:
# distinct bad words and unknown words per row.
cleaned_texts = full_dataset['data']
bad_words = cleaned_texts.map(count_bad_words)
misspell = cleaned_texts.map(count_misspell)

## Part 4: more processing

In [18]:
import nltk
# Fetch the NLTK resources used below: the stop-word lists, the RSLP stemmer
# data and the Punkt tokenizer models.
# NOTE(review): recent NLTK releases reportedly load `punkt_tab` instead of
# `punkt` for word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('rslp')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Portuguese stop words from NLTK, stored as a set for membership tests.
stopwords = {*nltk.corpus.stopwords.words('portuguese')}

def remove_stopwords(text):
    """Remove the words found in the module-level `stopwords` set from `text`.

    Some texts consist only of stop words (e.g. "quando foi isso"); those
    keep all their words. Previously that fallback joined the *characters* of
    the string with spaces ("q u a n d o ..."), because the string itself was
    passed to `' '.join`.

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    words = text.split()
    kept = [word for word in words if word not in stopwords]
    # Fall back to the full word list when every word was a stop word.
    return ' '.join(kept if kept else words)

In [20]:
def remove_laughs(text):
    """Return `text` with laugh expressions such as "kkkk", "hahaha", "hehe" or "rsrs" deleted.

    Only the matched characters are removed; surrounding whitespace is kept.
    """
    laugh_pattern = re.compile(r'k{2,}|a*ha+h[ha]*|e*he+h[he]*|s*rs+r[rs]*')
    return laugh_pattern.sub('', text)

In [21]:
def stemming(text):
    """Stem every whitespace-separated word of `text` with NLTK's RSLP stemmer.

    Returns the stems joined by single spaces.
    """
    rslp = nltk.stem.RSLPStemmer()  # a new stemmer is built on every call
    return " ".join(rslp.stem(token) for token in text.split())

In [22]:
def bag_of_words(text):
    """Split `text` into tokens with `nltk.word_tokenize` and return them."""
    # NOTE(review): no `language` argument is passed, so NLTK's default
    # tokenizer language applies — confirm that is intended for Portuguese.
    tokens = nltk.word_tokenize(text)
    return tokens

In [23]:
def process_text2(text):
    """
    Second processing pass: remove_laughs(), then remove_stopwords(), then
    stemming(), then bag_of_words(). Returns whatever bag_of_words() returns
    for the stemmed text.
    """
    return bag_of_words(stemming(remove_stopwords(remove_laughs(text))))

In [24]:
# Overwrite the `data` column with the output of the second processing pass.
full_dataset['data'] = full_dataset['data'].map(process_text2)

In [25]:
# Inspect the `data` column after the second processing pass.
full_dataset['data']

0                                [com, mel, mord, marimb]
1                                          [pint, orgulh]
2       [vê, merd, crepúscul, pra, cinebiograf, chuck,...
3                              [tap, bund, cotovel, cost]
4          [diminu, inh, acompanh, trajetór, hom, verdad]
                              ...                        
2759                                           [lix, crl]
2760    [faz, aparec, nao, gay, cois, nenhum, vergonh,...
2761    [volt, pra, jaul, macac, vou, peg, xicot, pret...
2762              [ach, macac, viv, apen, florest, zolog]
2763            [esper, nev, derret, usa, pra, lav, louc]
Name: data, Length: 2764, dtype: object

## Part 5: Putting it all together and saving
* length
* laughs
* bad_words
* misspell

In [26]:
# Check the frame before the engineered feature columns are attached.
full_dataset

Unnamed: 0,data,Hate.speech
0,"[com, mel, mord, marimb]",0
1,"[pint, orgulh]",0
2,"[vê, merd, crepúscul, pra, cinebiograf, chuck,...",0
3,"[tap, bund, cotovel, cost]",0
4,"[diminu, inh, acompanh, trajetór, hom, verdad]",1
...,...,...
2759,"[lix, crl]",1
2760,"[faz, aparec, nao, gay, cois, nenhum, vergonh,...",1
2761,"[volt, pra, jaul, macac, vou, peg, xicot, pret...",1
2762,"[ach, macac, viv, apen, florest, zolog]",1


In [27]:
# Attach the four engineered features computed earlier as new columns.
full_dataset = full_dataset.assign(
    length=length,
    laughs=laughs,
    bad_words=bad_words,
    misspell=misspell,
)

In [28]:
# Final frame: the processed `data` column plus the four feature columns.
full_dataset

Unnamed: 0,data,Hate.speech,length,laughs,bad_words,misspell
0,"[com, mel, mord, marimb]",0,32,0,0,1
1,"[pint, orgulh]",0,28,0,1,0
2,"[vê, merd, crepúscul, pra, cinebiograf, chuck,...",0,83,0,1,2
3,"[tap, bund, cotovel, cost]",0,51,0,1,0
4,"[diminu, inh, acompanh, trajetór, hom, verdad]",1,68,0,0,1
...,...,...,...,...,...,...
2759,"[lix, crl]",1,13,0,0,1
2760,"[faz, aparec, nao, gay, cois, nenhum, vergonh,...",1,85,0,1,0
2761,"[volt, pra, jaul, macac, vou, peg, xicot, pret...",1,66,0,2,1
2762,"[ach, macac, viv, apen, florest, zolog]",1,60,0,1,1


In [29]:
# Save the enriched dataset without the index column. Forward slashes keep
# the path valid on Windows, Linux and macOS (backslashes were Windows-only).
# NOTE: `data` holds token lists at this point, so the CSV stores their string
# representation and readers must parse it back into lists.
full_dataset.to_csv('datasets/final_dataset.csv', index=False)