In [1]:
import os
import pandas as pd
from num2words import num2words
from decimal import Decimal
import math
from bs4 import BeautifulSoup
import re
import unidecode
from string import punctuation
import nltk
from nltk.corpus import stopwords
from emoticons_list import EMOTICONS_EMO
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk import tokenize

In [2]:
def change_directory(path):
    print("Current Working Directory ", os.getcwd())
    os.chdir(path)
    print("Changed Working Directory ", os.getcwd())

In [3]:
def read_data(file):
    data = pd.read_csv(file)
    return data

In [4]:
def lower_case_convertion(text):
    lower_text = text.lower()
    return lower_text

In [5]:
def remove_punctuation(text):
    """
    Return :- String after removing punctuations
    Input :- String
    Output :- String
    """
    return text.translate(str.maketrans('', '', punctuation))

In [6]:
def remove_numbers(text):
    """
    Return :- String without numbers
    input :- String
    Output :- String
    """
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern,repl=" ", string=text)
    return without_number

In [7]:
def remove_html_tags_beautifulsoup(text):
    """
    Return :- String without Html tags
    input :- String
    Output :- String
    """
    parser = BeautifulSoup(text, "html.parser")
    without_html = parser.get_text(separator = " ")
    return without_html

In [8]:
def remove_urls(text):
    """
    Return :- String without URLs
    input :- String
    Output :- String
    """
    url_pattern = r'https?://\S+|www\.\S+'
    without_urls = re.sub(pattern=url_pattern, repl=' ', string=text)
    return without_urls

In [9]:
def accented_to_ascii(text):
    """
    Return :- text after converting accented characters
    Input :- string
    Output :- string
    """
    # apply unidecode function on text to convert
    # accented characters to ASCII values
    text = unidecode.unidecode(text)
    return text

In [10]:
def remove_extra_spaces(text):
    """
    Return :- string after removing extra whitespaces
    Input :- String
    Output :- String
    """
    space_pattern = r'\s+'
    without_space = re.sub(pattern=space_pattern, repl=" ", string=text)
    return without_space

In [11]:
def remove_single_char(text):
    """
    Return :- string after removing single characters
    Input :- string
    Output:- string
    """
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

In [12]:
# convert emo_unicode to unicode_emo
UNICODE_EMO = {v: k for k, v in EMOTICONS_EMO.items()}

def emoji_words(text):
    for emot in UNICODE_EMO:
        emoji_pattern = r'('+emot+')'
        # replace
        emoji_words = UNICODE_EMO[emot]
        replace_text = emoji_words.replace(",","")
        replace_text = replace_text.replace(":","")
        replace_text_list = replace_text.split()
        emoji_name = '_'.join(replace_text_list)
        text = re.sub(emoji_pattern, emoji_name, text)
    return text


In [13]:
lemma = WordNetLemmatizer()
def lemmatization(text):

    # word tokenization
    tokens = word_tokenize(text)

    for index in range(len(tokens)):
        # lemma word
        lemma_word = lemma.lemmatize(tokens[index])
        tokens[index] = lemma_word

    return ' '.join(tokens)

In [14]:
path = "C:\\Users\\ompra\\OneDrive\\Documents\\Machine Learning Projects\\Automated Decision Support System for Cyberbullying Detection - Version 2.0\\Data\\raw_data"
change_directory(path)

Current Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Notebooks
Changed Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Data\raw_data


In [15]:
file = 'integrated_cyberbullying_data.csv'
data = read_data(file)
data.head()

Unnamed: 0,text,cyberbullying_type
0,That long rant/explanation of his and the comm...,religion
1,@ToddMetcalf I’m wondering where all our conse...,not_cyberbullying
2,@DaveAtherton20 @bnsphrx @hermannkelly @Robert...,religion
3,RT @TRobinsonNewEra: This has nothing to do wi...,not_cyberbullying
4,@stevesm39312259 @MrsNickyClark @billybragg @T...,gender


In [16]:
data['text'] = data['text'].apply(lower_case_convertion)
data['text'].head()

0    that long rant/explanation of his and the comm...
1    @toddmetcalf i’m wondering where all our conse...
2    @daveatherton20 @bnsphrx @hermannkelly @robert...
3    rt @trobinsonnewera: this has nothing to do wi...
4    @stevesm39312259 @mrsnickyclark @billybragg @t...
Name: text, dtype: object

In [17]:
data['text'] = data['text'].apply(remove_punctuation)
data['text'].head()

0    that long rantexplanation of his and the comme...
1    toddmetcalf i’m wondering where all our conser...
2    daveatherton20 bnsphrx hermannkelly robertsemo...
3    rt trobinsonnewera this has nothing to do with...
4    stevesm39312259 mrsnickyclark billybragg thefa...
Name: text, dtype: object

In [18]:
data['text'] = data['text'].apply(remove_numbers)
data['text'].head()

0    that long rantexplanation of his and the comme...
1    toddmetcalf i’m wondering where all our conser...
2    daveatherton  bnsphrx hermannkelly robertsemon...
3    rt trobinsonnewera this has nothing to do with...
4    stevesm  mrsnickyclark billybragg thefamousart...
Name: text, dtype: object

In [19]:
data['text'] = data['text'].apply(remove_html_tags_beautifulsoup)
data['text'].head()

0    that long rantexplanation of his and the comme...
1    toddmetcalf i’m wondering where all our conser...
2    daveatherton  bnsphrx hermannkelly robertsemon...
3    rt trobinsonnewera this has nothing to do with...
4    stevesm  mrsnickyclark billybragg thefamousart...
Name: text, dtype: object

In [20]:
data['text'] = data['text'].apply(remove_urls)
data['text'].head()

0    that long rantexplanation of his and the comme...
1    toddmetcalf i’m wondering where all our conser...
2    daveatherton  bnsphrx hermannkelly robertsemon...
3    rt trobinsonnewera this has nothing to do with...
4    stevesm  mrsnickyclark billybragg thefamousart...
Name: text, dtype: object

In [21]:
data['text'] = data['text'].apply(accented_to_ascii)
data['text'].head()

0    that long rantexplanation of his and the comme...
1    toddmetcalf i'm wondering where all our conser...
2    daveatherton  bnsphrx hermannkelly robertsemon...
3    rt trobinsonnewera this has nothing to do with...
4    stevesm  mrsnickyclark billybragg thefamousart...
Name: text, dtype: object

In [22]:
data['text'] = data['text'].apply(remove_extra_spaces)
data['text'].head()

0    that long rantexplanation of his and the comme...
1    toddmetcalf i'm wondering where all our conser...
2    daveatherton bnsphrx hermannkelly robertsemons...
3    rt trobinsonnewera this has nothing to do with...
4    stevesm mrsnickyclark billybragg thefamousartb...
Name: text, dtype: object

In [23]:
data['text'] = data['text'].apply(remove_single_char)
data['text'].head()

0    that long rantexplanation of his and the comme...
1    toddmetcalf i'm wondering where all our conser...
2    daveatherton bnsphrx hermannkelly robertsemons...
3    rt trobinsonnewera this has nothing to do with...
4    stevesm mrsnickyclark billybragg thefamousartb...
Name: text, dtype: object

In [24]:
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [25]:
data['text'] = data['text'].apply(emoji_words)
data['text'].head()

0    long rantexplanation comments copeacefuls make...
1    toddmetcalf i'm wondering conservative brother...
2    daveatherton bnsphrx hermannkelly robertsemons...
3    rt trobinsonnewera nothing islam httptcoacqkux...
4    stevesm mrsnickyclark billybragg thefamousartb...
Name: text, dtype: object

In [26]:
data['text']=data['text'].apply(lemmatization)
data['text'].head()

0    long rantexplanation comment copeacefuls make ...
1    toddmetcalf i 'm wondering conservative brothe...
2    daveatherton bnsphrx hermannkelly robertsemons...
3    rt trobinsonnewera nothing islam httptcoacqkux...
4    stevesm mrsnickyclark billybragg thefamousartb...
Name: text, dtype: object

In [27]:
data['text'] = data['text'].str.replace('\d+', '')

  data['text'] = data['text'].str.replace('\d+', '')


In [28]:
data['text'].head()

0    long rantexplanation comment copeacefuls make ...
1    toddmetcalf i 'm wondering conservative brothe...
2    daveatherton bnsphrx hermannkelly robertsemons...
3    rt trobinsonnewera nothing islam httptcoacqkux...
4    stevesm mrsnickyclark billybragg thefamousartb...
Name: text, dtype: object

In [29]:
data['text'].isnull().sum()

0

In [54]:
data.to_csv('cleaned_data.csv', index = False)