## -------------------------------------------------------------

# <span style="color:purple">IMPORT LIBRARIES</span>

## -------------------------------------------------------------

In [1]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader

from sklearn.feature_extraction.text import CountVectorizer


# Do not truncate long cell values (e.g. full tweet text) when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

## -------------------------------------------------------------
# <span style="color:purple">IMPORT DATASET</span>
## -------------------------------------------------------------

In [2]:
# Load the labeled, combined dataset; the file is comma-separated, which is the read_csv default.
data = pd.read_csv("Thesis Datasets - Combined Total (Labeled).csv")

In [3]:
# #insert dataset
# dataset = "combinedV5"

# data = pd.read_csv(dataset+".csv",  )
# #print(data['text'][:5])
# data

### -------------------------------------------------------------
## <span style="color:purple">DATASET Overview & Checking for NULL and DUPLICATE values</span>
### -------------------------------------------------------------

In [4]:
# Class balance: number of rows for each value of the 'label' column.
label_counts = data['label'].value_counts()
print(label_counts)
# Last expression of the cell, so the notebook renders the DataFrame under the printed counts.
data

0    10199
1     8685
Name: label, dtype: int64


Unnamed: 0,text,label,language
0,GASTOS NI VP BINAY SA POLITICAL ADS HALOS P7-M NA Inaasahan na ni Vice President Jejomar Binay na may mga taong... https://t.co/SDytgbWiLh,0,English
1,Mar Roxas TANG INA TUWID NA DAAN DAW .. EH SYA NGA DI STRAIGHT,1,English
2,Salamat sa walang sawang suporta ng mga taga makati! Ang Pagbabalik Binay In Makati #OnlyBinayInMakatiSanKaPa https://t.co/iwAOdtZPRE,0,Filipino
3,@rapplerdotcom putangina mo binay TAKBO PA,1,Filipino
4,"Binay with selective amnesia, forgetting about the past six years he spent preparing to be president. #PiliPinasDebates2016",0,English
...,...,...,...
18879,"RT @PinkSapph: If a hoe is talking to your man, she's not the problem, he is. The fact that hoes feel welcomed reflects on how he acts behi&#8230;",1,English
18880,@rednexican69 sounds like a bad bitch.,1,English
18881,#5WordsAfterSex is that all pussy bitch?,1,English
18882,I seen Scooby hoe some niggas at the gardens too &#128514;&#128514;&#128514;,1,English


In [5]:
# Dataset overview: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18884 entries, 0 to 18883
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      18884 non-null  object
 1   label     18884 non-null  int64 
 2   language  18884 non-null  object
dtypes: int64(1), object(2)
memory usage: 442.7+ KB


In [6]:
# Check for null values.
# The per-column null counts are printed explicitly: a bare expression that is not
# the last line of a cell is evaluated but never displayed.
print(data.isnull().sum())

# Rows whose label is missing (rendered as the cell output).
null_rows = data[data['label'].isnull()]

null_rows


Unnamed: 0,text,label,language


In [7]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
data.duplicated().sum()

7

## -------------------------------------------------------------
# <font color='purple'>Dropping Duplicates and NULL</font>
## -------------------------------------------------------------

In [8]:
# View the duplicated rows (every occurrence after the first one).
data[data.duplicated()]

Unnamed: 0,text,label,language
5129,wag magsawa dahil hndi mag sasawa si VP Binay Only B1NAY #OnlyBinayWinner,0,Filipino
8606,“Time you enjoy wasting is not wasted time.” Ang Pagbabalik Binay In Makati #OnlyBinayInMakatiSanKaPa,0,English
12186,President: Rody Duterte VPresident: Bongbong Marcos Secretary: Miriam Santiago CLEANERS: Mar Roxas &amp; Jejomar Binay ??? #HalalanResults,0,English
12783,"OFW rights’ advocate Toots Ople agrees to be a guest candidate of UNA, completing VP Binay’s Senate slate. #PHvote",0,English
12892,"Bakit ba si Binay, lagi nya sinasabi nung mayor sya ""sa Makati..sa Makati"" .. bakit wala syang maipagmalaki bilang bise presidente? Ano na",0,Filipino
13963,"Toni: ""Ano ang katangian niyo kung kaya't dapat kayong maging Big Winner?"" Binay: ""Nognog, pandak, at laki sa hirap.""",0,Filipino
14095,"Partial and unofficial tally of PPCRV: Duterte - 9,990 Roxas - 8,711 Poe - 7,543 Binay - 5,250 Santiago - 1,096",0,English


In [9]:
# Remove exact duplicate rows, then rows containing any null value,
# and finally renumber the index from 0.
data = (
    data
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [10]:
# Sanity check: the number of duplicated rows should now be 0.
data.duplicated().sum()

0

In [11]:
# Sanity check: the per-column null counts should now all be 0.
data.isnull().sum()


text        0
label       0
language    0
dtype: int64

## -------------------------------------------------------------
# <span style="color:purple">Data Preprocessing Functions</span>
## <font color='purple'>Functions Overview</font>
1. **<font color='blue'>Lowercasing</font>**
2. **<font color='blue'>Binary Classification</font>**
3. **<font color='blue'>Data De-identification</font>**
4. **<font color='blue'>Hashtag Removal</font>**
5. **<font color='blue'>URL Removal</font>**
6. **<font color='blue'>Removing Numbers</font>**
7. **<font color='blue'>Removing Extra White Space</font>**
8. **<font color='blue'>Contraction Expansion</font>**
9. **<font color='blue'>Punctuation Removal</font>**
10. **<font color='blue'>Stop Words Removal</font>**
11. **<span style="color:blue">Candidate Names and RT String Removal</span>**
## -------------------------------------------------------------


In [12]:
# Lowercasing
def lowercasing(text):
    """Return ``text`` converted to lower case.

    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    return text.lower() if isinstance(text, str) else text

# Binary classification of the hate score
def binary_classification(num):
    """Map a hate score to a binary label using a 0.5 threshold.

    Parameters
    ----------
    num : number or str
        The score. A string is converted with ``float``.

    Returns
    -------
    1 if the score is >= 0.5, otherwise 0. A missing score (None / NaN) is
    returned unchanged so that it can still be removed with ``dropna``
    instead of being silently labelled 0.

    Raises
    ------
    ValueError
        If ``num`` is a string that cannot be parsed as a float.
    """
    if isinstance(num, str):
        num = float(num)  # scores read as text, e.g. "0.7"
    # NaN compares False against any threshold, so without this check a
    # missing score would fall through to the "0" branch.
    if pd.isna(num):
        return num
    return 1 if num >= 0.5 else 0

# Data de-identification: remove @mentions
def data_deidentification(text):
    """Strip Twitter-style mentions (``@name`` with an optional trailing colon).

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    mention_pattern = r'@\w+\:?'
    return re.sub(mention_pattern, '', text)

def remove_hashtags(text):
    """Delete every hashtag (``#`` followed by word characters) from ``text``.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    hashtag_pattern = r'#\w+'
    return re.sub(hashtag_pattern, '', text)

def remove_urls(text):
    """Delete every ``http://`` or ``https://`` URL (up to the next whitespace).

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    url_pattern = r'https?://\S+'
    return re.sub(url_pattern, '', text)

def remove_numbers(text):
    """Delete every run of digits from ``text``.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    digits_pattern = r'\d+'
    return re.sub(digits_pattern, '', text)

def remove_extra_spaces(text):
    """Trim ``text`` and collapse each run of whitespace into a single space.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)




In [13]:
# Contraction library
def contraction_expansion(text):
    """Expand English/Filipino contractions and spell out ``%`` as " percent".

    Replacements are plain substring replacements applied in the order the
    entries are listed, so the specific forms (e.g. "won't") must stay ahead
    of the generic suffixes (e.g. "n't").

    Non-string values are returned unchanged, like the other cleaning
    functions in this notebook.
    """
    if not isinstance(text, str):
        return text
    contractions = {
        "won't": "will not",
        "'cause": "because",
        "can't": "cannot",
        "what's": "what is",
        "don't": "do not",
        "aren't": "are not",
        "isn't": "is not",
        "%": " percent",
        "that's": "that is",
        "doesn't": "does not",
        "he's": "he is",
        "she's": "she is",
        "it's": "it is",
        # Generic suffixes: only reached by contractions not listed above.
        "n't": " not",
        "'ve": " have",
        "'s": " is",
        "’s": "",  # typographic apostrophe: the suffix is dropped, not expanded
        "'re": " are",
        "'d": " would",
        "'ll": " will",
        "'m": " am",
        "ako'y": "ako ay"
    }
    for contraction, replacement in contractions.items():
        text = text.replace(contraction, replacement)
    return text
    


def punctuations_and_abbreviations(text):
    """Expand a few abbreviations and replace punctuation with spaces.

    The (pattern, replacement) pairs are regular expressions applied in the
    order listed. After the last pair only ASCII letters, digits and
    whitespace remain.

    Non-string values are returned unchanged, like the other cleaning
    functions in this notebook.
    """
    if not isinstance(text, str):
        return text
    library = [
        # "w/o" must be handled before "w/", otherwise "w/" consumes its
        # prefix and "w/o" can never match.
        (r"w/o", "without"),
        (r"w/", " with "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" u s ", " american "),
        # NOTE(review): in a regex "\0" is the NUL character, so this only
        # matches NUL followed by "s" - confirm whether "0s" was intended.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
        (r"amp;", "and"),
        (r",", " "),
        (r"\.", " "),
        (r"!", " "),
        (r";", " "),
        (r"-", " "),
        (r":", " "),
        (r"\/", " "),
        (r"%", " "),
        (r"&", " "),
        (r"\^", "  "),
        (r"\+", "  "),
        (r"\-", "  "),
        (r"\=", "  "),
        (r"https", " "),
        (r"'", " "),
        # The hyphen is escaped: an unescaped "+-=" is a character range that
        # also covers "<", ":" and ";" and would let "<" survive the cleanup.
        (r"[^A-Za-z0-9^,!.\/+\-=]", " "),
    ]
    for pattern, replacement in library:
        text = re.sub(pattern, replacement, text)
    return text



#From Filipino Toxic Speech
# Built-in English stop words.
# english_stop_words = CountVectorizer(stop_words='english').get_stop_words()

def custom_stop_words():
    """Return the combined English and Filipino stop-word list.

    The list is the concatenation of one English list and two Filipino
    lists; words that appear in more than one source list are repeated.

    Returns
    -------
    list of str
    """
    stop_words = [
        'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and',
        'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being',
        'below', 'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'could',
        "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down',
        'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has',
        "hasn't", 'have', "haven't", 'having', 'how', "how's",
        'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
        "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my',
        'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other',
        'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't",
        'should', "shouldn't", 'so', 'some', 'such', 'than',
        'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there',
        "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this',
        'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
        'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's",
        'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom',
        # The trailing comma matters: without it Python joins the adjacent
        # literals "wouldn't" and "ako" into the single entry "wouldn'tako".
        'why', "why's", 'with', "won't", 'would', "wouldn't",
        # filipino_stop_words1
        "ako", "akin", "ako'y", "amin", "aming", "ang", "ano", "anuman", "apat", "at", "atin", "ating",
        "ay", "bababa", "bago", "bakit", "bawat", "bilang", "dahil", "dalawa", "dapat", "din", "dito", "doon",
        "gagawin", "gayunman", "ginagawa", "ginawa", "ginawang", "gumawa", "gusto", "habang", "hanggang", "hindi", "huwag", "iba",
        "ibaba", "ibabaw", "ibig", "ikaw", "ilagay", "ilalim", "ilan", "inyong", "isa", "isang", "ito", "iyo",
        "iyon", "iyong", "kahit", "kailangan", "kailanman", "kami", "kanila", "kanilang", "kanino", "kanya", "kanyang",
        "kapag", "kapwa", "karamihan", "katiyakan", "katulad", "kay", "kaya", "kaysa", "ko", "kung", "laban",
        "lahat", "lamang", "likod", "lima", "maaari", "maaaring", "maging", "mahusay", "makita", "marami", "marapat", "mga",
        "minsan", "mismo", "mula", "muli", "na", "nabanggit", "naging", "nagkaroon", "nais", "nakita", "namin", "napaka",
        "narito", "nasaan", "ng", "nga", "ngayon", "ni", "nila", "nilang", "nito", "niya", "niyang", "noon",
        "o", "pag", "pala", "para", "pati", "pero", "pumunta", "pumupunta", "sa", "saan", "sabi", "sabihin",
        "sarili", "si", "sila", "sino", "siya", "tatlo", "tayo", "tulad", "tungkol", "una", "walang",
        # filipino_stopwords2
        'ako', 'sa', 'akin', 'ko', 'aking', 'sarili', 'kami', 'atin', 'ang', 'aming', 'amin', 'ating',
        'ka', 'iyong', 'iyo', 'inyong', 'siya', 'kanya', 'mismo', 'ito', 'nito', 'kanyang', 'sila', 'nila',
        'kanila', 'kanilang', 'kung', 'ano', 'alin', 'sino', 'kanino', 'na', 'mga', 'iyon', 'am', 'ay',
        'maging', 'naging', 'mayroon', 'may', 'nagkaroon', 'pagkakaroon', 'gumawa', 'ginagawa', 'ginawa', 'paggawa',
        'ibig', 'dapat', 'maaari', 'marapat', 'kong', 'ikaw', 'tayo', 'hindi', 'namin', 'gusto', 'nais',
        'niyang', 'nilang', 'niya', 'huwag', 'ginawang', 'gagawin', 'maaaring', 'sabihin', 'narito', 'kapag', 'ni',
        'nasaan', 'bakit', 'paano', 'kailangan', 'walang', 'katiyakan', 'isang', 'at', 'pero', 'o', 'dahil',
        'bilang', 'hanggang', 'habang', 'ng', 'pamamagitan', 'para', 'tungkol', 'laban', 'pagitan', 'panahon', 'bago',
        'pagkatapos', 'itaas', 'ibaba', 'mula', 'pataas', 'pababa', 'palabas', 'ibabaw', 'ilalim', 'muli', 'pa',
        'minsan', 'dito', 'doon', 'saan', 'lahat', 'anumang', 'kapwa', 'bawat', 'ilan', 'karamihan', 'iba', 'tulad',
        'lamang', 'pareho', 'kaya', 'kaysa', 'masyado', 'napaka', 'isa', 'bababa', 'kulang', 'marami', 'ngayon',
        'kailanman', 'sabi', 'nabanggit', 'din', 'kumuha', 'pumunta', 'pumupunta', 'ilagay', 'makita', 'nakita',
        'katulad', 'mahusay', 'likod', 'kahit', 'paraan', 'noon', 'gayunman', 'dalawa', 'tatlo', 'apat', 'lima',
        'una', 'pangalawa'
    ]
    return stop_words

def remove_custom_stopwords(text):
    """Remove every word of ``text`` that is in the custom stop-word list.

    The text is split on whitespace, each word is compared case-insensitively
    against ``custom_stop_words()``, and the remaining words are joined with
    single spaces.

    Non-string values are returned unchanged, like the other cleaning
    functions in this notebook.
    """
    if not isinstance(text, str):
        return text
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per word.
    stopword_set = set(custom_stop_words())
    kept_words = [word for word in text.split() if word.lower() not in stopword_set]
    return ' '.join(kept_words)



In [14]:
def name_removal(text):
    """Replace candidate names, nicknames and titles with a single space.

    Each entry is matched as a whole word (regex word boundaries on both
    sides) and the entries are applied one after another in the order listed.
    Matching is case-sensitive and the entries are lower case.

    Non-string values are returned unchanged, like the other cleaning
    functions in this notebook.
    """
    if not isinstance(text, str):
        return text
    names = (
        # presidential candidates
        "president",
        "rodrigo",
        "'roa",
        "duterte",
        "du30",
        "prrd",
        "rody",
        "digong",
        "binay",
        "jojo",
        "jejo",
        "jejomar",
        "b1nay",
        "mar",
        "roxas",
        "grace",
        "poe",
        "miriam",
        "defensor",
        "santiago",
        # vice presidential candidates
        "alan",
        "peter",
        "cayetano",
        "apc",  # abbreviation for alan peter cayetano
        "leni",
        "robredo",
        "francis",
        "escudero",
        "chiz",
        "honasan",
        "gringo",
        "gregorio",
        "bongbong",
        "ferdinand",
        "marcos",
        "bbm",
        "antonio",
        "trillanes",
        "vice",
        "vp",
        "villar",
        "erap",
        "alma",
        "moreno",
        "djp",
        "senator",
        "daniel",
        "padilla",
        "abi",
        "abby",
        "zapanta",
        "mds",
    )
    # Every name is replaced by one space so neighbouring words stay separated.
    for name in names:
        text = re.sub(r'\b' + re.escape(name) + r'\b', " ", text)
    return text

def remove_rt(text):
    """Strip ``text`` and delete the retweet marker and stray single letters.

    Removes the standalone word ``rt`` as well as any standalone single
    lower-case ASCII letter. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    rt_or_single_letter = r'\brt\b|\b[a-z]\b'
    return re.sub(rt_or_single_letter, '', text.strip())


## -------------------------------------------------------------
# <font color='purple'>Applying Preprocessing to Data</font>
## -------------------------------------------------------------

In [15]:
#data['text'] = data['text'].apply(remove_extra_spaces)
#data
#data_copy = data.copy()

In [16]:
# Preprocessing pipeline. Each step rewrites data['text'] in place of the previous
# value, so the order of the statements below matters.
# The comment under each step shows the running sample after that step
# (runs of spaces left by a removal are shown as one space for readability).
#
# sample text:
# @jejomarbinay Sana ako na lang yung napili sa THE VOICE      #thevoice. DUTERTE and CAYETANO WON! it's 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(lowercasing)
# @jejomarbinay sana ako na lang yung napili sa the voice      #thevoice. duterte and cayetano won! it's 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(remove_extra_spaces)
# @jejomarbinay sana ako na lang yung napili sa the voice #thevoice. duterte and cayetano won! it's 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(data_deidentification)
# sana ako na lang yung napili sa the voice #thevoice. duterte and cayetano won! it's 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(contraction_expansion)
# sana ako na lang yung napili sa the voice #thevoice. duterte and cayetano won! it is 100 percent unfair voting ... https://debololo.com

data['text'] = data['text'].apply(name_removal)
# sana ako na lang yung napili sa the voice #thevoice. and won! it is 100 percent unfair voting ... https://debololo.com



data['text'] = data['text'].apply(remove_hashtags)
# sana ako na lang yung napili sa the voice . and won! it is 100 percent unfair voting ... https://debololo.com

data['text'] = data['text'].apply(remove_urls)
# sana ako na lang yung napili sa the voice . and won! it is 100 percent unfair voting ...

data['text'] = data['text'].apply(punctuations_and_abbreviations)
# sana ako na lang yung napili sa the voice and won it is 100 percent unfair voting

data['text'] = data['text'].apply(remove_numbers)
# sana ako na lang yung napili sa the voice and won it is percent unfair voting

# Stop-word removal is currently disabled; the sample is therefore unchanged here.
# data['text'] = data['text'].apply(remove_custom_stopwords)

data['label'] = data['label'].apply(binary_classification)
# Assuming the hate score of the sample text is 0.3, it is converted to 0

data.dropna(subset=['text'], inplace=True)
# Drop rows whose text is null; the whitespace join at the end of this cell calls
# x.split() and would fail on a non-string value.


data['text'] = data['text'].apply(remove_rt)
# Removes the standalone word "rt" (retweet marker) and standalone single letters.


# Collapse the runs of spaces left behind by the removals above.
data['text'] = data['text'].apply(lambda x: ' '.join(x.split()))


## -------------------------------------------------------------
# <font color='purple'>DATASET AFTER PREPROCESSING</font>
## -------------------------------------------------------------

In [17]:
# Spot-check a slice of the cleaned text (rows at positions 10 to 24).
print(data['text'].iloc[10:25])

10               sa laki ng ginastos ni tapos sa laki din ng talo niya sa mayo siya pa din tameme sa ending ng kwento yun na
11                 pet theory contrasted with pnoy for past years has not failed much then again he has not done much either
12                                                                               sino ba si yuan nognog pandak laki sa hirap
13                                                                                breaking vcm inside novotel cubao owned by
14    di daw pagsisihan na binoto nila si mds kahit talo baka dun na kayo magsisi kung si or ang mananalo kayo na matatalino
15                           so anak ni pala itong si will not be surprised if nese iye ne eng lehet will be campaign jingle
16         ang kakapal ng mga mukha niyo pnoy at matapos niyong batikusin si at sila naman ang yayain niyo anu toh civil war
17                                                                                               walang hindi importante kay


In [18]:
# Drop every row that still has a null value in any column, and only then
# renumber the rows so that the index has no gaps.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

# Class balance of the dataset after preprocessing.
label_counts = data['label'].value_counts()
print(label_counts)

0    10192
1     8685
Name: label, dtype: int64


In [19]:
# Keep only the rows whose text is not empty once surrounding whitespace is removed.
data = data.loc[data['text'].str.strip().ne('')]

# Report whether any missing value is left in the 'text' column.
if not data['text'].isna().any():
    print("There are no missing values in the 'text' column.")
else:
    print("There are still NA values in the 'text' column.")

There are no missing values in the 'text' column.


## -------------------------------------------------------------
# <font color='purple'>EXPORTING PREPROCESSED DATASET AS CSV</font>
## -------------------------------------------------------------

In [20]:
# Write the preprocessed dataset to CSV; index=False leaves the row index out of the file.
data.to_csv('Testssss.csv', index=False)

## -------------------------------------------------------------
# <font color='purple'>PREPROCESSING DONE</font>
## -------------------------------------------------------------