## -------------------------------------------------------------

# <span style="color:purple">IMPORT LIBRARIES: ```</span>

## -------------------------------------------------------------

In [370]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader

from sklearn.feature_extraction.text import CountVectorizer


pd.set_option('display.max_colwidth', None)

## -------------------------------------------------------------
# <span style="color:purple">IMPORT DATASET: ```</span>
## -------------------------------------------------------------

In [371]:
data = pd.read_csv("aaaa.csv", delimiter=',')

In [372]:
# #insert dataset
# dataset = "combinedV5"

# data = pd.read_csv(dataset+".csv",  )
# #print(data['text'][:5])
# data

### -------------------------------------------------------------
## <span style="color:purple">DATASET Overview & Checking for NULL and DUPLICATE values ```</span>
### -------------------------------------------------------------

In [373]:
label_counts = data['label'].value_counts()
print(label_counts)
data

label
0    10161
1     8679
Name: count, dtype: int64


Unnamed: 0,text,label,language
0,gastos ni sa political ads halos p m na inaasahan na ni na may mga taong,0,English
1,tang ina tuwid na daan daw eh sya nga di straight,1,English
2,salamat sa walang sawang suporta ng mga taga makati ang pagbabalik in makati,0,Filipino
3,putangina mo takbo pa,1,Filipino
4,with selective amnesia forgetting about the past six years he spent preparing to be,0,English
...,...,...,...
18835,if a hoe is talking to your man she is not the problem he is the fact that hoes feel welcomed reflects on how he acts behi,1,English
18836,sounds like a bad bitch,1,English
18837,is that all pussy bitch,1,English
18838,i seen scooby hoe some niggas at the gardens too,1,English


In [374]:
#DATASET OVERVIEW
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18840 entries, 0 to 18839
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      18840 non-null  object
 1   label     18840 non-null  int64 
 2   language  18840 non-null  object
dtypes: int64(1), object(2)
memory usage: 441.7+ KB


In [375]:
#CHECK FOR NULL VALUES
data.isnull().sum()
null_rows = data[data['label'].isnull()]

null_rows


Unnamed: 0,text,label,language


In [376]:
#CHECK FOR DUPLICATE VALUES
data.duplicated().sum()

800

## -------------------------------------------------------------
# <font color='purple'>Dropping Duplicates and NULL</font>
## -------------------------------------------------------------

In [377]:
# View Duplicates
data[data.duplicated()]

Unnamed: 0,text,label,language
381,okay nga si nalang kesa si o ano,0,Filipino
485,bakit si ayaw pumunta di lang kasi boo abutin niya go,1,Filipino
629,bakit si ayaw pumunta di lang kasi boo abutin niya go,1,Filipino
787,supreme court upholds ca decision on junjun is case cnn,0,English
1025,completes senate slate,0,English
...,...,...,...
18359,nigger,1,English
18450,niggers,1,English
18535,fag,1,English
18564,fuck you faggot,1,English


In [378]:
# Drop the duplicates.
data.drop_duplicates(inplace=True)

# Drop null values.
data.dropna(inplace=True)

# Reset the index.
data.reset_index(drop=True, inplace=True)

In [379]:
#Checking if there are still duplicates
data.duplicated().sum()

0

In [380]:
#Checking if there are still NULL values
data.isnull().sum()


text        0
label       0
language    0
dtype: int64

## -------------------------------------------------------------
# <span style="color:purple">Data Preprocessing Functions ```</span>
## <font color='purple'>``` Functions Overview ```</font>
1. **<font color='blue'>Lowercasing</font>**
2. **<font color='blue'>Binary Classification</font>**
3. **<font color='blue'>Data De-identification</font>**
4. **<font color='blue'>Hashtag Removal</font>**
5. **<font color='blue'>URL Removal</font>**
6. **<font color='blue'>Removing Numbers</font>**
7. **<font color='blue'>Removing Extra White Space</font>**
8. **<font color='blue'>Contraction Expansion</font>**
9. **<font color='blue'>Punctuation Removal</font>**
10. **<font color='blue'>Stop Words Removal</font>**
11. **<span style="color:blue">Candidate Names and RT String Removal</span>**
## -------------------------------------------------------------


In [381]:
#lowercasing
def lowercasing(text):
    if isinstance(text, str):
        text = text.lower()
    return text

#binary classification of Hate Score 
def binary_classification(num):
    if isinstance(num, str):
        num = float(num)  # Convert num to float if it's a string
    if num >= 0.5:
        num = 1
    else:
        num = 0
    return num

# Remove Mentions - Data-deidentification
def data_deidentification(text):
    if isinstance(text, str):
        return re.sub(r'@\w+\:?', '', text)
    else:
        return text

def remove_hashtags(text):
    if isinstance(text, str):
        return re.sub(r'#\w+', '', text)
    else:
        return text

def remove_urls(text):
    if isinstance(text, str):
        return re.sub(r'https?://\S+', '', text)
    else:
        return text

def remove_numbers(text):
    if isinstance(text, str):
        return re.sub(r'\d+', '', text)
    else:
        return text

def remove_extra_spaces(text):
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text.strip())
    else:
        return text




In [382]:
#contraction library
def contraction_expansion(text):
    contractions = { 
        "won't": "will not",
        "'cause": "because",
        "can't": "cannot",
        "what's": "what is",
        "don't": "do not",
        "aren't": "are not",
        "isn't": "is not",
        "%": " percent",
        "that's": "that is",
        "doesn't": "does not",
        "he's": "he is",
        "she's": "she is",
        "it's": "it is",
        "n't": " not",
        "'ve": " have",
        "'s": " is",
        "’s": "",
        "'re": " are",
        "'d": " would",
        "'ll": " will",
        "'m": " am"
    }
    for contraction, replacement in contractions.items(): 
        text = text.replace(contraction, replacement)
    return text
    


def punctuations_and_abbreviations(text):
    library = [
        (r"w/", " with "),
        (r"w/o", "without"),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" u s ", " american "),
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
        (r"amp;", "and"),
        (r"g2g", "gtg"),
        (r"2moro", "tomorrow"),
        (r"b4", "before"),
        (r"2nite", "tonight"),
        (r"2day", "today"),
        (r"4U", "for you"),
        (r"4get", "forget"),
        (r"2morrow", "tomorrow"),
        (r"2be", "because"),
        (r"l8r", "later"),
        (r",", " "),
        (r"\.", " "),
        (r"!", " "),
        (r";", " "),
        (r"-", " "),
        (r":", " "),
        (r"\/", " "),
        (r"%", " "),
        (r"&", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        # (r"'", " "),
        (r"[^A-Za-z0-9^,!.\/+-=]", " "), 
    ]
    for pattern, replacement in library:
        text = re.sub(pattern, replacement, text)
    return text





#From Filipino Toxic Speech
# Built-in English stop words.
# english_stop_words = CountVectorizer(stop_words='english').get_stop_words()

def custom_stop_words():
    custom_stop_words = [
        'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and',
        'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being',
        'below', 'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'could',
        "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down',
        'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has',
        "hasn't", 'have', "haven't", 'having', 'how', "how's", "see",
         "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 
        "it's", 'its', 'itself', "let's", 'more', 'most', "mustn't", 
        'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other',
        'ought', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 
        'should', "shouldn't", 'so', 'some', 'such', 'than',
        'that', "that's", 'the', 'theirs', 'then', 'there',
        "there's", 'these',  'this',
        'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
        'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's",
        'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom',
        'why', "why's", 'with', "won't", 'would', "wouldn't","just","will","like", "know", "say","says","it","get",
        "makes","make","me","till", "end"
        # filipino_stop_words1    
        "ako", "akin", "ako'y", "amin", "aming", "ang", "ano", "anuman", "apat", "at", "atin", "ating",
        "ay", "bababa", "bago", "bakit", "bawat", "bilang", "dahil", "dalawa", "dapat", "din", "dito", "doon",
        "gagawin", "gayunman", "ginagawa", "ginawa", "ginawang", "gumawa", "gusto", "habang", "hanggang", "hindi", "huwag", "iba",
        "ibaba", "ibabaw", "ibig", "ilagay", "ilalim", "ilan", "inyong", "isa", "isang", "ito", "iyo",
        "iyon", "iyong", "kahit", "kailangan", "kailanman", "kami", "kanila", "kanilang", "kanino", "kanya", "kanyang",
        "kapag", "kapwa", "karamihan", "katiyakan", "katulad", "kay", "kaya", "kaysa", "ko", "kung", "laban",
        "lahat", "lamang", "likod", "lima", "maaari", "maaaring", "maging", "mahusay", "makita", "marami", "marapat", "mga",
        "minsan", "mismo", "mula", "muli", "na", "nabanggit", "naging", "nagkaroon", "nais", "nakita", "namin", "napaka",
        "narito", "nasaan", "ng", "nga", "ngayon", "ni", "nila", "nilang", "nito", "niyang", "noon",
        "o", "pag", "pala", "para", "pati", "pero", "pumunta", "pumupunta", "sa", "saan", "sabi", "sabihin",
        "sarili", "sino", "tatlo", "tayo", "tulad", "tungkol", "una", "walang",
        #filipino_stopwords2 
        'ako', 'sa', 'akin', 'ko', 'aking', 'sarili', 'kami', 'atin', 'ang', 'aming', 'amin', 'ating',
        'ka', 'iyong', 'iyo', 'inyong', 'kanya', 'mismo', 'ito', 'nito', 'kanyang', 'nila',
        'kanila', 'kanilang', 'kung', 'ano', 'alin', 'sino', 'kanino', 'na', 'mga', 'iyon', 'am', 'ay',
        'maging', 'naging', 'mayroon', 'may', 'nagkaroon', 'pagkakaroon', 'gumawa', 'ginagawa', 'ginawa', 'paggawa',
        'ibig', 'dapat', 'maaari', 'marapat', 'kong', 'tayo', 'hindi', 'namin', 'gusto', 'nais',
        'niyang', 'nilang', 'huwag', 'ginawang', 'gagawin', 'maaaring', 'sabihin', 'narito', 'kapag', 'ni',
        'nasaan', 'bakit', 'paano', 'kailangan', 'walang', 'katiyakan', 'isang', 'at', 'pero', 'o', 'dahil',
        'bilang', 'hanggang', 'habang', 'ng', 'pamamagitan', 'para', 'tungkol', 'laban', 'pagitan', 'panahon', 'bago',
        'pagkatapos', 'itaas', 'ibaba', 'mula', 'pataas', 'pababa', 'palabas', 'ibabaw', 'ilalim', 'muli', 'pa',
        'minsan', 'dito', 'doon', 'saan', 'lahat', 'anumang', 'kapwa', 'bawat', 'ilan', 'karamihan', 'iba', 'tulad',
        'lamang', 'pareho', 'kaya', 'kaysa', 'masyado', 'napaka', 'isa', 'bababa', 'kulang', 'marami', 'ngayon',
        'kailanman', 'sabi', 'nabanggit', 'din', 'kumuha', 'pumunta', 'pumupunta', 'ilagay', 'makita', 'nakita',
        'katulad', 'mahusay', 'likod', 'kahit', 'paraan', 'noon', 'gayunman', 'dalawa', 'tatlo', 'apat', 'lima',
        'una', 'pangalawa',

        #filipino_stopwords3
        'wag','lang', 'di','naman', 'nalang','lang', 'ba','wala', "si", "pagbabalik", "makati", "talaga","mas","mataas", "yan"
        #some shit
         "you you", "tsk tsk", "tsk","tell", "every","commercial", "thank", "income", "tax", "guys", "party", "liberal", "place",
        #some
        "abs", "cbn", "bandila", "gma", "inquirer", "rappler", "via", "ad", "ph", "vote", "tv","ads", "black", "propaganda", "look","partial", 
        "unofficial", "country","presidential", "candidate","happy", "birthday",

        #eng
        "right","now", "think", "our"
 


        

        
    ]
    return custom_stop_words

def remove_custom_stopwords(text):
    custom_stopwords = custom_stop_words()
    # Split the text into words
    words = text.split()
    # Remove custom stopwords
    filtered_words = [word for word in words if word.lower() not in custom_stopwords]
    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)
    return filtered_text



In [383]:
def remove_candidate_name(text):
    candidate = { 
    "rodrigo": " ",
    "'roa": " ",
    "duterte": " ",
    "du30": "what is ",
    "prrd": " ", 
    "rody": " ",
    "digong": " ", 
    "alan": " ", 
    "peter": " ", 
    "cayetano": " ", 
    "apc": " ",
    "mar": " ", 
    "roxas": " ", 
    "robredo": " ", 
    "leni": " ",
    "poe": " ", 
    "grace": " ",
    "escudero": " ", 
    "chiz": " ", 
    "francis": " ",
    "binay": " ", 
    "jojo": " ", 
    "vice": " ", 
    "jejomar": " ", 
    "honasan": " ", 
    "gringo": " ", 
    "gregorio": " ",
    "vp": " ", 
    "president": " ", 
    "senator": " ",
    "daniel": " ",
    "padilla": " ",
    "abi": " ",
    "charlie": " ",
    "sheen": " ",
    "sen": " ",
    "miriam": " ",
    "abi": " ",
    "abby": " ",
    "zapanta": " ",

       
    
    
    }
    # Replace candidate names
    for name, replacement in candidate.items():
        text = re.sub(r'\b' + re.escape(name) + r'\b', replacement, text)
    return text

def remove_rt_and_single_char(text):
    if isinstance(text, str):
        pattern = r'\brt\b|\b[a-z]\b'
        return re.sub(pattern, '', text.strip())
    else:
        return text

## -------------------------------------------------------------
# <font color='purple'>Applying Preprocessing to Data</font>
## -------------------------------------------------------------

In [384]:
#data['text'] = data['text'].apply(remove_extra_spaces)
#data
#data_copy = data.copy()

In [385]:
#sample text: 
#@jejomarbinay Sana ako na lang yung napili sa THE VOICE      #thevoice. DUTERTE and CAYETANO WON! it's 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(lowercasing)
# @jejomarbinay sana ako na lang yung napili sa the voice      #thevoice. duterte won! it's 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(remove_extra_spaces)
# sana ako na lang yung napili sa the voice #thevoice. duterte and cayetano won! it's 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(data_deidentification)
# sana ako na lang yung napili sa the voice #thevoice. duterte and cayetano won! it's 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(contraction_expansion)
# sana ako na lang yung napili sa the voice #thevoice. duterte and cayetano won! it is 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(remove_candidate_name)
# sana ako na lang yung napili sa the voice #thevoice. won! it is 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(remove_rt_and_single_char)
# IF it has rt string, remove it. FOR RETWEETS only

data['text'] = data['text'].apply(remove_hashtags)
# sana ako na lang yung napili sa the voice. won! it is 100% unfair voting ... https://debololo.com

data['text'] = data['text'].apply(remove_urls)
# sana ako na lang yung napili sa the voice. won! it is 100% unfair voting ...

data['text'] = data['text'].apply(punctuations_and_abbreviations)
# sana ako na lang yung napili sa the voice won it is 100 unfair voting

data['text'] = data['text'].apply(remove_numbers)
# sana ako na lang yung napili sa the voice. won! it is unfair voting ...

data['text'] = data['text'].apply(remove_custom_stopwords)

# sana lang yung napili voice. won unfair voting

data['label'] = data['label'].apply(binary_classification)
# Assuming hate label for the sample text is 0.3, it is converted to 0

data.dropna(subset=['text'], inplace=True)
# If text column has null value, drop it

data['text'] = data['text'].apply(lambda x: ' '.join(x.split()))


## -------------------------------------------------------------
# <font color='purple'>DATASET AFTER PREPROCESSING</font>
## -------------------------------------------------------------

In [386]:
print(data['text'].iloc[10000:21000])

10000                                                                        pm pdplbn ind
10001                          srsly though whatever you still philippines you respect him
10002                                 using children political winning mr you call decency
10003                               yung famewhore mong classmate mahilig manira ibang tao
10004                                                                                     
                                               ...                                        
18035    hoe talking your man she problem he fact hoes feel welcomed reflects he acts behi
18036                                                                     sounds bad bitch
18037                                                                          pussy bitch
18038                                                       seen scooby hoe niggas gardens
18039                                                                   bitch blew my high

In [387]:
data.dropna(subset=['text'], inplace=True)
# If text column has null value, drop it
data.dropna(subset=['label'], inplace=True)
# If text column has null value, drop it
# Reset the index.
data.reset_index(drop=True, inplace=True)
#Dataset after preprocessing
data
label_counts = data['label'].value_counts()
print(label_counts)
data
# Drop null values.
data.dropna(inplace=True)

label
0    9539
1    8501
Name: count, dtype: int64


In [388]:
# data['word_count'] = data['text'].apply(lambda x: len(x.split()))

# # Drop rows where the word count is less than or equal to 1
# data = data[data['word_count'] > 3]

# # Drop the 'word_count' column as it's no longer needed
# data.drop('word_count', axis=1, inplace=True)

# # Display the DataFrame after dropping rows with one term or one word in the 'text' column
# print(data)

# data = data[data['text'].str.strip() != '']

# # Now, you can recheck if there are any missing values in the 'text' column
# if data['text'].isna().any():
#     print("There are still NA values in the 'text' column.")
# else:
#     print("There are no missing values in the 'text' column.")

                                                                                    text  \
0                                                 gastos political halos inaasahan taong   
1                                                tang ina tuwid daan daw eh sya straight   
2                                                            salamat sawang suporta taga   
4                         selective amnesia forgetting past six years he spent preparing   
5                             matter whoever won long they finally changed noynoy failed   
...                                                                                  ...   
18033                                                        hmu you stop bein lil bitch   
18034                                              money bitches bc bitches follow money   
18035  hoe talking your man she problem he fact hoes feel welcomed reflects he acts behi   
18038                                                     seen scooby hoe niggas

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('word_count', axis=1, inplace=True)


## -------------------------------------------------------------
# <font color='purple'>EXPORTING PREPROCESSED DATASET AS CSV</font>
## -------------------------------------------------------------

In [389]:
data.to_csv('latestPre.csv', index=False)

## -------------------------------------------------------------
# <font color='purple'>PREPROCESSING DONE</font>
## -------------------------------------------------------------