# Data Pre-processing

### Import Libraries

In [1]:
# main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# For Data processing/cleaning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import STOPWORDS
from bs4 import BeautifulSoup
import re
from nltk.tokenize.toktok import ToktokTokenizer
import os
import warnings


In [2]:
# import sys
# sys.path.append("stemmer\TagalogStemmerPython")

# import TglStemmer

In [3]:
dataset = pd.read_csv("All_Data.csv")

In [4]:
dataset.shape

(36701, 3)

In [5]:
dataset.head()

Unnamed: 0,date,user,text
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav..."
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...


In [6]:
dataset.isna().any()

date    False
user    False
text    False
dtype: bool

In [7]:
df2 = dataset.drop_duplicates(subset=["text"], keep='first')
df2.shape

(29596, 3)

### Removing @names


In [8]:
def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

In [9]:
df2['tidy_tweets'] = np.vectorize(remove_pattern)(df2['text'], "@[\w]*")

df2.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['tidy_tweets'] = np.vectorize(remove_pattern)(df2['text'], "@[\w]*")


Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...","Hello, #AcademicTwitter,\n\n🎙️""You need to hav..."
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
5,2022-11-24 15:46:35+00:00,zellyze,@angewwaa same beh 😭 academicbreaknow!!!,same beh 😭 academicbreaknow!!!
6,2022-11-24 15:40:42+00:00,louvri_,#AcademicBreakNow,#AcademicBreakNow
7,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,#/academicbreaknow tsngina pagod na 'ko magpaypay
8,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...","Pagoda ang accla, 4hours tulog gising 3:40am l..."
10,2022-11-22 15:31:24+00:00,rielles_cart,"sa letra ng p, putangina pagod na ko #academic...","sa letra ng p, putangina pagod na ko #academic..."


###  Removing Punctuations, Numbers, and Special Characters

In [10]:
df2['tidy_tweets'] = df2['tidy_tweets'].str.replace("[^a-zA-Z#]", " ")
df2.head(10)

  df2['tidy_tweets'] = df2['tidy_tweets'].str.replace("[^a-zA-Z#]", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['tidy_tweets'] = df2['tidy_tweets'].str.replace("[^a-zA-Z#]", " ")


Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have ...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
5,2022-11-24 15:46:35+00:00,zellyze,@angewwaa same beh 😭 academicbreaknow!!!,same beh academicbreaknow
6,2022-11-24 15:40:42+00:00,louvri_,#AcademicBreakNow,#AcademicBreakNow
7,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,# academicbreaknow tsngina pagod na ko magpaypay
8,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...",Pagoda ang accla hours tulog gising am l...
10,2022-11-22 15:31:24+00:00,rielles_cart,"sa letra ng p, putangina pagod na ko #academic...",sa letra ng p putangina pagod na ko #academic...


### Removing links

In [11]:
cleaned_tweets = []

for index, row in df2.iterrows():
    # Here we are filtering out all the words that contains link
    words_without_links = [word for word in row.tidy_tweets.split() if 'http' not in word]
    cleaned_tweets.append(' '.join(words_without_links))

df2['tidy_tweets'] = cleaned_tweets
df2.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['tidy_tweets'] = cleaned_tweets


Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
5,2022-11-24 15:46:35+00:00,zellyze,@angewwaa same beh 😭 academicbreaknow!!!,same beh academicbreaknow
6,2022-11-24 15:40:42+00:00,louvri_,#AcademicBreakNow,#AcademicBreakNow
7,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,# academicbreaknow tsngina pagod na ko magpaypay
8,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...",Pagoda ang accla hours tulog gising am lecture...
10,2022-11-22 15:31:24+00:00,rielles_cart,"sa letra ng p, putangina pagod na ko #academic...",sa letra ng p putangina pagod na ko #academicb...


### Remove rows with empty texts

In [12]:
tweets_df = df2[df2['tidy_tweets']!='']
tweets_df.head(10)

Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
5,2022-11-24 15:46:35+00:00,zellyze,@angewwaa same beh 😭 academicbreaknow!!!,same beh academicbreaknow
6,2022-11-24 15:40:42+00:00,louvri_,#AcademicBreakNow,#AcademicBreakNow
7,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,# academicbreaknow tsngina pagod na ko magpaypay
8,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...",Pagoda ang accla hours tulog gising am lecture...
10,2022-11-22 15:31:24+00:00,rielles_cart,"sa letra ng p, putangina pagod na ko #academic...",sa letra ng p putangina pagod na ko #academicb...


### Drop Duplicates

In [13]:
tweets_df.drop_duplicates(subset=['tidy_tweets'], keep='first')
tweets_df.shape

(29593, 4)

### Reset Index


In [14]:
tweets_df = tweets_df.reset_index(drop=True)
tweets_df

Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
...,...,...,...,...
29588,2022-10-21 05:27:01+00:00,dinnyyyyyyyy,@aicannot si taylor nay nag implement ug acade...,si taylor nay nag implement ug academic break
29589,2022-10-21 05:24:05+00:00,stalinistberet,"@RodericDay ""Actually Lenin wasn't peer review...",Actually Lenin wasn t peer reviewed by establi...
29590,2022-10-21 05:15:33+00:00,6Senoritamae,Academic break plss😭,Academic break plss
29591,2022-10-21 04:49:59+00:00,treysinsmxx,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...


In [15]:
tweets_df.shape

(29593, 4)

### Remove special characters again

In [16]:
tweets_df['absolute_tidy_tweets'] = tweets_df['tidy_tweets'].str.replace("[^a-zA-Z# ]", "")

  tweets_df['absolute_tidy_tweets'] = tweets_df['tidy_tweets'].str.replace("[^a-zA-Z# ]", "")


In [17]:
tweets_df

Unnamed: 0,date,user,text,tidy_tweets,absolute_tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...,Hello #AcademicTwitter You need to have a rest...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
...,...,...,...,...,...
29588,2022-10-21 05:27:01+00:00,dinnyyyyyyyy,@aicannot si taylor nay nag implement ug acade...,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break
29589,2022-10-21 05:24:05+00:00,stalinistberet,"@RodericDay ""Actually Lenin wasn't peer review...",Actually Lenin wasn t peer reviewed by establi...,Actually Lenin wasn t peer reviewed by establi...
29590,2022-10-21 05:15:33+00:00,6Senoritamae,Academic break plss😭,Academic break plss,Academic break plss
29591,2022-10-21 04:49:59+00:00,treysinsmxx,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...


# Remove English and Filipino Stopwords

In [18]:
tokenizer = ToktokTokenizer()

In [19]:
stopword_eng = nltk.corpus.stopwords.words('english')
stopword_fil = ["akin","aking","ako","alin","am","amin","aming","ang","ano","anumang","apat","at","atin","ating","ay","bababa","bago","bakit","bawat","bilang","dahil","dalawa","dapat","din","dito","doon","gagawin","gayunman","ginagawa","ginawa","ginawang","gumawa","gusto","habang","hanggang","hindi","huwag","iba","ibaba","ibabaw","ibig","ikaw","ilagay","ilalim","ilan","inyong","isa","isang","itaas","ito","iyo","iyon","iyong","ka","kahit","kailangan","kailanman","kami","kanila","kanilang","kanino","kanya","kanyang","kapag","kapwa","karamihan","katiyakan","katulad","kaya","kaysa","ko","kong","kulang","kumuha","kung","laban","lahat","lamang","likod","lima","maaari","maaaring","maging","mahusay","makita","marami","marapat","masyado","may","mayroon","mga","minsan","mismo","mula","muli","na","nabanggit","naging","nagkaroon","nais","nakita","namin","napaka","narito","nasaan","ng","ngayon","ni","nila","nilang","nito","niya","niyang","noon","o","pa","paano","pababa","paggawa","pagitan","pagkakaroon","pagkatapos","palabas","pamamagitan","panahon","pangalawa","para","paraan","pareho","pataas","pero","pumunta","pumupunta","sa","saan","sabi","sabihin","sarili","sila","sino","siya","tatlo","tayo","tulad","tungkol","una","walang"]

In [20]:
print(stopword_eng)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [21]:
print(stopword_fil)

['akin', 'aking', 'ako', 'alin', 'am', 'amin', 'aming', 'ang', 'ano', 'anumang', 'apat', 'at', 'atin', 'ating', 'ay', 'bababa', 'bago', 'bakit', 'bawat', 'bilang', 'dahil', 'dalawa', 'dapat', 'din', 'dito', 'doon', 'gagawin', 'gayunman', 'ginagawa', 'ginawa', 'ginawang', 'gumawa', 'gusto', 'habang', 'hanggang', 'hindi', 'huwag', 'iba', 'ibaba', 'ibabaw', 'ibig', 'ikaw', 'ilagay', 'ilalim', 'ilan', 'inyong', 'isa', 'isang', 'itaas', 'ito', 'iyo', 'iyon', 'iyong', 'ka', 'kahit', 'kailangan', 'kailanman', 'kami', 'kanila', 'kanilang', 'kanino', 'kanya', 'kanyang', 'kapag', 'kapwa', 'karamihan', 'katiyakan', 'katulad', 'kaya', 'kaysa', 'ko', 'kong', 'kulang', 'kumuha', 'kung', 'laban', 'lahat', 'lamang', 'likod', 'lima', 'maaari', 'maaaring', 'maging', 'mahusay', 'makita', 'marami', 'marapat', 'masyado', 'may', 'mayroon', 'mga', 'minsan', 'mismo', 'mula', 'muli', 'na', 'nabanggit', 'naging', 'nagkaroon', 'nais', 'nakita', 'namin', 'napaka', 'narito', 'nasaan', 'ng', 'ngayon', 'ni', 'nila',

### Remove english stopwords

In [22]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_eng]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_eng]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

tweets_df['stopped_tweets']=tweets_df['absolute_tidy_tweets'].apply(remove_stopwords)

In [23]:
tweets_df

Unnamed: 0,date,user,text,tidy_tweets,absolute_tidy_tweets,stopped_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...,Hello #AcademicTwitter You need to have a rest...,Hello #AcademicTwitter need rest activate best...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
...,...,...,...,...,...,...
29588,2022-10-21 05:27:01+00:00,dinnyyyyyyyy,@aicannot si taylor nay nag implement ug acade...,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break
29589,2022-10-21 05:24:05+00:00,stalinistberet,"@RodericDay ""Actually Lenin wasn't peer review...",Actually Lenin wasn t peer reviewed by establi...,Actually Lenin wasn t peer reviewed by establi...,Actually Lenin peer reviewed established acade...
29590,2022-10-21 05:15:33+00:00,6Senoritamae,Academic break plss😭,Academic break plss,Academic break plss,Academic break plss
29591,2022-10-21 04:49:59+00:00,treysinsmxx,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...


### Remove Filipino Stopwords

In [24]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_fil]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_fil]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

tweets_df['stopped_tweets']=tweets_df['stopped_tweets'].apply(remove_stopwords)

In [25]:
tweets_df

Unnamed: 0,date,user,text,tidy_tweets,absolute_tidy_tweets,stopped_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...,Hello #AcademicTwitter You need to have a rest...,Hello #AcademicTwitter need rest activate best...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow q tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...,super delay tbw list #academicbreaknow
...,...,...,...,...,...,...
29588,2022-10-21 05:27:01+00:00,dinnyyyyyyyy,@aicannot si taylor nay nag implement ug acade...,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break
29589,2022-10-21 05:24:05+00:00,stalinistberet,"@RodericDay ""Actually Lenin wasn't peer review...",Actually Lenin wasn t peer reviewed by establi...,Actually Lenin wasn t peer reviewed by establi...,Actually Lenin peer reviewed established acade...
29590,2022-10-21 05:15:33+00:00,6Senoritamae,Academic break plss😭,Academic break plss,Academic break plss,Academic break plss
29591,2022-10-21 04:49:59+00:00,treysinsmxx,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...


## export to csv for backup

In [26]:
# tweets_df.to_csv('Data_wo_Stopwords.csv')

## Label Sentiments Automatically

Compare Textblob and NLTK Sentiment analyzer

In [27]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import ConllExtractor

def fetch_sentiment_using_textblob(text):
    analysis = TextBlob(text)
    return 'pos' if analysis.sentiment.polarity >= 0 else 'neg'

In [44]:
sentiments_using_textblob = tweets_df.text.apply(lambda tweet: fetch_sentiment_using_textblob(tweet))
tweets_df['sentiment'] = sentiments_using_textblob
pd.DataFrame(sentiments_using_textblob.value_counts())

Unnamed: 0,text
pos,26793
neg,2800


In [45]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def fetch_sentiment_using_SIA(text):
    sid = SentimentIntensityAnalyzer()
    polarity_scores = sid.polarity_scores(text)
    return 'neg' if polarity_scores['neg'] > polarity_scores['pos'] else 'pos'

In [46]:
sentiments_using_SIA = tweets_df.text.apply(lambda tweet: fetch_sentiment_using_SIA(tweet))
tweets_df['sentiment'] = sentiments_using_textblob
pd.DataFrame(sentiments_using_SIA.value_counts())

Unnamed: 0,text
pos,24757
neg,4836


In [55]:
tweets_df.loc[tweets_df.sentiment == 'neg']

Unnamed: 0,date,user,text,tidy_tweets,absolute_tidy_tweets,stopped_tweets,sentiment
11,2022-11-21 14:33:33+00:00,joluvre,@tkluvon it’s so hard to keep up gawa ng schoo...,it s so hard to keep up gawa ng school lately ...,it s so hard to keep up gawa ng school lately ...,hard keep gawa school lately pls #academicbrea...,neg
32,2022-11-15 13:36:49+00:00,kenevermore13,fuck ngayon palang ako mag sisimula #AcademicB...,fuck ngayon palang ako mag sisimula #AcademicB...,fuck ngayon palang ako mag sisimula #AcademicB...,fuck palang mag sisimula #AcademicBreakNow,neg
33,2022-11-15 10:51:19+00:00,flynn_jhnll,I miss kdrama and mga series😭 #AcademicBreakNo...,I miss kdrama and mga series #AcademicBreakNow...,I miss kdrama and mga series #AcademicBreakNow...,miss kdrama series #AcademicBreakNow emz hahah...,neg
35,2022-11-14 13:02:12+00:00,moonlightwonie,miss ko na mga koreano ko :(( #enhypen #academ...,miss ko na mga koreano ko #enhypen #academicbr...,miss ko na mga koreano ko #enhypen #academicbr...,miss koreano #enhypen #academicbreaknow,neg
38,2022-11-14 00:17:22+00:00,fxiryvall,Ang hirap na talaga pumasok wtf #AcademicBreak...,Ang hirap na talaga pumasok wtf #AcademicBreak...,Ang hirap na talaga pumasok wtf #AcademicBreak...,hirap talaga pumasok wtf #AcademicBreakNow #sc...,neg
...,...,...,...,...,...,...,...
29557,2022-10-21 10:14:47+00:00,CraigLBeaton,The administrators of Israeli Apartheid seek t...,The administrators of Israeli Apartheid seek t...,The administrators of Israeli Apartheid seek t...,administrators Israeli Apartheid seek break Pa...,neg
29573,2022-10-21 08:43:45+00:00,anthropicmissy,When the academic break is also our review wee...,When the academic break is also our review wee...,When the academic break is also our review wee...,academic break also review week basically brea...,neg
29580,2022-10-21 07:35:31+00:00,leiyluh,academic break but its only 2 days💀weekends do...,academic break but its only days weekends dont...,academic break but its only days weekends dont...,academic break days weekends dont count smh ig...,neg
29581,2022-10-21 07:26:58+00:00,angelo_fule,Parang isang araw lang yung isang linggo namin...,Parang isang araw lang yung isang linggo namin...,Parang isang araw lang yung isang linggo namin...,Parang araw lang yung linggo naming academic b...,neg


In [56]:
tweets_df.to_csv('03_Data_with_sentiment.csv')

## Label Tagalog Sentiments

In [61]:
from numpy import loadtxt

#import text file into NumPy array
neg_data = loadtxt('negative_words_tl.txt', dtype='object')
pos_data = loadtxt('positive_words_tl.txt', dtype='object')
badwords = ["amputa","animal ka","bilat","binibrocha","bobo","bogo","boto","brocha","burat","bwesit","bwisit","demonyo ka","engot","etits","gaga","gagi","gago","habal","hayop ka","hayup","hinampak","hinayupak","hindot","hindutan","hudas","iniyot","inutel","inutil","iyot","kagaguhan","kagang","kantot","kantotan","kantut","kantutan","kaululan","kayat","kiki","kikinginamo","kingina","kupal","leche","leching","lechugas","lintik","nakakaburat","nimal","ogag","olok","pakingshet","pakshet","pakyu","pesteng yawa","poke","poki","pokpok","poyet","pu'keng","pucha","puchanggala","puchangina","puke","puki","pukinangina","puking","punyeta","puta","putang","putang ina","putangina","putanginamo","putaragis","putragis","puyet","ratbu","shunga","sira ulo","siraulo","suso","susu","tae","taena","tamod","tanga","tangina","taragis","tarantado","tete","teti","timang","tinil","tite","titi","tungaw","ulol","ulul","ungas"]