# Data Pre-processing

### Import Libraries

In [1]:
# main libraries
import numpy as np
import pandas as pd
import nltk

# For Data processing/cleaning
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import STOPWORDS
from bs4 import BeautifulSoup
import re
from nltk.tokenize.toktok import ToktokTokenizer
import os
import warnings


In [2]:
# import sys
# sys.path.append("stemmer\TagalogStemmerPython")

# import TglStemmer

In [3]:
# Load the tweet dataset from a CSV file; the relative path is resolved against
# the notebook's current working directory.
dataset = pd.read_csv("nlp_research_dataset_1000.csv")

In [4]:
dataset.shape

(1000, 4)

In [5]:
dataset.head()

Unnamed: 0,date,user,text,label
0,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,-1
1,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,-1
2,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,-1
3,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,-1
4,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...",-1


In [6]:
dataset.isna().any()

date     False
user     False
text     False
label    False
dtype: bool

In [7]:
# df2 = dataset.drop_duplicates(subset=["text"], keep='first')
# df2.shape
# Work on an explicit, independent copy: `dataset[:]` only slices the frame, so
# adding columns to the result raises SettingWithCopyWarning and blurs the line
# between the raw data and the cleaned working frame.
df2 = dataset.copy()

### Removing @names


In [8]:
def remove_pattern(text, pattern):
    """Return `text` with every match of the regex `pattern` removed.

    Parameters
    ----------
    text : str
        The string to clean (for example a single tweet).
    pattern : str
        Regular expression describing what to strip (for example an @mention).

    Returns
    -------
    str
        `text` with all non-overlapping matches of `pattern` deleted.
        Surrounding whitespace is left untouched.

    Notes
    -----
    The substitution is done with `pattern` itself. Feeding each matched
    substring back into `re.sub` as a new regex is unsafe: a short match can
    eat the prefix of a longer one ("@a" inside "@ab"), and metacharacters in
    the matched text are interpreted as regex syntax. For patterns containing
    capture groups the whole match is removed, not just the group contents.
    """
    return re.sub(pattern, "", text)

In [9]:
# Strip @mentions from every tweet into a new `tidy_tweets` column.
# The pattern is a raw string so the backslash escape reaches the regex engine
# verbatim (the non-raw form is an invalid string escape), and Series.apply is
# used instead of np.vectorize because the latter raises on an empty input.
df2['tidy_tweets'] = df2['text'].apply(remove_pattern, args=(r"@[\w]*",))

df2.head(10)

NameError: name 'df2' is not defined

### Removing Punctuation, Numbers, and Special Characters

In [None]:
# Drop link tokens first: once punctuation is blanked out, a URL falls apart into
# stray fragments ("https t co ...") that can no longer be recognised as a link.
# Then replace everything except letters and '#' with a space.
# regex=True is required because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text unchanged.
df2['tidy_tweets'] = (
    df2['tidy_tweets']
    .str.replace(r"\S*http\S*", " ", regex=True)
    .str.replace(r"[^a-zA-Z#]", " ", regex=True)
)
df2.head(10)

### Removing links

In [None]:
# For each tweet, discard every whitespace-separated token that contains 'http'
# and glue the surviving tokens back together with single spaces.
cleaned_tweets = [
    ' '.join(token for token in tweet.split() if 'http' not in token)
    for tweet in df2['tidy_tweets']
]

df2['tidy_tweets'] = cleaned_tweets
df2.head(10)

### Remove hashtag symbols, mentions, URLs, and other non-alphanumeric characters

In [None]:
def clean_char(text):
    """Replace mentions, URLs and non-alphanumeric characters with spaces.

    Three kinds of span are blanked out:
      * ``@`` followed by letters/digits (mentions),
      * any single character that is not a letter, digit, space or tab
        (this includes ``#`` and punctuation),
      * ``scheme://...`` style links.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned text with runs of whitespace collapsed to single spaces
        and no leading or trailing whitespace.
    """
    # Raw string: the regex escapes are no longer invalid *string* escapes,
    # which Python 3.12+ reports as a SyntaxWarning. The compiled pattern is
    # equivalent to the previous non-raw literal.
    noise_pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    blanked = re.sub(noise_pattern, " ", text)
    # split()/join collapses whitespace runs and trims both ends.
    return ' '.join(blanked.split())
# Run clean_char over every entry of the tidy_tweets column, overwriting it in place.
df2['tidy_tweets']=df2['tidy_tweets'].apply(clean_char)

In [None]:
df2.head(10)

### Remove rows with empty texts

In [None]:
# Keep only the rows whose cleaned text is not the empty string.
non_empty_mask = df2['tidy_tweets'] != ''
tweets_df = df2[non_empty_mask]
tweets_df.head(10)

### Drop Duplicates

In [None]:
# tweets_df.drop_duplicates(subset=['tidy_tweets'], keep='first')
# tweets_df.shape

### Reset Index


In [None]:
# Renumber the rows from 0 after the empty-text rows were filtered out;
# drop=True discards the old index instead of keeping it as a column.
tweets_df = tweets_df.reset_index(drop=True)
tweets_df

In [None]:
tweets_df.shape

### Remove special characters again

In [None]:
# Replace everything except letters, '#' and spaces (e.g. the digits that
# survived the previous steps) with a space.
# regex=True is required: pandas >= 2.0 otherwise treats the pattern as a literal
# string and the column would simply be copied unchanged.
tweets_df['absolute_tidy_tweets'] = tweets_df['tidy_tweets'].str.replace(r"[^a-zA-Z# ]", " ", regex=True)

In [None]:
tweets_df

# Remove English and Filipino Stopwords

In [None]:
# Shared tokenizer instance; the stopword-removal functions below call
# tokenizer.tokenize() on each tweet.
tokenizer = ToktokTokenizer()

In [None]:
# English stopword list from the NLTK 'stopwords' corpus, accessed through the
# `stopwords` reader imported at the top of the notebook.
stopword_eng = stopwords.words('english')
stopword_fil = ["akin","aking","ako","alin","am","amin","aming","ang","ano","anumang","apat","at","atin","ating","ay","bababa","bago","bakit","bawat","bilang","dahil","dalawa","dapat","din","dito","doon","gagawin","gayunman","ginagawa","ginawa","ginawang","gumawa","gusto","habang","hanggang","hindi","huwag","iba","ibaba","ibabaw","ibig","ikaw","ilagay","ilalim","ilan","inyong","isa","isang","itaas","ito","iyo","iyon","iyong","ka","kahit","kailangan","kailanman","kami","kanila","kanilang","kanino","kanya","kanyang","kapag","kapwa","karamihan","katiyakan","katulad","kaya","kaysa","ko","kong","kulang","kumuha","kung","laban","lahat","lamang","likod","lima","maaari","maaaring","maging","mahusay","makita","marami","marapat","masyado","may","mayroon","mga","minsan","mismo","mula","muli","na","nabanggit","naging","nagkaroon","nais","nakita","namin","napaka","narito","nasaan","ng","ngayon","ni","nila","nilang","nito","niya","niyang","noon","o","pa","paano","pababa","paggawa","pagitan","pagkakaroon","pagkatapos","palabas","pamamagitan","panahon","pangalawa","para","paraan","pareho","pataas","pero","pumunta","pumupunta","sa","saan","sabi","sabihin","sarili","sila","sino","siya","tatlo","tayo","tulad","tungkol","una","walang"]

In [None]:
print(stopword_eng)

In [None]:
print(stopword_fil)

### Remove English stopwords

In [None]:
def remove_stopwords(text, is_lower_case=False, stopword_list=None):
    """Return `text` with stopword tokens removed.

    Parameters
    ----------
    text : str
        The text to filter; it is split into tokens with the module-level
        `tokenizer`.
    is_lower_case : bool, default False
        If True, tokens are compared against the stopwords as-is. If False,
        each token is lower-cased for the comparison only.
    stopword_list : iterable of str, optional
        Stopwords to remove. Defaults to the English list `stopword_eng`.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, in their original casing.
    """
    if stopword_list is None:
        stopword_list = stopword_eng
    # A set makes each membership test O(1) instead of scanning the whole list.
    blocked = set(stopword_list)

    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [token for token in tokens if token not in blocked]
    else:
        kept = [token for token in tokens if token.lower() not in blocked]
    return ' '.join(kept)

tweets_df['stopped_tweets'] = tweets_df['absolute_tidy_tweets'].apply(remove_stopwords)

In [None]:
tweets_df

### Remove Filipino Stopwords

In [None]:
# NOTE: this rebinds the name `remove_stopwords` used by the English-stopword
# cell earlier in the notebook; re-running that earlier cell's call after this
# cell has executed would apply the Filipino defaults instead.
def remove_stopwords(text, is_lower_case=False, stopword_list=None):
    """Return `text` lower-cased and with stopword tokens removed.

    Parameters
    ----------
    text : str
        The text to filter; it is split into tokens with the module-level
        `tokenizer`.
    is_lower_case : bool, default False
        If True, tokens are compared against the stopwords as-is. If False,
        each token is lower-cased for the comparison only.
    stopword_list : iterable of str, optional
        Stopwords to remove. Defaults to the Filipino list `stopword_fil`.

    Returns
    -------
    str
        The surviving tokens joined by single spaces and converted to
        lower case.
    """
    if stopword_list is None:
        stopword_list = stopword_fil
    # A set makes each membership test O(1) instead of scanning the whole list.
    blocked = set(stopword_list)

    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [token for token in tokens if token not in blocked]
    else:
        kept = [token for token in tokens if token.lower() not in blocked]
    return ' '.join(kept).lower()

tweets_df['stopped_tweets'] = tweets_df['stopped_tweets'].apply(remove_stopwords)

## Export to CSV for backup

In [None]:
# Write the cleaned frame to disk as a checkpoint. No index argument is passed,
# so the row index is written as an extra leading column.
tweets_df.to_csv('02_Data_wo_Stopwords.csv')

## Label Sentiments Automatically

Compare the TextBlob and NLTK (VADER) sentiment analyzers. Both experiments are currently commented out.

In [None]:
# from textblob import TextBlob
# from textblob.sentiments import NaiveBayesAnalyzer
# from textblob.np_extractors import ConllExtractor

# def fetch_sentiment_using_textblob(text):
#     analysis = TextBlob(text)
#     return 'pos' if analysis.sentiment.polarity >= 0 else 'neg'

In [None]:
# sentiments_using_textblob = tweets_df.text.apply(lambda tweet: fetch_sentiment_using_textblob(tweet))
# tweets_df['sentiment'] = sentiments_using_textblob
# pd.DataFrame(sentiments_using_textblob.value_counts())

In [None]:
# from nltk.sentiment.vader import SentimentIntensityAnalyzer

# def fetch_sentiment_using_SIA(text):
#     sid = SentimentIntensityAnalyzer()
#     polarity_scores = sid.polarity_scores(text)
#     return 'neg' if polarity_scores['neg'] > polarity_scores['pos'] else 'pos'

In [None]:
# sentiments_using_SIA = tweets_df.text.apply(lambda tweet: fetch_sentiment_using_SIA(tweet))
# tweets_df['sentiment'] = sentiments_using_SIA
# pd.DataFrame(sentiments_using_SIA.value_counts())

In [None]:
# tweets_df.loc[tweets_df.sentiment == 'neg']

### Export to CSV

In [None]:
# NOTE(review): the automatic sentiment-labelling cells above are commented out,
# so no `sentiment` column is added before this export; the frame written here
# has the same columns as the stopword-removal checkpoint.
tweets_df.to_csv('03_Data_with_sentiment.csv')