## Label Sentiments

In [1]:
# main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# For Data processing/cleaning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import STOPWORDS
from bs4 import BeautifulSoup
import re
from nltk.tokenize.toktok import ToktokTokenizer
import os
import warnings
from numpy import loadtxt
from nltk import tokenize

# import WhitespaceTokenizer() method from nltk
from nltk.tokenize import WhitespaceTokenizer

In [2]:
dataset = pd.read_csv("03_Data_with_sentiment.csv")

# Combine English and Tagalog Stopwords

In [3]:
#import text file into NumPy array
neg_fil = loadtxt('negative_words_tl.txt', dtype='object')
pos_fil = loadtxt('positive_words_tl.txt', dtype='object')
badwords_fil = ["amputa","animal ka","bilat","binibrocha","bobo","bogo","boto","brocha","burat","bwesit","bwisit","demonyo ka","engot","etits","gaga","gagi","gago","habal","hayop ka","hayup","hinampak","hinayupak","hindot","hindutan","hudas","iniyot","inutel","inutil","iyot","kagaguhan","kagang","kantot","kantotan","kantut","kantutan","kaululan","kayat","kiki","kikinginamo","kingina","kupal","leche","leching","lechugas","lintik","nakakaburat","nimal","ogag","olok","pakingshet","pakshet","pakyu","pesteng yawa","poke","poki","pokpok","poyet","pu'keng","pucha","puchanggala","puchangina","puke","puki","pukinangina","puking","punyeta","puta","putang","putang ina","putangina","putanginamo","putaragis","putragis","puyet","ratbu","shunga","sira ulo","siraulo","suso","susu","tae","taena","tamod","tanga","tangina","taragis","tarantado","tete","teti","timang","tinil","tite","titi","tungaw","ulol","ulul","ungas"]

In [4]:
neg_fil = np.concatenate((neg_fil, badwords_fil))

In [5]:
neg_eng = loadtxt('negative_words_en.txt', dtype='object')
pos_eng = loadtxt('positive_words_en.txt', dtype='object')

In [6]:
negative_data = np.concatenate((neg_fil, neg_eng))
positive_data = np.concatenate((pos_fil, pos_eng))

In [7]:
negative_data.shape

(5926,)

In [8]:
positive_data.shape

(2818,)

# Crount positive and negative occurance
Rule-based sentiment labeling

In [9]:
dataset = dataset[['absolute_tidy_tweets', 'stopped_tweets']]

In [10]:
dataset

Unnamed: 0,absolute_tidy_tweets,stopped_tweets
0,Hello You need to have a rest To activate you...,hello need rest activate best thus try find ti...
1,lunes nanaman bukas,lunes nanaman bukas
2,gusto q na tapusin Wednesday,q tapusin wednesday
3,super delay na ako sa tbw list ko,super delay tbw list
4,same beh academicbreaknow,beh academicbreaknow
...,...,...
28858,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break
28859,Actually Lenin wasn t peer reviewed by establi...,actually lenin peer reviewed established acade...
28860,Academic break plss,academic break plss
28861,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...


In [11]:
def label_positive(text):
    tk = WhitespaceTokenizer()
    tokens = tk.tokenize(text)
    tokens = [token.strip() for token in tokens]
    positive = 0
    for token in tokens:
        if token in positive_data:
            positive = positive + 1
    return positive

In [12]:
def label_negative(text):
    tk = WhitespaceTokenizer()
    tokens = tk.tokenize(text)
    tokens = [token.strip() for token in tokens]
    negative = 0
    for token in tokens:
        if token in negative_data:
            negative = negative -1
    return negative


In [13]:
dataset = dataset.dropna()

In [14]:
dataset['positive']=dataset['absolute_tidy_tweets'].apply(label_positive)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['positive']=dataset['absolute_tidy_tweets'].apply(label_positive)


In [15]:
dataset

Unnamed: 0,absolute_tidy_tweets,stopped_tweets,positive
0,Hello You need to have a rest To activate you...,hello need rest activate best thus try find ti...,2
1,lunes nanaman bukas,lunes nanaman bukas,0
2,gusto q na tapusin Wednesday,q tapusin wednesday,1
3,super delay na ako sa tbw list ko,super delay tbw list,1
4,same beh academicbreaknow,beh academicbreaknow,0
...,...,...,...
28858,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break,0
28859,Actually Lenin wasn t peer reviewed by establi...,actually lenin peer reviewed established acade...,2
28860,Academic break plss,academic break plss,0
28861,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,0


In [16]:
dataset['negative']=dataset['absolute_tidy_tweets'].apply(label_negative)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['negative']=dataset['absolute_tidy_tweets'].apply(label_negative)


In [17]:
def count_token(text):
    tk = WhitespaceTokenizer()
    tokens = tk.tokenize(text)
    tokens = [token.strip() for token in tokens]
    count = len(tokens)
    return count

In [18]:
dataset['total']=dataset['absolute_tidy_tweets'].apply(count_token)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['total']=dataset['absolute_tidy_tweets'].apply(count_token)


In [19]:
dataset

Unnamed: 0,absolute_tidy_tweets,stopped_tweets,positive,negative,total
0,Hello You need to have a rest To activate you...,hello need rest activate best thus try find ti...,2,0,20
1,lunes nanaman bukas,lunes nanaman bukas,0,0,3
2,gusto q na tapusin Wednesday,q tapusin wednesday,1,0,5
3,super delay na ako sa tbw list ko,super delay tbw list,1,-1,8
4,same beh academicbreaknow,beh academicbreaknow,0,0,3
...,...,...,...,...,...
28858,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break,0,-2,8
28859,Actually Lenin wasn t peer reviewed by establi...,actually lenin peer reviewed established acade...,2,-3,29
28860,Academic break plss,academic break plss,0,-1,3
28861,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,0,-6,14


# Compute for Sentiment Score
* 1 to 0.5 — generally positive sentiment
* 0.5 to -0.5 — neutral sentiment
* -0.5 to -1 — negative sentiment

In [45]:
dataset['StSc'] = (dataset.positive + dataset.negative)/dataset.total

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['StSc'] = (dataset.positive + dataset.negative)/dataset.total


In [46]:
dataset['StSc'] = (dataset.positive + dataset.negative)/(dataset.positive - dataset.negative)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['StSc'] = (dataset.positive + dataset.negative)/(dataset.positive - dataset.negative)


In [47]:
dataset

Unnamed: 0,absolute_tidy_tweets,stopped_tweets,positive,negative,total,StSc,sentiment
0,Hello You need to have a rest To activate you...,hello need rest activate best thus try find ti...,2,0,20,1.0,1
1,lunes nanaman bukas,lunes nanaman bukas,0,0,3,,-1
2,gusto q na tapusin Wednesday,q tapusin wednesday,1,0,5,1.0,1
3,super delay na ako sa tbw list ko,super delay tbw list,1,-1,8,0.0,0
4,same beh academicbreaknow,beh academicbreaknow,0,0,3,,-1
...,...,...,...,...,...,...,...
28858,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break,0,-2,8,-1.0,-1
28859,Actually Lenin wasn t peer reviewed by establi...,actually lenin peer reviewed established acade...,2,-3,29,-0.2,-1
28860,Academic break plss,academic break plss,0,-1,3,-1.0,-1
28861,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,0,-6,14,-1.0,-1


In [48]:
dataset.to_csv('Sentiment_score.csv')

# Label Sentiment

In [54]:
# def label_sentiment(score):
#     sentiment = 0
#     if score >= 0.05:
#         sentiment = 1
#     elif score < 0.05 and score > -0.05:
#         sentiment = 0
#     else:
#         sentiment = -1
#     return sentiment


In [59]:
def label_sentiment(score):
    sentiment = 0
    if score > 0:
        sentiment = 1
    elif score == 0:
        sentiment = 0
    elif score < 0:
        sentiment = -1
    else:
        sentiment = 0
    return sentiment


In [60]:
dataset['sentiment']=dataset['StSc'].apply(label_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['sentiment']=dataset['StSc'].apply(label_sentiment)


In [61]:
dataset.to_csv('Sentiment.csv')

In [62]:
dataset['sentiment'].value_counts()

-1    21020
 1     4324
 0     2847
Name: sentiment, dtype: int64