## Label Sentiments

In [17]:
# main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# For Data processing/cleaning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import STOPWORDS
from bs4 import BeautifulSoup
import re
from nltk.tokenize.toktok import ToktokTokenizer
import os
import warnings
from numpy import loadtxt
from nltk import tokenize

# import WhitespaceTokenizer() method from nltk
from nltk.tokenize import WhitespaceTokenizer

In [2]:
dataset = pd.read_csv("03_Data_with_sentiment.csv")

In [3]:
#import text file into NumPy array
neg_fil = loadtxt('negative_words_tl.txt', dtype='object')
pos_fil = loadtxt('positive_words_tl.txt', dtype='object')
badwords_fil = ["amputa","animal ka","bilat","binibrocha","bobo","bogo","boto","brocha","burat","bwesit","bwisit","demonyo ka","engot","etits","gaga","gagi","gago","habal","hayop ka","hayup","hinampak","hinayupak","hindot","hindutan","hudas","iniyot","inutel","inutil","iyot","kagaguhan","kagang","kantot","kantotan","kantut","kantutan","kaululan","kayat","kiki","kikinginamo","kingina","kupal","leche","leching","lechugas","lintik","nakakaburat","nimal","ogag","olok","pakingshet","pakshet","pakyu","pesteng yawa","poke","poki","pokpok","poyet","pu'keng","pucha","puchanggala","puchangina","puke","puki","pukinangina","puking","punyeta","puta","putang","putang ina","putangina","putanginamo","putaragis","putragis","puyet","ratbu","shunga","sira ulo","siraulo","suso","susu","tae","taena","tamod","tanga","tangina","taragis","tarantado","tete","teti","timang","tinil","tite","titi","tungaw","ulol","ulul","ungas"]

In [5]:
neg_fil = np.concatenate((neg_fil, badwords_fil))

In [6]:
neg_eng = loadtxt('negative_words_en.txt', dtype='object')
pos_eng = loadtxt('positive_words_en.txt', dtype='object')

In [7]:
negative_data = np.concatenate((neg_fil, neg_eng))
positive_data = np.concatenate((pos_fil, pos_eng))

In [8]:
negative_data.shape

(5926,)

In [9]:
positive_data.shape

(2818,)

In [11]:
dataset = dataset[['absolute_tidy_tweets', 'stopped_tweets']]

In [12]:
dataset

Unnamed: 0,absolute_tidy_tweets,stopped_tweets
0,#academicbreaknow,#academicbreaknow
1,Hello #AcademicTwitter You need to have a rest...,hello #academictwitter need rest activate best...
2,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,#AcademicBreakNow gusto q na tapusin Wednesday,#academicbreaknow q tapusin wednesday
4,super delay na ako sa tbw list ko #academicbre...,super delay tbw list #academicbreaknow
...,...,...
29588,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break
29589,Actually Lenin wasn t peer reviewed by establi...,actually lenin peer reviewed established acade...
29590,Academic break plss,academic break plss
29591,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...


In [23]:
def label_positive(text):
    tk = WhitespaceTokenizer()
    tokens = tk.tokenize(text)
    tokens = [token.strip() for token in tokens]
    positive = 0
    for token in tokens:
        if token in positive_data:
            positive = positive + 1
    return positive

In [24]:
def label_negative(text):
    tk = WhitespaceTokenizer()
    tokens = tk.tokenize(text)
    tokens = [token.strip() for token in tokens]
    negative = 0
    for token in tokens:
        if token in negative_data:
            negative = negative -1
    return negative


In [25]:
dataset['positive']=dataset['absolute_tidy_tweets'].apply(label_positive)

In [26]:
dataset

Unnamed: 0,absolute_tidy_tweets,stopped_tweets,"(positive, negative)",positive,negative
0,#academicbreaknow,#academicbreaknow,"[0, 0]",0,
1,Hello #AcademicTwitter You need to have a rest...,hello #academictwitter need rest activate best...,"[2, 0]",2,
2,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow,"[0, 0]",0,
3,#AcademicBreakNow gusto q na tapusin Wednesday,#academicbreaknow q tapusin wednesday,"[1, 0]",1,
4,super delay na ako sa tbw list ko #academicbre...,super delay tbw list #academicbreaknow,"[1, -1]",1,
...,...,...,...,...,...
29588,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break,"[0, -2]",0,
29589,Actually Lenin wasn t peer reviewed by establi...,actually lenin peer reviewed established acade...,"[2, -3]",2,
29590,Academic break plss,academic break plss,"[0, -1]",0,
29591,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,"[0, -6]",0,


In [27]:
dataset['negative']=dataset['absolute_tidy_tweets'].apply(label_negative)

In [36]:
dataset.drop('(positive, negative)', inplace=True, axis=1)

KeyError: "['(positive, negative)'] not found in axis"

In [28]:
def count_token(text):
    tk = WhitespaceTokenizer()
    tokens = tk.tokenize(text)
    tokens = [token.strip() for token in tokens]
    count = len(tokens)
    return count

In [29]:
dataset

Unnamed: 0,absolute_tidy_tweets,stopped_tweets,"(positive, negative)",positive,negative
0,#academicbreaknow,#academicbreaknow,"[0, 0]",0,0
1,Hello #AcademicTwitter You need to have a rest...,hello #academictwitter need rest activate best...,"[2, 0]",2,0
2,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow,"[0, 0]",0,0
3,#AcademicBreakNow gusto q na tapusin Wednesday,#academicbreaknow q tapusin wednesday,"[1, 0]",1,0
4,super delay na ako sa tbw list ko #academicbre...,super delay tbw list #academicbreaknow,"[1, -1]",1,-1
...,...,...,...,...,...
29588,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break,"[0, -2]",0,-2
29589,Actually Lenin wasn t peer reviewed by establi...,actually lenin peer reviewed established acade...,"[2, -3]",2,-3
29590,Academic break plss,academic break plss,"[0, -1]",0,-1
29591,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,"[0, -6]",0,-6


In [None]:
dataset['total']=dataset['absolute_tidy_tweets'].apply(count_token)

In [22]:
dataset

Unnamed: 0,absolute_tidy_tweets,stopped_tweets,"(positive, negative)",positive,negative
0,#academicbreaknow,#academicbreaknow,"[0, 0]",,
1,Hello #AcademicTwitter You need to have a rest...,hello #academictwitter need rest activate best...,"[2, 0]",,
2,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow,"[0, 0]",,
3,#AcademicBreakNow gusto q na tapusin Wednesday,#academicbreaknow q tapusin wednesday,"[1, 0]",,
4,super delay na ako sa tbw list ko #academicbre...,super delay tbw list #academicbreaknow,"[1, -1]",,
...,...,...,...,...,...
29588,si taylor nay nag implement ug academic break,si taylor nay nag implement ug academic break,"[0, -2]",,
29589,Actually Lenin wasn t peer reviewed by establi...,actually lenin peer reviewed established acade...,"[2, -3]",,
29590,Academic break plss,academic break plss,"[0, -1]",,
29591,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...,"[0, -6]",,
