In [2]:
import re
import json
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
import string
import pandas as pd
import mtranslate

In [3]:
# Matches every character that lies outside the two code-point ranges
# U+0000-U+D7FF and U+E000-U+FFFF.
RE_EMOJI = re.compile('[^\U00000000-\U0000d7ff\U0000e000-\U0000ffff]', flags=re.UNICODE)


def strip_emoji(text):
    """Return ``text`` with every character matched by ``RE_EMOJI`` removed.

    Characters inside U+0000-U+D7FF or U+E000-U+FFFF are kept unchanged, so
    symbols encoded in those ranges survive; everything else is deleted.
    """
    cleaned = RE_EMOJI.sub('', text)
    return cleaned

In [4]:
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Tags starting with 'J', 'N', 'R' or 'V' map to ``wn.ADJ``, ``wn.NOUN``,
    ``wn.ADV`` and ``wn.VERB`` respectively; any other tag yields ``None``.
    """
    # (tag prefix, attribute name on ``wn``) pairs, checked in order.
    prefix_to_attr = (('J', 'ADJ'), ('N', 'NOUN'), ('R', 'ADV'), ('V', 'VERB'))
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr)
    return None

In [5]:
# Sentence-level sentiment scoring of Instagram comments.
# Each comment is translated to English, split into sentences, and every
# sentence is scored by summing per-word scores (hand-tuned overrides first,
# SentiWordNet otherwise). One row per sentence ends up in `df`.
lemmatizer = WordNetLemmatizer()

# SentiWordNet scores 'please' as positive (it reads it as 'pleasing'), but
# commenters normally use it just to ask politely, so these tokens are dropped
# before tagging and scoring.
weird_sentiment_words = ['la', 'please', 'lah']

INPUT_PATH = 'instagram_fitnessfirst6months.json'
DF_COLUMNS = ['Post', 'Comment', 'Topic', 'Sentiment', 'Source']
SOURCE_LABEL = 'Instagram Comment Fitness First'


def _clean_sentence(s):
    """Lower-case a sentence and remove brand mentions and the standalone words 'yin'/'yang'."""
    s = s.lower()
    # Same replacement order as before: the longest handle first.
    for brand in ('fitnessfirstmalaysia', 'fitness first', 'fitness_first', 'fitness-first'):
        s = s.replace(brand, ' ')
    s = re.sub(r"\byin\b", "", s)
    s = re.sub(r"\byang\b", "", s)
    return s


def _score_sentence(s):
    """Score one cleaned sentence and collect its nouns.

    Returns a ``(sentiment, list_of_nouns)`` tuple: ``sentiment`` is the sum
    of the per-word scores, ``list_of_nouns`` the tokens kept as topics.

    Words for which WordNet has no synset are skipped entirely: they are not
    scored, are never added to ``list_of_nouns`` and do not update
    ``previous_word`` (so a preceding 'not' carries over them).
    """
    sentiment = 0.0

    tokenized = [x for x in word_tokenize(s) if x not in weird_sentiment_words]

    #POS tag each word
    tagged = pos_tag(tokenized)
    print(tagged)

    previous_word = ""
    list_of_nouns = []

    #Get the sentiment for each word
    for w, t in tagged:
        sentiment_added = False

        #Convert Penn Treebank POS tag to Wordnet POS tag
        wn_tag = penn_to_wn(t)

        #Lemmatize with the Wordnet POS tag when there is one, without it otherwise
        if wn_tag is None:
            lemma = lemmatizer.lemmatize(w)
        else:
            lemma = lemmatizer.lemmatize(w, pos=wn_tag)

        #Fall back to the raw token when the lemmatizer returns nothing
        if not lemma:
            print("Can't lemmatize: ", w)
            final_word = w
        else:
            print("Lemmatized word: ", lemma)
            final_word = lemma

        negated = previous_word == 'not'

        #Hand-tuned scores that take precedence over SentiWordnet
        if (final_word in ['cool', 'wow', 'glad']) and negated:
            print('Not is the previous word')
            sentiment = sentiment - 0.5
            sentiment_added = True
        elif (final_word in ['cool', 'wow', 'glad', 'lose', 'lost']):
            print(final_word, " has a score of 0.5")
            sentiment = sentiment + 0.5
            sentiment_added = True
        elif (final_word == 'nice') and negated:
            print('Not is the previous word')
            sentiment = sentiment - 0.875
            sentiment_added = True
        elif (final_word == 'nice'):
            print(final_word, " has a score of 0.875")
            sentiment = sentiment + 0.875
            sentiment_added = True
        elif (final_word == 'job') and (previous_word == 'great'):
            print(final_word, " has a score of 0.875")
            sentiment = sentiment + 0.875
            sentiment_added = True
        elif (final_word in ['terminate', 'stop', 'halt', 'cancel']) and negated:
            sentiment = sentiment + 0.75
            print("The word terminate, stop or halt is detected and the sentiment is +0.75")
            sentiment_added = True
        elif (final_word in ['terminate', 'stop', 'halt', 'cancel']):
            sentiment = sentiment - 0.75
            print("The word terminate, stop or halt is detected and the sentiment is -0.75")
            sentiment_added = True

        # An overridden word becomes the previous word even when the synset
        # lookup below finds nothing and skips the rest of the iteration.
        if sentiment_added:
            previous_word = final_word

        #If the Wordnet POS tag exists, use it to get the synset of the word
        if wn_tag is None:
            synsets = wn.synsets(final_word)
        else:
            synsets = wn.synsets(final_word, pos=wn_tag)

        #If no synsets found, skip the word
        if not synsets:
            print("Synset not found")
            continue

        # Take the first synset returned for the word
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())

        if not sentiment_added:
            if final_word == 'fitness':
                print("The word fitness is not considered as a sentiment")
            else:
                print(final_word, " | ", synset, " | ", swn_synset, " | ", synset.definition())
                if negated:
                    # Negation flips the word's net score, mirroring the
                    # hand-tuned overrides above. (The previous formula,
                    # `- pos - neg`, lowered the total even for "not bad".)
                    sentiment = sentiment - swn_synset.pos_score() + swn_synset.neg_score()
                else:
                    sentiment = sentiment + swn_synset.pos_score() - swn_synset.neg_score()

        previous_word = final_word

        #Extract nouns
        if t.startswith('N') or w == 'post':
            if w != 'i' and w != 'la' and w != 'hi':
                list_of_nouns.append(w)

    return sentiment, list_of_nouns


# Close the input file deterministically instead of leaking the handle.
with open(INPUT_PATH, encoding="utf8") as fh:
    file = json.load(fh)

# Rows are collected here and turned into a DataFrame once, after the loop.
records = []

for line in file['data']:
    try:
        print("New comment")
        print('---------------------------------------')
        sentence = line['comment']

        #Translate to English
        sentence = mtranslate.translate(sentence, 'en', 'auto')

        #Split a single comment into multiple sentences
        sentences = sent_tokenize(sentence)

        #Sentiment for each sentence
        for s in sentences:
            s = _clean_sentence(s)
            print("Sentence: ", s)

            sentiment, list_of_nouns = _score_sentence(s)

            # Entries without a 'post' key get the placeholder text.
            records.append([line.get('post', 'No message'), s, ','.join(list_of_nouns), sentiment, SOURCE_LABEL])

            # Zero is reported as neutral, matching the `!= 0` filter applied to `df` later
            if sentiment > 0:
                print("This sentence has a positive sentiment: " + str(sentiment))
            elif sentiment < 0:
                print("This sentence has a negative sentiment: " + str(sentiment))
            else:
                print("This sentence has a neutral sentiment: " + str(sentiment))
        print()
    except Exception as exc:
        # Best effort: one bad comment must not stop the run, but say why it
        # was skipped. KeyboardInterrupt is no longer swallowed.
        print("Skipping comment because of error: ", repr(exc))

# Earlier versions prepended each new row, so the most recently processed
# sentence sits at index 0; reversing the list keeps that ordering.
df = pd.DataFrame(records[::-1], columns=DF_COLUMNS)

New comment
---------------------------------------
Sentence:  hi @  why rpm class take off from ff ioi club?
[('hi', 'NN'), ('@', 'NNP'), ('why', 'WRB'), ('rpm', 'JJ'), ('class', 'NN'), ('take', 'VB'), ('off', 'RP'), ('from', 'IN'), ('ff', 'NN'), ('ioi', 'NN'), ('club', 'NN'), ('?', '.')]
Lemmatized word:  hi
hi  |  Synset('hello.n.01')  |  <hello.n.01: PosScore=0.0 NegScore=0.0>  |  an expression of greeting
Lemmatized word:  @
Synset not found
Lemmatized word:  why
why  |  Synset('why.n.01')  |  <why.n.01: PosScore=0.0 NegScore=0.0>  |  the cause or intention underlying an action or situation, especially in the phrase `the whys and wherefores'
Lemmatized word:  rpm
Synset not found
Lemmatized word:  class
class  |  Synset('class.n.01')  |  <class.n.01: PosScore=0.0 NegScore=0.0>  |  a collection of things sharing a common attribute
Lemmatized word:  take
take  |  Synset('take.v.01')  |  <take.v.01: PosScore=0.0 NegScore=0.0>  |  carry out
Lemmatized word:  off
off  |  Synset('away.r

Sentence:  woohoo @jaden_ee 😘
[('woohoo', 'NN'), ('@', 'NNP'), ('jaden_ee', 'NN'), ('😘', 'NN')]
Lemmatized word:  woohoo
Synset not found
Lemmatized word:  @
Synset not found
Lemmatized word:  jaden_ee
Synset not found
Lemmatized word:  😘
Synset not found
This sentence has a positive sentiment: 0.0

New comment
---------------------------------------
Sentence:  today close or open?
[('today', 'NN'), ('close', 'RB'), ('or', 'CC'), ('open', 'VB'), ('?', '.')]
Lemmatized word:  today
today  |  Synset('today.n.01')  |  <today.n.01: PosScore=0.125 NegScore=0.0>  |  the present time or age
Lemmatized word:  close
close  |  Synset('near.r.01')  |  <near.r.01: PosScore=0.0 NegScore=0.0>  |  near in time or place or relationship
Lemmatized word:  or
or  |  Synset('oregon.n.01')  |  <oregon.n.01: PosScore=0.0 NegScore=0.0>  |  a state in northwestern United States on the Pacific
Lemmatized word:  open
open  |  Synset('open.v.01')  |  <open.v.01: PosScore=0.0 NegScore=0.0>  |  cause to open or to

Sentence:  💪 💪 💪
[('💪', 'JJ'), ('💪', 'NNP'), ('💪', 'NN')]
Lemmatized word:  💪
Synset not found
Lemmatized word:  💪
Synset not found
Lemmatized word:  💪
Synset not found
This sentence has a positive sentiment: 0.0

New comment
---------------------------------------
Sentence:  hi,
[('hi', 'NN'), (',', ',')]
Lemmatized word:  hi
hi  |  Synset('hello.n.01')  |  <hello.n.01: PosScore=0.0 NegScore=0.0>  |  an expression of greeting
Lemmatized word:  ,
Synset not found
This sentence has a positive sentiment: 0.0

New comment
---------------------------------------
Sentence:  natashaseah118hi, i got urgent enquiry need inform!!!!!!!!!!!!!!!!
[('natashaseah118hi', 'NN'), (',', ','), ('i', 'NN'), ('got', 'VBD'), ('urgent', 'JJ'), ('enquiry', 'NNS'), ('need', 'VBP'), ('inform', 'NN'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.')]
Lemmatized word:  nata

Lemmatized word:  free
free  |  Synset('free.a.01')  |  <free.a.01: PosScore=0.375 NegScore=0.0>  |  able to act at will; not hampered; not under compulsion or restraint
Lemmatized word:  for
Synset not found
Lemmatized word:  all
all  |  Synset('all.a.01')  |  <all.a.01: PosScore=0.0 NegScore=0.0>  |  quantifier; used with either mass or count nouns to indicate the whole number or amount of or every one of a class
Lemmatized word:  ?
Synset not found
This sentence has a positive sentiment: 0.875
Sentence:  how do we enroll for this @reebokmalaysia event ?
[('how', 'WRB'), ('do', 'VBP'), ('we', 'PRP'), ('enroll', 'VB'), ('for', 'IN'), ('this', 'DT'), ('@', 'JJ'), ('reebokmalaysia', 'NN'), ('event', 'NN'), ('?', '.')]
Lemmatized word:  how
Synset not found
Lemmatized word:  do
do  |  Synset('make.v.01')  |  <make.v.01: PosScore=0.0 NegScore=0.0>  |  engage in
Lemmatized word:  we
Synset not found
Lemmatized word:  enroll
enroll  |  Synset('enroll.v.01')  |  <enroll.v.01: PosScore=0.0 Ne

Sentence:  hi key !
[('hi', 'NN'), ('key', 'NN'), ('!', '.')]
Lemmatized word:  hi
hi  |  Synset('hello.n.01')  |  <hello.n.01: PosScore=0.0 NegScore=0.0>  |  an expression of greeting
Lemmatized word:  key
key  |  Synset('key.n.01')  |  <key.n.01: PosScore=0.0 NegScore=0.0>  |  metal device shaped in such a way that when it is inserted into the appropriate lock the lock's mechanism can be rotated
Lemmatized word:  !
Synset not found
This sentence has a positive sentiment: 0.0
Sentence:  we are so proud of you 💜
[('we', 'PRP'), ('are', 'VBP'), ('so', 'RB'), ('proud', 'JJ'), ('of', 'IN'), ('you', 'PRP'), ('💜', 'VBP')]
Lemmatized word:  we
Synset not found
Lemmatized word:  be
be  |  Synset('be.v.01')  |  <be.v.01: PosScore=0.25 NegScore=0.125>  |  have the quality of being; (copula, used with an adjective or a predicate noun)
Lemmatized word:  so
so  |  Synset('so.r.01')  |  <so.r.01: PosScore=0.0 NegScore=0.0>  |  to a very great extent or degree
Lemmatized word:  proud
proud  |  Synse

Sentence:  wow!
[('wow', 'NN'), ('!', '.')]
Lemmatized word:  wow
wow  has a score of 0.5
Lemmatized word:  !
Synset not found
This sentence has a positive sentiment: 0.5
Sentence:  @ng_jason 만
[('@', 'JJ'), ('ng_jason', 'NN'), ('만', 'NN')]
Lemmatized word:  @
Synset not found
Lemmatized word:  ng_jason
Synset not found
Lemmatized word:  만
Synset not found
This sentence has a positive sentiment: 0.0

New comment
---------------------------------------
Sentence:  @lisz_ireth 😂😂
[('@', 'JJ'), ('lisz_ireth', 'VBZ'), ('😂😂', 'NN')]
Lemmatized word:  @
Synset not found
Lemmatized word:  lisz_ireth
Synset not found
Lemmatized word:  😂😂
Synset not found
This sentence has a positive sentiment: 0.0

New comment
---------------------------------------
Sentence:  slowly operating ..
[('slowly', 'RB'), ('operating', 'VBG'), ('..', 'NN')]
Lemmatized word:  slowly
slowly  |  Synset('slowly.r.01')  |  <slowly.r.01: PosScore=0.0 NegScore=0.0>  |  without speed (`slow' is sometimes used informally for `

Lemmatized word:  other
other  |  Synset('other.a.01')  |  <other.a.01: PosScore=0.0 NegScore=0.625>  |  not the same one or ones already mentioned or implied
Lemmatized word:  club
club  |  Synset('baseball_club.n.01')  |  <baseball_club.n.01: PosScore=0.0 NegScore=0.0>  |  a team of professional baseball players who play and travel together
Lemmatized word:  patinum
Synset not found
Lemmatized word:  member
member  |  Synset('member.n.01')  |  <member.n.01: PosScore=0.0 NegScore=0.0>  |  one of the persons who compose a social group (especially individuals who have joined and participate in a group organization)
Lemmatized word:  can
can  |  Synset('can.n.01')  |  <can.n.01: PosScore=0.0 NegScore=0.0>  |  airtight sealed metal container for food or drink or paint etc.
Lemmatized word:  only
only  |  Synset('merely.r.01')  |  <merely.r.01: PosScore=0.0 NegScore=0.0>  |  and nothing more
Lemmatized word:  access
access  |  Synset('entree.n.02')  |  <entree.n.02: PosScore=0.0 NegScore=0

Sentence:  thanks for the workout!
[('thanks', 'NNS'), ('for', 'IN'), ('the', 'DT'), ('workout', 'NN'), ('!', '.')]
Lemmatized word:  thanks
thanks  |  Synset('thanks.n.01')  |  <thanks.n.01: PosScore=0.125 NegScore=0.0>  |  an acknowledgment of appreciation
Lemmatized word:  for
Synset not found
Lemmatized word:  the
Synset not found
Lemmatized word:  workout
workout  |  Synset('exercise.n.01')  |  <exercise.n.01: PosScore=0.0 NegScore=0.0>  |  the activity of exerting your muscles in various ways to keep fit
Lemmatized word:  !
Synset not found
This sentence has a positive sentiment: 0.125
Sentence:  we had fun 😎
[('we', 'PRP'), ('had', 'VBD'), ('fun', 'VBN'), ('😎', 'NNS')]
Lemmatized word:  we
Synset not found
Lemmatized word:  have
have  |  Synset('have.v.01')  |  <have.v.01: PosScore=0.25 NegScore=0.0>  |  have or possess, either in a concrete or an abstract sense
Lemmatized word:  fun
Synset not found
Lemmatized word:  😎
Synset not found
This sentence has a positive sentiment: 0.

Lemmatized word:  it
it  |  Synset('information_technology.n.01')  |  <information_technology.n.01: PosScore=0.0 NegScore=0.0>  |  the branch of engineering that deals with the use of computers and telecommunications to retrieve and store and transmit information
Lemmatized word:  2
2  |  Synset('two.n.01')  |  <two.n.01: PosScore=0.0 NegScore=0.0>  |  the cardinal number that is the sum of one and one or a numeral representing this number
Lemmatized word:  floors.cool.cant
Synset not found
Lemmatized word:  wait
wait  |  Synset('delay.n.01')  |  <delay.n.01: PosScore=0.0 NegScore=0.0>  |  time during which some action is awaited
Lemmatized word:  for
Synset not found
Lemmatized word:  ur
Synset not found
Lemmatized word:  opening
opening  |  Synset('opening.n.01')  |  <opening.n.01: PosScore=0.0 NegScore=0.0>  |  an open or empty space in or between things
This sentence has a positive sentiment: 0.0

New comment
---------------------------------------
Sentence:  looks great.
[('looks'

Sentence:  @nczeerick16 & @ddianahttira lets join together as a teammate
[('@', 'NNP'), ('nczeerick16', 'NN'), ('&', 'CC'), ('@', 'NNP'), ('ddianahttira', 'VBZ'), ('lets', 'NNS'), ('join', 'VB'), ('together', 'RB'), ('as', 'IN'), ('a', 'DT'), ('teammate', 'NN')]
Lemmatized word:  @
Synset not found
Lemmatized word:  nczeerick16
Synset not found
Lemmatized word:  &
Synset not found
Lemmatized word:  @
Synset not found
Lemmatized word:  ddianahttira
Synset not found
Lemmatized word:  let
let  |  Synset('lashkar-e-taiba.n.01')  |  <lashkar-e-taiba.n.01: PosScore=0.0 NegScore=0.0>  |  a brutal terrorist group active in Kashmir; fights against India with the goal of restoring Islamic rule of India
Lemmatized word:  join
join  |  Synset('join.v.01')  |  <join.v.01: PosScore=0.0 NegScore=0.0>  |  become part of; become a member of a group or organization
Lemmatized word:  together
together  |  Synset('together.r.01')  |  <together.r.01: PosScore=0.0 NegScore=0.0>  |  in contact with each othe

look  |  Synset('expression.n.01')  |  <expression.n.01: PosScore=0.0 NegScore=0.0>  |  the feelings expressed on a person's face
Lemmatized word:  like
like  |  Synset('like.n.01')  |  <like.n.01: PosScore=0.125 NegScore=0.0>  |  a similar kind
Lemmatized word:  ff
Synset not found
Lemmatized word:  platinum
platinum  |  Synset('platinum.n.01')  |  <platinum.n.01: PosScore=0.0 NegScore=0.0>  |  a heavy precious metallic element; grey-white and resistant to corroding; occurs in some nickel and copper ores and is also found native in some deposits
Lemmatized word:  tower
tower  |  Synset('tower.n.01')  |  <tower.n.01: PosScore=0.0 NegScore=0.0>  |  a structure taller than its diameter; can stand alone or be attached to a larger building
Lemmatized word:  535
Synset not found
Lemmatized word:  hong
Synset not found
Lemmatized word:  kong
Synset not found
This sentence has a positive sentiment: 0.125

New comment
---------------------------------------
Sentence:  nice lounge !
[('nice', '

In [9]:
# Drop neutral sentences (score exactly 0), then report how many of the
# remaining sentences are negative and how many are positive.
non_neutral = df['Sentiment'].ne(0)
df_filtered = df[non_neutral]
print(int(df_filtered['Sentiment'].lt(0).sum()))
print(int(df_filtered['Sentiment'].gt(0).sum()))

14
40


In [10]:
# Show the sentences whose summed score is strictly positive.
positive_mask = df_filtered['Sentiment'] > 0
df_filtered[positive_mask]

Unnamed: 0,Post,Comment,Topic,Sentiment,Source
5,Reward yourself with a refreshing drink and re...,nice lounge !,lounge,0.875,Instagram Comment Fitness First
6,Reward yourself with a refreshing drink and re...,looks like ff platinum tower 535 hong kong,"looks,platinum,tower",0.125,Instagram Comment Fitness First
7,Looking for a more challenging workout? Try th...,best,,0.75,Instagram Comment Fitness First
11,Get your workout on with Aqua Combat!\n\n#fitn...,having the best day for me!!,day,1.0,Instagram Comment Fitness First
17,Early Bird Tickets for Fit in The City are now...,@zue2020 ok sure why not.. when this zue,,0.125,Instagram Comment Fitness First
20,Early Bird Tickets for Fit in The City are now...,hiii you 💥 you could’ve certainly manager our ...,"manager,line,knowledge,ig",0.25,Instagram Comment Fitness First
23,Here's a sneak peek of what's coming your way ...,cant wait to get my toiletries bag hahahahaha,"wait,toiletries",0.25,Instagram Comment Fitness First
25,Here's a sneak peek of what's coming your way ...,btw it's good to serve members around there.,members,0.75,Instagram Comment Fitness First
30,Breathing in relaxation. Breathing out tension...,what’s this beautiful place called 💫🦋💕🌸,"s,place",1.0,Instagram Comment Fitness First
32,Spartan Beast 2017 is approaching soon and it’...,we manufacture custom ocr and outdoor race app...,"race,apparel,chance",1.375,Instagram Comment Fitness First


In [11]:
# Show the sentences whose summed score is strictly negative.
negative_mask = df_filtered['Sentiment'] < 0
df_filtered[negative_mask]

Unnamed: 0,Post,Comment,Topic,Sentiment,Source
19,Early Bird Tickets for Fit in The City are now...,act those instantly,act,-0.375,Instagram Comment Fitness First
26,Here's a sneak peek of what's coming your way ...,very bad traffic around there.,traffic,-0.625,Instagram Comment Fitness First
31,Early morning awakenings.\n\nEnergising Mornin...,send us a direct message if you are looking fo...,message,-0.125,Instagram Comment Fitness First
42,"Our Fitness First team - Ahmad, Mukhriz, Faiz ...",missed this great session with the national at...,"session,athletes,class,leisure",-0.25,Instagram Comment Fitness First
51,Tomorrow is the day! Just one day left to take...,can somebody explain why other clubs patinum m...,"somebody,clubs,members,access,club,november",-0.375,Instagram Comment Fitness First
53,We are about to unleash fresh moves and hot ne...,@hatayap hahaha ... not going 😁,,-0.625,Instagram Comment Fitness First
58,"A desire, a passion, a champion. Stay tuned fo...",pick up stop;),stop,-0.75,Instagram Comment Fitness First
76,"Get your groove on with House, Hip Hop and Con...","@fettyzurando haha not sk ii, it’s skm!",ii,-0.625,Instagram Comment Fitness First
85,Be sure to collect your Entry Pack for Fit in ...,tshirt never ada jual ke?,,-0.625,Instagram Comment Fitness First
90,@henrycedeno turning up the heat up at Fitness...,omg i miss this!,,-0.25,Instagram Comment Fitness First


In [12]:
# Write the non-neutral rows to disk, one JSON object per row.
df_filtered.to_json(
    'instagram_fitnessfirstsentiment.json',
    orient='records',
)