# Emoji Analysis

In [6]:
import math

import emojis
import nltk
import numpy as np
import pandas as pd
from emoji import UNICODE_EMOJI, emoji_lis

In [7]:
def is_emoji(s):
    """Tell whether ``s`` contains exactly one emoji occurrence.

    Every entry obtained by iterating ``UNICODE_EMOJI`` is counted inside
    ``s`` with ``str.count``; the running total must end up at exactly one.

    Parameters
    ----------
    s : str
        Text to inspect.

    Returns
    -------
    bool
        ``True`` when the total number of matches is one, ``False`` when it
        is zero or greater than one.
    """
    # NOTE(review): assumes iterating UNICODE_EMOJI yields emoji strings — confirm
    # against the installed `emoji` version.
    hits = 0
    for symbol in UNICODE_EMOJI:
        hits += s.count(symbol)
        if hits > 1:
            # More than one match already rules the string out; stop scanning.
            break
    return hits == 1

In [8]:
# Data source:
# Kralj Novak, Petra; Smailović, Jasmina; Sluban, Borut and Mozetič, Igor, 2015, Emoji Sentiment Ranking 1.0, Slovenian language resource repository CLARIN.SI, http://hdl.handle.net/11356/1048.
# Load the emoji sentiment table and preview its first rows.
df = pd.read_csv("Data/Emoji_Sentiment_Data_v1.0.csv")
df.head()

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons


In [9]:
# Summarise the table: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 969 entries, 0 to 968
Data columns (total 9 columns):
Emoji                969 non-null object
Unicode codepoint    969 non-null object
Occurrences          969 non-null int64
Position             969 non-null float64
Negative             969 non-null int64
Neutral              969 non-null int64
Positive             969 non-null int64
Unicode name         969 non-null object
Unicode block        969 non-null object
dtypes: float64(1), int64(4), object(4)
memory usage: 68.2+ KB


In [10]:
# Net sentiment per emoji: (positive - negative) divided by the total number of
# labelled occurrences (negative + positive + neutral).
df['score'] = (
    df['Positive'].sub(df['Negative'])
    .div(df[['Negative', 'Positive', 'Neutral']].sum(axis=1))
)

In [40]:
# Preview the first 15 rows of the emoji table.
df.head(15)

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block,score
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,0.220968
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats,0.746087
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols,0.657615
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,0.677937
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,-0.093377
5,😘,0x1f618,3648,0.85448,193,702,2753,FACE THROWING A KISS,Emoticons,0.701754
6,😊,0x1f60a,3186,0.813302,189,754,2243,SMILING FACE WITH SMILING EYES,Emoticons,0.644696
7,👌,0x1f44c,2925,0.805223,274,728,1923,OK HAND SIGN,Miscellaneous Symbols and Pictographs,0.563761
8,💕,0x1f495,2400,0.765726,99,683,1618,TWO HEARTS,Miscellaneous Symbols and Pictographs,0.632917
9,👏,0x1f44f,2336,0.78713,243,634,1459,CLAPPING HANDS SIGN,Miscellaneous Symbols and Pictographs,0.520548


In [53]:
def get_emoji_sentiments(df, text, i=None):
    """Return a squashed sentiment score for the emojis found in ``text``.

    Every emoji occurrence reported by ``emoji_lis(text)`` is looked up in
    ``df['Emoji']``; the matching ``df['score']`` values are summed and the
    sum is passed through ``np.tanh``.

    Parameters
    ----------
    df : pandas.DataFrame
        Lookup table with an ``'Emoji'`` column and a numeric ``'score'``
        column.
    text : str
        Text to scan for emojis.
    i : optional
        Unused. Kept for backward compatibility and made optional so that
        both two- and three-argument calls work.

    Returns
    -------
    numpy.float64
        ``np.tanh`` of the summed scores; ``0.0`` when no emoji in ``text``
        has an entry in ``df``.
    """
    total = 0.0
    for found in emoji_lis(text):
        # Read the emoji by key rather than by position in the dict's values.
        matches = df.loc[df['Emoji'] == found['emoji'], 'score']
        if matches.empty:
            # Unknown emoji: contributes nothing. Skipping (instead of
            # resetting the sum) keeps scores accumulated so far.
            continue
        # If the table lists the emoji more than once, use the first entry.
        total += float(matches.iloc[0])
    return np.tanh(total)

In [54]:
# Try the scorer on a single sentence that contains one emoji.
text = 'Python is 😭'
get_emoji_sentiments(df, text)

😭
-0.09337676438653637


-0.09310631672996145

In [25]:
# Load the labelled comments; `names=` assigns the column labels
# 'Comments' and 'Label' to the file's columns.
data = pd.read_csv('Data/labeled.csv', names=['Comments', 'Label'])
data.head()

Unnamed: 0,Comments,Label
0,Hnya mungkin iman nya masih goyah dan alasan t...,-0.56118
1,"Kok pada ngributin hidup orang lain, mau pinda...",0.01286
2,Isi suratnya adalah ....\r\nRina nose tidak pe...,0.0
3,"hmmmm sayang nya rina kalau ampe pindah,ibarat...",0.014891
4,👍👍👍👍,0.975


In [16]:
# The text column is named 'Comments' (see the `names=` list used when loading
# the CSV); 'Comment' would raise KeyError.
comments = data['Comments']

In [17]:
# Print the tanh-squashed emoji sentiment of every comment, one value per line.
for comment in comments:
    print(get_emoji_sentiments(df, comment))

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.9999416568696861
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.9962052023468827
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.47933022218387067
0.0
0.0
0.0
0.0
0.47933022218387067
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.9999998890861419
0.0
0.0
0.0
0.0
0.47933022218387067
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [26]:
from string import punctuation
def remove_punctuation(s):
    """Return ``s`` with every character found in ``string.punctuation`` removed.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        The characters of ``s`` that are not punctuation, in their original
        order. Characters outside ``string.punctuation`` (letters, digits,
        whitespace, emojis, ...) are kept unchanged.
    """
    # Build the lookup set once per call instead of once per character.
    banned = frozenset(punctuation)
    return ''.join(ch for ch in s if ch not in banned)

# Add a 'cleaned' column: each comment with punctuation characters removed.
data['cleaned'] = data['Comments'].apply(remove_punctuation)

def strip_punctuation(s):
    """Drop every character of ``s`` that appears in ``string.punctuation``.

    Parameters
    ----------
    s : str
        Text to filter.

    Returns
    -------
    str
        The remaining characters joined back together in their original order.
    """
    kept = filter(lambda ch: ch not in punctuation, s)
    return ''.join(kept)

# Split each cleaned comment into a list of word tokens. Applying the tokenizer
# to the 'cleaned' column directly avoids building a row object per record.
data['tokenized_sents'] = data['cleaned'].apply(nltk.word_tokenize)

print(data['tokenized_sents'])

0       [Hnya, mungkin, iman, nya, masih, goyah, dan, ...
1       [Kok, pada, ngributin, hidup, orang, lain, mau...
2       [Isi, suratnya, adalah, Rina, nose, tidak, per...
3       [hmmmm, sayang, nya, rina, kalau, ampe, pindah...
4                                                  [👍👍👍👍]
5                                         [Azka, ganteng]
6       [Yes, om, Deddy, you, have, built, the, monste...
7       [dapat, suami, bule, dia, jadi, pindah, agama,...
8           [apa, yang, di, madsud, air, kejujuran, rina]
9                                [Yg, nonton, 2019, like]
10      [Bang, sebenar, nya, apa, isi, surat, rina, bang]
11                      [Salam, dari, palangkaraya, bang]
12                            [Gaya, mu, itu, lohhhwkwkw]
13           [Jd, bingung, apa, isi, surat, rina, nose😱😲]
14      [Klo, ngedidik, anak, jjr, gwe, sama, dgn, Om,...
15                             [Om, kapan, punya, rambut]
16                               [Isi, surat, mulai, 630]
17            

In [27]:
from nltk.tag import CRFTagger
import pycrfsuite
# nltk.download('tag')

# Instantiate through the class name imported in this cell, so the cell does
# not depend on a separate `import nltk`.
ct = CRFTagger()
print(ct)

<nltk.tag.crf.CRFTagger object at 0x00000250FB949390>


In [39]:
# Point the tagger at the CRF model file and tag every tokenised comment.
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
hasil = ct.tag_sents(data['tokenized_sents'])  # "hasil" is Indonesian for "result"

# Spot-check: print each (token, tag) pair whose token is 'sedikit'.
for item in hasil:
    for tag in item:
        if tag[0] != 'sedikit':
            continue
        print(tag)

('sedikit', 'CD')
('sedikit', 'CD')
