In [73]:
from pandas import read_csv
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import nltk

In [74]:
#download tokenizer
path=path='D:/misc/Projects/Python/NLP/misc'
nltk.data.path.append(path)

In [11]:
table=read_csv('emoji_table.csv')
table.head()

Unnamed: 0,unicode,emoji,name,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,1F308,🌈,rainbow,0.0,0.28,0.0,0.0,0.69,0.06,0.22,0.33
1,1F319,🌙,crescent moon,0.0,0.31,0.0,0.0,0.25,0.0,0.06,0.25
2,1F31A,🌚,new moon face,0.06,0.08,0.17,0.06,0.42,0.19,0.06,0.11
3,1F31E,🌞,sun with face,0.0,0.22,0.0,0.0,0.78,0.0,0.11,0.22
4,1F31F,🌟,glowing star,0.0,0.28,0.0,0.0,0.53,0.0,0.25,0.31


## 1)

In [12]:
table[['unicode','emoji','name']]

Unnamed: 0,unicode,emoji,name
0,1F308,🌈,rainbow
1,1F319,🌙,crescent moon
2,1F31A,🌚,new moon face
3,1F31E,🌞,sun with face
4,1F31F,🌟,glowing star
...,...,...,...
145,2757,❗,exclamation mark
146,2764,❤,red heart
147,27A1,➡,right arrow
148,2B05,⬅,left arrow


## 2)

In [47]:
emotions=list(table.columns[-8:])
emoticons=list(table['name'])
off_scores=np.array(table[emotions])

In [48]:
def cosine_similarity(a,b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

def calculate_model_scores(emotions, emoticons, embed_fun):
    scores=np.zeros([len(emoticons),len(emotions)])
    emotions_vec=[embed_fun(emo) for emo in emotions]
    for i,emoticon in enumerate(emoticons):
        emoticon_vec=embed_fun(emoticon)
        scores[i]=[cosine_similarity(emoticon_vec,emovec) for emovec in emotions_vec]
    return scores

In [71]:
w2v_embed_fun=lambda x: np.mean(np.array([np.random.random(5) for w in x.split()]),axis=0)
word2vec_scores=calculate_model_scores(emotions,emoticons, w2v_embed_fun)

In [72]:
w2v_coeff=[pearsonr(off_scores[:,i],word2vec_scores[:,i]) for i in range(8)]
w2v_coeff

[PearsonRResult(statistic=0.1381102968952918, pvalue=0.09190458397100414),
 PearsonRResult(statistic=-0.08733034694870316, pvalue=0.28793697786309996),
 PearsonRResult(statistic=0.05910096655113584, pvalue=0.47250505349306904),
 PearsonRResult(statistic=-0.07051026571681135, pvalue=0.3912158400438199),
 PearsonRResult(statistic=0.13800218169957065, pvalue=0.092161705171599),
 PearsonRResult(statistic=-0.10110285095433223, pvalue=0.21830398404401943),
 PearsonRResult(statistic=-0.09236693194870493, pvalue=0.26092781318492203),
 PearsonRResult(statistic=0.15224123818775834, pvalue=0.0629102761036149)]

## 3)

In [None]:
gl_embed_fun=lambda x: np.random.random(5)
glove_scores=calculate_model_scores(emotions,emoticons, gl_embed_fun)

In [None]:
gl_coeff=[pearsonr(off_scores[:,i],glove_scores[:,i]) for i in range(8)]

In [None]:
d2v_embed_fun=lambda x: np.random.random(5)
doc2vec_scores=calculate_model_scores(emotions,emoticons, d2v_embed_fun)

In [None]:
d2v_coeff=[pearsonr(off_scores[:,i],doc2vec_scores[:,i]) for i in range(8)]

In [None]:
brt_embed_fun=lambda x: np.random.random(5)
bert_scores=calculate_model_scores(emotions,emoticons, brt_embed_fun)

In [None]:
brt_coeff=[pearsonr(off_scores[:,i],bert_scores[:,i]) for i in range(8)]

## 4)

In [81]:
from collections import defaultdict
#custom dictionary class to conveniently store word frequency
class inverted_index_dict:
    def __init__(self) -> None:
        self.dict=defaultdict()
        
    def add_item(self,token):
        if self.dict.__contains__(token):
            self.dict[token]+=1
        else:
            self.dict[token]=1
    

In [170]:
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(["'s","n't",'✅',"\U0001F621","\U0001F624","\U0001F602",
             "\U0001F62D","\U0001F973","\u263A", '🎉','🏆','im','dont','youre'])
punctuation=['"','!','.',',',"'",'(',')',';','``',"''",'?','_',':','-','@','’',
             '#','&','£','$','%','”','*','“','☺️','😭','😂']

def get_mfrequent_terms(path):
    word_freq=inverted_index_dict()
    text=read_csv(path)
    text=list(text['Text'])
    text=''.join(txt for txt in text).lower()
    for punct in punctuation:
        text=text.replace(punct,'')
    for word in nltk.tokenize.word_tokenize(text):
        if word not in stop_words:
            word_freq.add_item(word)
    word_freq_list=list(word_freq.dict.items())
    word_freq_list=sorted(word_freq_list,key=lambda x: x[1], reverse=True)
    return word_freq_list[:30]

In [171]:
paths=['tweets/enraged_face.csv','tweets/face_with_steam_from_nose.csv','tweets/face_with_tears_of_joy.csv',
       'tweets/loudly_crying_face.csv','tweets/partying_face.csv','tweets/smiling_face.csv']
tweet_emoticons=['enragedface', 'facewithsteam','tearsfromjoy','loudcrying','partyingface','smiley']
mfreq=[get_mfrequent_terms(path) for path in paths]
mfreq_w=[[x[0] for x in p] for p in mfreq]

In [172]:
for i in range(6): 
    print(f'{tweet_emoticons[i]}:\n{mfreq[i]}')

enragedface:
[('like', 1488), ('people', 1215), ('get', 1124), ('amp', 1100), ('one', 920), ('cant', 758), ('know', 688), ('even', 677), ('go', 665), ('us', 660), ('time', 647), ('would', 646), ('want', 609), ('see', 583), ('never', 579), ('need', 564), ('got', 524), ('still', 519), ('thats', 517), ('u', 514), ('stop', 489), ('right', 487), ('back', 476), ('make', 474), ('going', 465), ('really', 447), ('think', 440), ('say', 423), ('way', 392), ('take', 389)]
facewithsteam:
[('like', 1346), ('get', 1220), ('one', 988), ('got', 763), ('time', 752), ('know', 701), ('go', 696), ('na', 686), ('back', 668), ('see', 632), ('cant', 607), ('need', 578), ('people', 574), ('amp', 572), ('even', 554), ('love', 550), ('want', 508), ('day', 507), ('good', 501), ('better', 490), ('make', 481), ('thats', 476), ('still', 469), ('right', 438), ('never', 437), ('gon', 435), ('going', 430), ('really', 416), ('shit', 414), ('man', 412)]
tearsfromjoy:
[('like', 1583), ('one', 928), ('get', 907), ('know', 

the stop word removal was not effective for expressions like "youre" instead of "you're", "im" instead of "i'm" and so on, we had to resort to manual intervention

In [197]:
words=set([pr[0] for i in range(6) for pr in mfreq[i]])
inc_mat=matrix = [[0 for _ in range(7)] for _ in range(len(words)+1)]
inc_mat[0][0]='name'
inc_mat[0][1:]=tweet_emoticons
for i,word in enumerate(words):
    inc_mat[i+1][0]=word
    for j in range(6):
        inc_mat[i+1][j+1]=word in mfreq_w[j]
inc_mat

[['name',
  'enragedface',
  'facewithsteam',
  'tearsfromjoy',
  'loudcrying',
  'partyingface',
  'smiley'],
 ['people', True, True, True, True, False, False],
 ['denet', False, False, False, False, True, False],
 ['way', True, False, False, True, False, False],
 ['year', False, False, False, False, True, False],
 ['may', False, False, False, False, True, False],
 ['birthday', False, False, False, False, True, False],
 ['stop', True, False, False, False, False, False],
 ['great', False, False, False, False, True, True],
 ['love', False, True, True, True, True, True],
 ['lol', False, False, True, False, False, False],
 ['na', False, True, True, True, False, False],
 ['much', False, False, False, True, True, True],
 ['know', True, True, True, True, False, True],
 ['today', False, False, False, False, True, True],
 ['good', False, True, True, True, True, True],
 ['also', False, False, False, False, False, True],
 ['arsenal', False, False, True, False, False, False],
 ['us', True, False,

## 5)

In [None]:
'fock'

## 6)

In [None]:
'fock'

## 7)