#### **This notebook manually test the language translation**

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import os

import importlib

import config.config as config_hp

#### **Load data**

In [4]:
importlib.reload(config_hp)

config = config_hp.config()
balanced = config['BALANCED']

positive_conv = balanced['balanced_pos_conversation']
df_pos = pd.read_pickle(positive_conv)

negative_conv = balanced['balanced_neg_conversation']
df_neg = pd.read_pickle(negative_conv)

df = df_pos.append(df_neg)

  df = df_pos.append(df_neg)


#### **Remove non language tweets**

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2673091 entries, 0 to 2673090
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   tweet_text       object 
 1   conversation_id  object 
 2   replier_tweetid  object 
 3   replier_userid   object 
 4   poster_userid    object 
 5   poster_tweetid   object 
 6   tweet_time       object 
 7   tweet_language   object 
 8   replier_label    int64  
 9   year             object 
 10  campaign         object 
 11  tweet_label      int64  
 12  tweet_time_year  object 
 13  common           float64
 14  id               object 
 15  username         object 
dtypes: float64(1), int64(2), object(13)
memory usage: 346.7+ MB


In [71]:
importlib.reload(twitter_hp)

df_rem = twitter_hp.remove_non_language(df)

In [72]:
df_rem['tweet_language'].head()

0    tr
1    tr
2    tr
3    tr
5    tr
Name: tweet_language, dtype: object

In [73]:
total_data = len(df)
total_remaining = len(df_rem)
df_eng = df_rem.loc[df_rem['tweet_language'] == 'en']
eng_replies = len(df_eng)

print('Total data : ', total_data)
print('Total remaining :', total_remaining)
print('Total english replies :', eng_replies)
print('Total other languages :', total_remaining - eng_replies)

Total data :  2673091
Total remaining : 2425668
Total english replies : 85560
Total other languages : 2340108


In [74]:
df_not_eng = df_rem.loc[~(df_rem['tweet_language'] == 'en')]

In [75]:
df_not_eng['tweet_language'].nunique()

54

In [10]:
df_not_eng['tweet_language'].unique()

array(['tr', 'ar', 'es', 'it', 'in', 'sl', 'sr', 'tl', 'pl', 'lv', 'pt',
       'sv', 'is', 'ru', 'et', 'cs', 'hu', 'bg', 'ht', 'lt', 'eu', 'nl',
       'hi', 'uk', 'no', 'fi', 'da', 'zh', 'fr', 'ca', 'fa', 'ur', 'ro',
       'de', 'cy', 'ja', 'th', 'vi', 'bn', 'ko', 'el', 'ne', 'ta', 'ml',
       'ckb', 'hr', 0, 'ps', 'bs', 'iw', 'sd', 'sk', 'art', 'am'],
      dtype=object)

In [11]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

df_not_eng.loc[df_not_eng['tweet_language'] == 0]['tweet_text']

  from IPython.core.display import display, HTML


1227488    @BorkoStef @moon13elup laze se naveloiko,samo ...
2633183    @SamsungRu Конкурсы - это хорошо! Давно пора т...
2633184    @SamsungRu Я девочка! Я не хочу ничего решать!...
2633185    @SamsungRu Пожалейте бедного студента! Подарит...
2633186       @SamsungRu а что нужно сделать то? #смартфон4D
2633187    @SamsungRu Конкурсы - это хорошо! Давно пора т...
2633188    @SamsungRu 8 марта ведь недавно было, нужно де...
Name: tweet_text, dtype: object

#### **Google Cloud Translation API:**
$20 per 1 million characters for translation, spaces are included, empty query charged for one character

#### **Microsoft Translator Text API:**

#### **MBart and MBart50**

##### **Language covered**

In [12]:
mbart = {
'Arabic' : 'ar_AR', 
'Czech' : 'cs_CZ', 
'German' : 'de_DE',
'English': 'en_XX', 
'Spanish' : 'es_XX', 
'Estonian' : 'et_EE', 
'Finnish' : 'fi_FI', 
'French' : 'fr_XX', 
'Gujarati' : 'gu_IN', 
'Hindi': 'hi_IN', 
'Italian': 'it_IT', 
'Japanese' : 'ja_XX', 
'Kazakh' : 'kk_KZ', 
'Korean' : 'ko_KR', 
'Lithuanian' : 'lt_LT', 
'Latvian' : 'lv_LV', 
'Burmese': 'my_MM', 
'Nepali' : 'ne_NP', 
'Dutch' : 'nl_XX', 
'Romanian': 'ro_RO', 
'Russian' : 'ru_RU', 
'Sinhala' : 'si_LK', 
'Turkish' : 'tr_TR', 
'Vietnamese' : 'vi_VN', 
'Chinese': 'zh_CN', 
'Afrikaans' : 'af_ZA', 
'Azerbaijani' : 'az_AZ',
'Bengali' : 'bn_IN', 
'Persian' : 'fa_IR', 
'Hebrew' : 'he_IL', 
'Croatian' : 'hr_HR', 
'Indonesian' : 'id_ID', 
'Georgian': 'ka_GE', 
'Khmer' : 'km_KH',
'Macedonian' : 'mk_MK', 
'Malayalam' : 'ml_IN', 
'Mongolian' : 'mn_MN', 
'Marathi' : 'mr_IN', 
'Polish' : 'pl_PL', 
'Pashto' : 'ps_AF',
'Portuguese' : 'pt_XX', 
'Swedish': 'sv_SE', 
'Swahili' : 'sw_KE', 
'Tamil' : 'ta_IN', 
'Telugu' : 'te_IN',
'Thai' : 'th_TH', 
'Tagalog' : 'tl_XX', 
'Ukrainian' : 'uk_UA', 
'Urdu' : 'ur_PK', 
'Xhosa' : 'xh_ZA',
'Galician' :'gl_ES', 
'Slovene' : 'sl_SI'
}

#### **Twitter language code**

In [13]:
got_codes = ['tr', 'ar', 'es', 'it', 'in', 'sl', 'sr', 'tl', 'pl', 'lv', 'pt',
           'sv', 'is', 'ru', 'et', 'cs', 'hu', 'bg', 'ht', 'lt', 'eu', 'nl',
           'hi', 'uk', 'no', 'fi', 'da', 'zh', 'fr', 'ca', 'fa', 'ur', 'ro',
           'de', 'cy', 'ja', 'th', 'vi', 'bn', 'ko', 'el', 'ne', 'ta', 'ml',
           'ckb', 'hr', 0, 'ps', 'bs', 'iw', 'sd', 'sk', 'art', 'am', 'en']

In [14]:
#34 languages

twitter_codes_available ={
'English': 'en',
'Arabic':'ar',
'Bengali':'bn',
'Czech':'cs',
'Danish':'da',
'German':'de',
'Greek':'el',
'Spanish':'es',
'Persian':'fa',
'Finnish':'fi',
'Tagalog':'fil',
'French':'fr',
'Hebrew':'he',
'Hindi':'hi',
'Hungarian':'hu',
'Indonesian':'id',
'Italian':'it',
'Japanese':'ja',
'Korean':'ko',
'Malay':'sa',
'Dutch':'nl',
'Norwegian':'no',
'Polish':'pl',
'Portuguese':'pt',
'Romanian':'ro',
'Russian':'ru',
'Swedish':'sv',
'Thai':'th',
'Turkish':'tr',
'Ukrainian':'uk',
'Urdu':'ur',
'Vietnamese':'vi',
'Chinese (Simplified)':'zh-cn',
'Chinese (Traditional)': 'zh-tw',
'Chinese': 'zh', 
    
##### got from ChatGPT
'Slovak': 'sk',
'Tamil': 'ta',
'Slovenian': 'sl',
'Latvian': 'lv',
'Estonian': 'et',
'Icelandic': 'is',
'Hebrew': 'iw',
'Welsh': 'cy',
'Croatian': 'hr',
'Pashto': 'ps',
'Kurdish': 'ckb',
'Haitian Creole': 'ht',
'Basque': 'eu',
'Bulgarian': 'bg',
'Catalan': 'ca',
'Bosnian': 'bs',
'Tagalog': 'tl',
'Amharic': 'am',
'Sindhi': 'sd',
'Nepali': 'ne',
'Malayalam': 'ml',
'Serbian': 'sr',
'Lithuanian': 'lt',
'Indonesian': 'in',
'Artificial (used for artificially generated text)': 'art',
}

In [15]:
values = set(got_codes) - set(twitter_codes_available.values())
print('Not available : ', values)

not_available_country = set(twitter_codes_available.keys()) - set(mbart.keys())

print('Country not available :', not_available_country)
print('Total not available :', len(not_available_country))

Not available :  {0}
Country not available : {'Catalan', 'Serbian', 'Norwegian', 'Bulgarian', 'Hungarian', 'Welsh', 'Kurdish', 'Haitian Creole', 'Bosnian', 'Greek', 'Slovak', 'Malay', 'Basque', 'Artificial (used for artificially generated text)', 'Sindhi', 'Chinese (Simplified)', 'Chinese (Traditional)', 'Icelandic', 'Amharic', 'Danish', 'Slovenian'}
Total not available : 21


In [16]:
df_rem.loc[df_rem['tweet_language'] == 'sd']
df_rem.loc[df_rem['tweet_language'] == 'ps']

Unnamed: 0,tweet_text,conversation_id,replier_tweetid,replier_userid,poster_userid,poster_tweetid,tweet_time,tweet_language,replier_label,year,campaign,tweet_label,tweet_time_year,common,id,username
1312085,@AnwarGargash د عربي اماراتو د بهرنیو چارو وزي...,1109043885567807488,1109358953694224384,1107345973070753792,348378205,1109043885567807488,2019-03-23 07:39:20+00:00,ps,0,,,1,2019-03-23,,348378205,AnwarGargash
1312092,@AnwarGargash هو تاسو ته نه ښايي چې د محاربو ک...,1109043885567807488,1109317763519045634,918326626299113472,348378205,1109043885567807488,2019-03-23 04:55:39+00:00,ps,0,,,1,2019-03-23,,348378205,AnwarGargash
1748073,@javerias پاکستان آسیا کې یو مهم هېواد دی او د...,1097121736557060096,1097229325198413824,406852778,69807765,1097121736557060096,2019-02-17 20:20:31+00:00,ps,0,,,1,2019-02-17,,69807765,javerias
766105,@Rawaak ایا پوهېدئ چې لومړنی پوهنتون یوې مسلما...,1450198393129668616,1450407708235124742,419385731,43307251,1450198393129668616,2021-10-19 10:25:19+00:00,ps,0,,,0,2021-10-19,1.0,43307251,Rawaak
1095192,@abqatar الله ج د جنت الفرودس کښې ځاي ورکړي,1510288739171553288,1510575682430115846,1486124290391093249,417870919,1510288739171553288,2022-04-03 11:11:22+00:00,ps,0,,,0,2022-04-03,32.0,417870919,abqatar
1441170,@Bnt_mohammed77 ⚘ـ❃ـ❃ُ⚘♡♡✔ 🌹🌹\nاللـ ـ ـ ــﮪ يس...,1002779044092497921,1004157064061571072,979971489016446977,1947507715,1002779044092497921,2018-06-06 00:24:35+00:00,ps,0,,,0,2018-06-06,2.0,1947507715,Bnt_mohammed77
2484281,@TurkiShalhoub انشاء الله چي نور طوفانونو به ر...,1581734724799586304,1581738912892223489,1570557853319954434,3385869567,1581734724799586304,2022-10-16 20:08:38+00:00,ps,0,,,0,2022-10-16,74.0,3385869567,TurkiShalhoub


#### **MBart-50 for 50 languages**

In [13]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# translate Hindi to French
tokenizer.src_lang = "hi_IN"
encoded_hi = tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria."

# translate Arabic to English
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(article_ar, return_tensors="pt")
generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "The Secretary-General of the United Nations says there is no military solution in Syria."

  from .autonotebook import tqdm as notebook_tqdm
2023-05-01 16:19:14.316657: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


['The Secretary-General of the United Nations says there is no military solution in Syria.']

In [1]:
# encoded_ar.shape()

#### **LaBSE- Language-agnostic BERT sentence embedding model supporting 109 languages.**

From Google <br />

@misc{feng2020languageagnostic, <br />
      title={Language-agnostic BERT Sentence Embedding}, <br />
      author={Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang}, <br />
      year={2020}, <br />
      eprint={2007.01852},<br />
      archivePrefix={arXiv},<br />
      primaryClass={cs.CL}<br />
}<br />
@misc{feng2020languageagnostic, <br />
      title={Language-agnostic BERT Sentence Embedding},<br />
      author={Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang},<br />
      year={2020},<br />
      eprint={2007.01852},<br />
      archivePrefix={arXiv},<br />
      primaryClass={cs.CL}<br />
}<br />


##### **Languages supported in LaBSE**

In [17]:
labse = {
    'af': 'AFRIKAANS',
    'ht': 'HAITIAN CREOLE',
    'pt': 'PORTUGUESE',
    'am': 'AMHARIC',
    'hu': 'HUNGARIAN',
    'ro': 'ROMANIAN',
    'ar': 'ARABIC',
    'hy': 'ARMENIAN',
    'ru': 'RUSSIAN',
    'as': 'ASSAMESE',
    'id': 'INDONESIAN',
    'rw': 'KINYARWANDA',
    'az': 'AZERBAIJANI',
    'ig': 'IGBO',
    'si': 'SINHALESE',
    'be': 'BELARUSIAN',
    'is': 'ICELANDIC',
    'sk': 'SLOVAK',
    'bg': 'BULGARIAN',
    'it': 'ITALIAN',
    'sl': 'SLOVENIAN',
    'bn': 'BENGALI',
    'ja': 'JAPANESE',
    'sm': 'SAMOAN',
    'bo': 'TIBETAN',
    'jv': 'JAVANESE',
    'sn': 'SHONA',
    'bs': 'BOSNIAN',
    'ka': 'GEORGIAN',
    'so': 'SOMALI',
    'ca': 'CATALAN',
    'kk': 'KAZAKH',
    'sq': 'ALBANIAN',
    'ceb': 'CEBUANO',
    'km': 'KHMER',
    'sr': 'SERBIAN',
    'co': 'CORSICAN',
    'kn': 'KANNADA',
    'st': 'SESOTHO',
    'cs': 'CZECH',
    'ko': 'KOREAN',
    'su': 'SUNDANESE',
    'cy': 'WELSH',
    'ku': 'KURDISH',
    'sv': 'SWEDISH',
    'da': 'DANISH',
    'ky': 'KYRGYZ',
    'sw': 'SWAHILI',
    'de': 'GERMAN',
    'la': 'LATIN',
    'ta': 'TAMIL',
    'el': 'GREEK',
    'lb': 'LUXEMBOURGISH',
    'te': 'TELUGU',
    'en': 'ENGLISH',
    'lo': 'LAOTHIAN',
    'tg': 'TAJIK',
    'eo': 'ESPERANTO',
    'lt': 'LITHUANIAN',
    'th': 'THAI',
    'es': 'SPANISH',
    'lv': 'LATVIAN',
    'tk': 'TURKMEN',
    'et': 'ESTONIAN',
    'mg': 'MALAGASY',
    'tl': 'TAGALOG',
    'eu': 'BASQUE',
    'mi': 'MAORI',
    'tr': 'TURKISH',
    'fa': 'PERSIAN',
    'mk': 'MACEDONIAN',
    'tt': 'TATAR',
    'fi': 'FINNISH',
    'ml': 'MALAYALAM',
    'ug': 'UIGHUR',
    'fr': 'FRENCH',
    'mn': 'MONGOLIAN',
    'uk': 'UKRAINIAN',
    'fy': 'FRISIAN',
    'mr' : 'MARATHI',
    'ur' : 'URDU',
    'ga' : 'IRISH',
    'ms' : 'MALAY',
    'uz' : 'UZBEK',
    'gd' : 'SCOTS_GAELIC',
    'mt' : 'MALTESE',
    'vi' : 'VIETNAMESE',
    'gl' : 'GALICIAN',
    'my' : 'BURMESE',
    'wo' : 'WOLOF',
    'gu' : 'GUJARATI',
    'ne' : 'NEPALI',
    'xh' : 'XHOSA',
    'ha' : 'HAUSA',
    'nl' : 'DUTCH',
    'yi' : 'YIDDISH',
    'haw': 'HAWAIIAN',
    'no' : 'NORWEGIAN',
    'yo' : 'YORUBA',
    'he' : 'HEBREW',
    'ny' : 'NYANJA',
    'zh' : 'CHINESE',
    'hi' : 'HINDI',
    'or' : 'ORIYA',
    'zu' : 'ZULU',
    'hmn': 'HMONG',
    'pa' : 'PUNJABI',
    'hr' : 'CROATIAN',
    'pl' : 'POLISH',
}

In [18]:
len(labse)

109

In [19]:
twitter_code = list(set(twitter_codes_available.values()))
twitter_code = [x.lower() for x in twitter_code]
labse_code = list(set(labse.keys()))
labse_code = [x.lower() for x in labse_code]

values = set(twitter_code) - set(labse_code)
print('Not available : ', values)

twitter_countries = set(twitter_codes_available.keys())

twitter_countries = [x.lower() for x in twitter_countries]

labse_countries = set(labse.values())
labse_countries = [x.lower() for x in labse_countries]
not_available_country =  set(twitter_countries) - set(labse_countries)

print('\n Country not available : \n', not_available_country)
print('\n Total not available :', len(not_available_country))

Not available :  {'in', 'art', 'zh-tw', 'zh-cn', 'sa', 'ckb', 'iw', 'ps', 'sd'}

 Country not available : 
 {'pashto', 'chinese (traditional)', 'artificial (used for artificially generated text)', 'chinese (simplified)', 'sindhi'}

 Total not available : 5


##### **Example use of labse**

In [25]:
import torch
from transformers import BertModel, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
model = BertModel.from_pretrained("setu4993/LaBSE")
model = model.eval()

english_sentences = [
    # "puppies",
    "Puppies are nice.",
    # "I enjoy taking long walks along the beach with my dog.",
]
english_inputs = tokenizer(english_sentences, 
                           return_tensors="pt", padding=True)

with torch.no_grad():
    english_outputs = model(**english_inputs)
    
english_embeddings = english_outputs.pooler_output

italian_sentences = [
    # "cuccioli",
    "I cuccioli sono carini.",
    # "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.",
]
japanese_sentences = [
    # "犬",
    "子犬はいいです。",
    # "私は犬と一緒にビーチを散歩するのが好きです"
]
italian_inputs = tokenizer(italian_sentences, return_tensors="pt", padding=True)
japanese_inputs = tokenizer(japanese_sentences, return_tensors="pt", padding=True)

with torch.no_grad():
    italian_outputs = model(**italian_inputs)
    japanese_outputs = model(**japanese_inputs)

italian_embeddings = italian_outputs.pooler_output
japanese_embeddings = japanese_outputs.pooler_output

import torch.nn.functional as F


def similarity(embeddings_1, embeddings_2):
    normalized_embeddings_1 = F.normalize(embeddings_1, p=2)
    normalized_embeddings_2 = F.normalize(embeddings_2, p=2)
    return torch.matmul(
        normalized_embeddings_1, normalized_embeddings_2.transpose(0, 1)
    )


print(similarity(english_embeddings, italian_embeddings))
print(similarity(english_embeddings, japanese_embeddings))
print(similarity(italian_embeddings, japanese_embeddings))

tensor([[0.8598]])
tensor([[0.8709]])
tensor([[0.8003]])


In [38]:
np.array(english_embeddings[0]).reshape(1,-1).shape

(1, 768)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(english_embeddings,
                italian_embeddings)[0][0]
                             

0.6318207

#### **Testing for tweets**

In [30]:
def test_each_language(df, 
                       language_column='tweet_language'
                      ):
    
    import torch
    from transformers import BertModel, BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
    model = BertModel.from_pretrained("setu4993/LaBSE")
    model = model.eval()

    languages = df[language_column].unique()
    for lang in languages:
        print(lang)
        df_lang = (df
                   .loc[df[language_column] == lang]
                   .groupby([language_column])
                   .head(1)
                  )
        tweet = df_lang['tweet_text'].tolist()
        tweet_clean = clean_hp.remove_mentions(tweet[0])
        
        inputs = tokenizer(tweet, 
                           return_tensors="pt",
                           padding=True)

        with torch.no_grad():
            output = model(**inputs)

        embeddings = output.pooler_output
        print(embeddings.shape)
        
    return

In [32]:
# test_each_language(df_rem)

#### **Get embedding and cosine of pairs**

In [4]:
importlib.reload(config_hp)

config = config_hp.config()
balanced = config['BALANCED']

positive_conv = balanced['balanced_pos_conversation']
df_pos = pd.read_pickle(positive_conv)

negative_conv = balanced['balanced_neg_conversation']
df_neg = pd.read_pickle(negative_conv)

df = df_pos.append(df_neg)

  df = df_pos.append(df_neg)


In [54]:
def get_embedding(df, filename):
    import torch
    from transformers import BertModel, BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
    model = BertModel.from_pretrained("setu4993/LaBSE")
    model = model.eval()

    all_replies = []
    total = len(df)
    print(f'\n *** Starting the embedding process: {total} *** \n')
    
    for row in df.iterrows():
        text = row[1]['tweet_text']
        data = row[1]
        
        tweet_clean = clean_hp.remove_mentions(text)
        tweet_clean = clean_hp.remove_hashtags(tweet_clean)
        tweet_clean = clean_hp.remove_URL(tweet_clean)

        inputs = tokenizer(tweet_clean, 
                           return_tensors="pt",
                           padding=True)

        with torch.no_grad():
            output = model(**inputs)

        embeddings = output.pooler_output
        
        all_replies.append([data['replier_tweetid'],
                            data['poster_userid'],
                            data['poster_tweetid'],
                            data['replier_userid'],
                            embeddings
                           ])
        if len(all_replies) % 1000 == 0:
            total = len(all_replies)
            
            print(f'{total} done!')
            
    (pd.DataFrame(data=all_replies,
                 columns=['replier_tweetid',
                          'poster_userid',
                          'poster_tweetid',
                          'replier_userid'
                         ]
                )
    ).to_pickle(filename)
    
    print('\n *** Ending the embedding process *** \n')


In [None]:
importlib.reload(config_hp)

config = config_hp.config()
embedding_path = config['EMBEDDINGS_PATH']
filename = embedding_path['reply_multilanguage_embedding']


get_embedding(df, filename)

#### **Get combination of replier tweetids**

In [17]:
importlib.reload(config_hp)

config = config_hp.config()

embedding_path = config['EMBEDDINGS_PATH']
reply_multilanguage_embedding = embedding_path['reply_multilanguage_embedding']

df_embedding = pd.read_pickle(reply_multilanguage_embedding)

In [18]:
df_unq = df_embedding.groupby(['poster_tweetid', 'replier_userid'])['replier_tweetid'].last().reset_index()

In [19]:
# df_unq

In [19]:
df_size = df_unq.groupby(['poster_tweetid'])['replier_tweetid'].nunique().to_frame('count').reset_index()

In [20]:
df_size['count'].max()

36889

In [None]:
df_size.loc[(df_size['count'] <= 30000) & (df_size['count'] >= 20000)].count()

In [None]:
from itertools import combinations

def try_get_combination(df):
    df_unq = df.groupby(['poster_tweetid', 
                                   'replier_userid'])['replier_tweetid'].last().reset_index()   
    df_size = df_unq.groupby(['poster_tweetid'])[
        'replier_tweetid'].nunique().to_frame('count').reset_index()
    ids = df_size.loc[df_size['count'] < 100]['poster_tweetid']
    
    df_poster = df.loc[df['poster_tweetid'].isin(ids)]
    
    importlib.reload(config_hp)

    config = config_hp.config()

    embedding_path = config['EMBEDDINGS_PATH']

    combination = embedding_path['combination']

    print(len(df_poster))
    
    print('starting')
    
    df_comb = df_poster.groupby('poster_tweetid')['replier_tweetid'].apply(lambda x:
        list(combinations(x, 2))).reset_index()
    print('list here')
    print(df_comb.info())

    df_exploded = df_comb.explode('replier_tweetid')
    df_exploded['replier_x'] = df_exploded['replier_tweetid'].apply(
        lambda x: x[0])
    df_exploded['replier_y'] = df_exploded['replier_tweetid'].apply(
        lambda x: x[1])
    df_emb = df_exploded.merge(df[['replier_tweetid', 'embeddings']],
                  left_on='replier_x',
                  right_on='replier_tweetid'
                 )
    df_emb = df_emb.merge(df[['replier_tweetid', 'embeddings']],
                      left_on='replier_y',
                      right_on='replier_tweetid'
                     )
    print(df_emb.info())
    print('Embdding here')

    def get_cosine(df):
        '''
        Get the cosine similarity of the vector list
        :param vector_list: list of embedding vectors

        :return list
        '''

        df['cosine'] = df.apply(lambda x: round(
                cosine_similarity(x.embeddings_x, x.embeddings_y)[0][0],
                2),
            axis=1)

        return df
    
    
    df_emb_cosine = get_cosine(df_emb)
    
    return df_emb_cosine[['poster_tweetid', 'replier_x', 'replier_y', 'cosine']]
    
df_comb = try_get_combination(df_embedding)

In [None]:
df_comb.head()

In [None]:
importlib.reload(config_hp)

config = config_hp.config()

embedding_path = config['EMBEDDINGS_PATH']

combination = embedding_path['combination']

df_comb.to_pickle(combination)

#### **Calculate cosine similarity**

In [10]:
df_exploded = df_comb.explode('replier_tweetid')

In [12]:
df_exploded['replier_x'] = df_exploded['replier_tweetid'].apply(
    lambda x: x[0]
)

In [13]:
df_exploded['replier_y'] = df_exploded['replier_tweetid'].apply(
    lambda x: x[1]
)

In [17]:
df_emb = df_exploded.merge(df_embedding[['replier_tweetid', 'embeddings']],
                  left_on='replier_x',
                  right_on='replier_tweetid'
                 )
df_emb = df_emb.merge(df_embedding[['replier_tweetid', 'embeddings']],
                  left_on='replier_y',
                  right_on='replier_tweetid'
                 )

In [20]:
df_emb.columns

Index(['poster_tweetid', 'replier_tweetid_x', 'replier_x', 'replier_y',
       'replier_tweetid_y', 'embeddings_x', 'replier_tweetid', 'embeddings_y'],
      dtype='object')

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine(df):
    '''
    Get the cosine similarity of the vector list
    :param vector_list: list of embedding vectors
    
    :return list
    '''
    
    df['cosine'] = df.apply(lambda x: round(
            cosine_similarity(x.embeddings_x, x.embeddings_y)[0][0],
            2),
        axis=1)
    
    return df

In [29]:
df_emb_cosine = get_cosine(df_emb)

In [34]:
df_emb_cosine[['poster_tweetid', 'replier_x', 'replier_y', 'cosine']]

Unnamed: 0,poster_tweetid,replier_x,replier_y,cosine
0,1170262342606700544,1170377372920680450,1170290164276633600,0.31
1,1170262342606700544,1170377372920680450,1170299388486402048,0.25
2,1170262342606700544,1170290164276633600,1170299388486402048,0.33
3,1170262342606700544,1170377372920680450,1170291675039784962,0.27
4,1170262342606700544,1170290164276633600,1170291675039784962,0.38
...,...,...,...,...
620,1175020156231278592,1175036782582423553,1175036516923596800,0.15
621,1175020156231278592,1175036056082812928,1175036516923596800,0.07
622,1175020156231278592,1175035467278036992,1175036516923596800,0.28
623,1175020156231278592,1175038138152079360,1175036516923596800,0.32
