# IMPORT LIBRARIES

In [25]:
import os
import torch
import pke
import json
import pandas as pd
import numpy as np
from nltk.metrics import edit_distance
import nltk
import re
import emoji
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import pandasql as ps
import en_core_web_sm
from pke.lang import stopwords
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import cos_sim
import torch.nn.functional as F
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
spacy.load("en_core_web_sm")

<spacy.lang.en.English at 0x7f0ae3662320>

# METHOD 1: EDIT-DISTANCE

## 1. Import facebook data

In [165]:
def read_valid_json_files_em(folder_path):
    json_files = [file for file in os.listdir(folder_path) if file.endswith('.json')]
    valid_data = []
    for json_file in json_files:
        try:
            with open(os.path.join(folder_path, json_file), 'r') as f:
                data = json.load(f)
                for file in data:                    
                    file['tw_id'] = json_file.split('_')[0]
                    file['file_num'] = json_file[-6]
                valid_data.extend(data)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON file: {json_file}")
    return valid_data

In [166]:
# Data extracting function
def extract_data_em(data):
    extracted_data = []
    for item in data:
        post_id = item.get("post_id")
        user_id = item.get("user_id")
        text = item.get("text")
        username = item.get("username")
        tw_id = item.get("tw_id")
        file_num = item.get("file_num")
        extracted_data.append({"post_id":post_id,
            "username": username, "user_id": user_id, "text": text
                               ,'tw_id':tw_id.lower(),"file_num": file_num
                               })
    return extracted_data

In [167]:
folder_path_fb =  r"fb1800raw"
valid_json_data_fb = read_valid_json_files_em(folder_path_fb)
extracted_data_fb = extract_data_em(valid_json_data_fb)

Skipping invalid JSON file: taylorswift_facebook_6.json


In [168]:
# Create a DataFrame from the extracted data
FB = pd.DataFrame(extracted_data_fb)

## 2. Import X data

In [169]:
def read_valid_json_files_x(folder_path):
    json_files = [file for file in os.listdir(folder_path) if file.endswith('.json')]
    valid_data = []
    for json_file in json_files:
        try:
            with open(os.path.join(folder_path, json_file), 'r') as f:
                data = json.load(f)
                valid_data.extend(data)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON file: {json_file}")
    return valid_data

In [170]:
# Data extracting function
def extract_data_x(data):
    extracted_data = []
    for item in data:
        user_id = item.get("user_id")
        text = item.get("text")
        username = item.get("username")
        extracted_data.append({"username": username, "user_id": user_id, "text": text})
    return extracted_data

In [171]:
folder_path_x = r"tw300raw"
valid_json_data_x = read_valid_json_files_x(folder_path_x)
extracted_data_x = extract_data_x(valid_json_data_x)

In [172]:
X = pd.DataFrame(extracted_data_x)

## 3. Calculate Edit-distance

In [173]:
X_check_edit = X[[
    'user_id','username'
    ]].drop_duplicates(subset=[
        'user_id','username'
        ]).reset_index()
X_check_edit

Unnamed: 0,index,user_id,username
0,0,billgates,Bill Gates
1,1001,carmeloanthony,Carmelo Anthony
2,2002,nickiminaj,Nicki Minaj
3,3003,oprah,Oprah Winfrey
4,4004,beyonce,BEYONCÉ
...,...,...,...
270,260188,itsgabrielleu,Gabrielle Union
271,261189,shawnmichaels,Shawn Michaels
272,262190,rainnwilson,RainnWilson
273,263191,kerihilson,Keri Hilson


In [174]:
FB_check_edit = FB[[
    'user_id','username','tw_id','file_num'
    ]].drop_duplicates(subset=[
        'user_id','username','tw_id','file_num'
        ]).reset_index()
FB_check_edit

Unnamed: 0,index,user_id,username,tw_id,file_num
0,0,587701401348302,Forever 5secondsofsummer,5sos,4
1,4,258244370945844,Ronaldinho: A Legend To Be Remembered,10ronaldinho,5
2,32,,Ronaldinho: A Legend To Be Remembered,10ronaldinho,5
3,54,114824906744797,Brazil ; The Home Of Legends,10ronaldinho,3
4,254,100063455659640,𝟓 𝐒𝐞𝐜𝐨𝐧𝐝𝐬 𝐨𝐟 𝐒𝐮𝐦𝐦𝐞𝐫,5sos,3
...,...,...,...,...,...
2987,715358,100050617653542,Zeed,zedd,4
2988,715360,228908680640576,Zeed,zedd,4
2989,715362,,Zeed,zedd,4
2990,715460,181938933887,Insomniac Events,zedd,3


In [181]:
df_edit = pd.DataFrame(columns=['fb_userid','fb_username','file_num','X_username','X_userid','edit_distance'])
for x in range(0,len(X_check_edit)):
    for fb in range(0,len(FB_check_edit)):
        if (X_check_edit['user_id'][x]==FB_check_edit['tw_id'][fb]):
            e = edit_distance(FB_check_edit['username'][fb]
                          ,X_check_edit['username'][x]) 
            df_edit = df_edit._append({
                                'fb_userid':FB_check_edit['user_id'][fb],
                                'fb_username': FB_check_edit['username'][fb],   
                                'file_num' : FB_check_edit['file_num'][fb],
                                'X_username': X_check_edit['username'][x],
                                'X_userid': X_check_edit['user_id'][x],
                                'edit_distance': e
                            }, ignore_index=True
                                    )

In [182]:
df_edit

Unnamed: 0,fb_userid,fb_username,file_num,X_username,X_userid,edit_distance
0,100754822643960,Bill Gates,3,Bill Gates,billgates,0
1,519305544814653,Conservative Tribune by WJ,4,Bill Gates,billgates,23
2,100044428239741,Bill Gates,1,Bill Gates,billgates,0
3,216311481960,Bill Gates,1,Bill Gates,billgates,0
4,,Bill Gates,1,Bill Gates,billgates,0
...,...,...,...,...,...,...
2742,298696600251434,Arjun Rampal,1,arjun rampal,rampalarjun,2
2743,191103734290916,Arjun Rampal Fans Club,6,arjun rampal,rampalarjun,12
2744,,Arjun Rampal Fans Club,6,arjun rampal,rampalarjun,12
2745,100044453798606,ARJUN RAMPAL-LOVE YOU BHAI,2,arjun rampal,rampalarjun,25


In [183]:
import pandasql as ps
edit_result=ps.sqldf("""
    WITH N AS (
    SELECT *,RANK() OVER (PARTITION BY X_username
                                ORDER BY edit_distance ASC) RANKING
    FROM df_edit
    )
    SELECT X_username,X_userid, fb_username,fb_userid, file_num, edit_distance       
    FROM N         
    WHERE RANKING =1
""" , locals())

ModuleNotFoundError: No module named 'pandasql'

In [None]:
edit_result

## 4. Accuracy

In [None]:
cnt=ps.sqldf("""
    SELECT (COUNT(DISTINCT X_username))
    from true_label
""" , locals())

In [None]:
len(true_label['X_username'].unique())/len(X)

# METHOD 2: EMBEDDING VECTORS

## 1. Import data

In [2]:
# Data reading function
def read_valid_json_files_em(folder_path):
    json_files = [file for file in os.listdir(folder_path) if file.endswith('.json')]
    valid_data = []
    for json_file in json_files:
        try:
            with open(os.path.join(folder_path, json_file), 'r') as f:
                data = json.load(f)
                for file in data:
                    file['file_name_1'] = json_file[:-16]
                    file['file_name_2'] = json_file[:]
                valid_data.extend(data)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON file: {json_file}")
    return valid_data

In [3]:
# Data extracting function
def extract_data_em(data):
    extracted_data = []
    for item in data:
        user_id = item.get("user_id")
        text = item.get("text")
        username = item.get("username")
        file_name_1 = item.get("file_name_1")
        file_name_2 = item.get("file_name_2")
        extracted_data.append({"username": username, "user_id": user_id, "text": text
                               ,"file_name_1": file_name_1, "file_name_2": file_name_2
                               })
    return extracted_data

In [5]:
def read_valid_json_files_em_x(folder_path):
    json_files = [file for file in os.listdir(folder_path) if file.endswith('.json')]
    valid_data = []
    for json_file in json_files:
        try:
            with open(os.path.join(folder_path, json_file), 'r') as f:
                data = json.load(f)
                valid_data.extend(data)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON file: {json_file}")
    return valid_data

In [6]:
def extract_data_em_x(data):
    extracted_data = []
    for item in data:
        user_id = item.get("user_id")
        text = item.get("text")
        username = item.get("username")
        extracted_data.append({"username": username, "user_id": user_id, "text": text})
    return extracted_data

In [7]:
folder_path_x_em = r"tw300raw"
valid_json_data_x_em = read_valid_json_files_em_x(folder_path_x_em)
extracted_data_x_em = extract_data_em_x(valid_json_data_x_em)

In [8]:
folder_path_fb_em = r"fb1800raw"
valid_json_data_fb_em = read_valid_json_files_em(folder_path_fb_em)
extracted_data_fb_em = extract_data_em(valid_json_data_fb_em)

Skipping invalid JSON file: taylorswift_facebook_6.json


In [9]:
# Create a DataFrame from the extracted data
df_fb_em = pd.DataFrame(extracted_data_fb_em)
df_x_em = pd.DataFrame(extracted_data_x_em)

In [10]:
df_x_em.sort_values("username")

Unnamed: 0,username,user_id,text
26939,50cent,50cent,Cyber Monday Sale 🎄35% OFF Storewide! Sale End...
27599,50cent,50cent,🔥 @unclemurda is currently #9 on Netflix ok i ...
27600,50cent,50cent,Happy C Day to my man @SnoopDogg wishing you m...
27601,50cent,50cent,It’s a good thing i don’t talk about things i ...
27602,50cent,50cent,BMF🔥My New #1 👏👏👏👏 https://t.co/3kb7bqZmCu
...,...,...,...
132888,👼🏼,troyesivan,@strrrrre LMAO
132889,👼🏼,troyesivan,@YO_ONGl 🔥🔥🔥
132890,👼🏼,troyesivan,https://t.co/IwbviS1spG
132877,👼🏼,troyesivan,And I’m happy :) https://t.co/LaCauzG2Jx


In [11]:
df_fb_em.sort_values("username")

Unnamed: 0,username,user_id,text,file_name_1,file_name_2
665929,1 9 9 6,102431234759221,Young Keanu Reeves 🔥,Zendaya,Zendaya_facebook_3.json
665988,1 9 9 6,102431234759221,Zendaya ✨\n30 Epic Mistakes in the Most Popula...,Zendaya,Zendaya_facebook_3.json
665989,1 9 9 6,102431234759221,Red carpet 👀🔥,Zendaya,Zendaya_facebook_3.json
665990,1 9 9 6,102431234759221,Monica Bellucci & Alain Delon (1989) 🖤\nPopula...,Zendaya,Zendaya_facebook_3.json
665991,1 9 9 6,102431234759221,Jessica Biel and Chris Evans photographed on t...,Zendaya,Zendaya_facebook_3.json
...,...,...,...,...,...
301,𝟓 𝐒𝐞𝐜𝐨𝐧𝐝𝐬 𝐨𝐟 𝐒𝐮𝐦𝐦𝐞𝐫,100063455659640,Join us for our special show 'The Feeling of F...,5SOS,5SOS_facebook_3.json
300,𝟓 𝐒𝐞𝐜𝐨𝐧𝐝𝐬 𝐨𝐟 𝐒𝐮𝐦𝐦𝐞𝐫,100063455659640,JET BLACK HEART🖤🖤,5SOS,5SOS_facebook_3.json
299,𝟓 𝐒𝐞𝐜𝐨𝐧𝐝𝐬 𝐨𝐟 𝐒𝐮𝐦𝐦𝐞𝐫,100063455659640,Jet Black Heart🖤,5SOS,5SOS_facebook_3.json
313,𝟓 𝐒𝐞𝐜𝐨𝐧𝐝𝐬 𝐨𝐟 𝐒𝐮𝐦𝐦𝐞𝐫,100063455659640,The Summer Brothers,5SOS,5SOS_facebook_3.json


## 2. Clean text data

In [12]:
df_fb_em.isnull().sum()

username          0
user_id        2874
text              0
file_name_1       0
file_name_2       0
dtype: int64

In [13]:
df_fb_em[df_fb_em['user_id'].isnull()]

Unnamed: 0,username,user_id,text,file_name_1,file_name_2
32,Ronaldinho: A Legend To Be Remembered,,This page is back up and running! What do you ...,10Ronaldinho,10Ronaldinho_facebook_5.json
45,Ronaldinho: A Legend To Be Remembered,,,10Ronaldinho,10Ronaldinho_facebook_5.json
47,Ronaldinho: A Legend To Be Remembered,,,10Ronaldinho,10Ronaldinho_facebook_5.json
2213,Luke Hemmings,,,5SOS,5SOS_facebook_2.json
4726,Ashton Irwin,,,Ashton5SOS,Ashton5SOS_facebook_1.json
...,...,...,...,...,...
713388,Jessica Alba,,,ZacEfron,ZacEfron_facebook_4.json
715362,Zeed,,,Zedd,Zedd_facebook_4.json
715443,Zeed,,"Zeed added 12 new photos.\nMay 18, 2014 at 4:0...",Zedd,Zedd_facebook_4.json
715446,Zeed,,"Zeed added 8 new photos.\nMay 13, 2014 at 8:45...",Zedd,Zedd_facebook_4.json


In [14]:
df_fb_em['user_id'] = df_fb_em['user_id'].ffill()

In [15]:
df_fb_em.isnull().sum()

username       0
user_id        0
text           0
file_name_1    0
file_name_2    0
dtype: int64

In [16]:
df_fb_em['text'] = df_fb_em['text'].dropna()

In [17]:
df_x_em['text'] = df_x_em['text'].dropna()

In [18]:
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")

from nltk.corpus import stopwords
import string

stopword=set(stopwords.words('english'))

def clean(text):
    # Chuyển đổi văn bản thành chữ thường
    text = str(text).lower()
    # Loại bỏ các ký tự trong dấu ngoặc vuông
    text = re.sub('\[.*?\]', '', text)
    # Loại bỏ các đường dẫn web
    text = re.sub('https?://\S+|www\.\S+', '', text)
    # Loại bỏ các thẻ HTML
    text = re.sub('<.*?>+', '', text)
    # Loại bỏ các dấu câu
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Loại bỏ các ký tự xuống dòng
    text = re.sub('\n', '', text)
    # Loại bỏ các từ có chứa số
    text = re.sub('\w*\d\w*', '', text)
    # Loại bỏ emoji
    text = emoji.replace_emoji(text)
    # Loại bỏ các từ dừng, ví dụ như là, của, và.
    # Từ dừng là những từ phổ biến nhưng không mang nhiều ý nghĩa cho văn bản.
    text = [word for word in text.split(' ') if word not in stopword]
    # Nối lại các từ còn lại thành một chuỗi văn bản
    text = " ".join(text)
    # Áp dụng thuật toán stemmer để rút gọn các từ về dạng gốc,
    # ví dụ như học -> học, học sinh -> học, học tập -> học.
    # Stemmer là một công cụ để giảm thiểu số lượng các từ khác nhau nhưng có cùng nghĩa trong văn bản.
    text = [stemmer.stem(word) for word in text.split(' ')]
    # Nối lại các từ đã được stem thành một chuỗi văn bản
    text = " ".join(text)
    return text

[nltk_data] Downloading package stopwords to /home/jupyter-
[nltk_data]     hocnv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
df_fb_em["text"] = df_fb_em["text"].apply(clean)

In [20]:
df_x_em["text"] = df_x_em["text"].apply(clean)

## 3. Combine text

#### 3.1. Embeddings 1: gte-large

In [193]:
df_fb_gte = df_fb_em.groupby("username").head(100)

In [194]:
df_x_gte = df_x_em.groupby("username").head(100)

In [195]:
df_fb_gte = df_fb_gte.groupby('username')['text'].agg(lambda x: ' '.join(x)).reset_index()

In [196]:
df_x_gte = df_x_gte.groupby('username')['text'].agg(lambda x: ' '.join(x)).reset_index()

In [197]:
fb_text_list = df_fb_gte["text"].tolist()

In [198]:
x_text_list = df_x_gte["text"].tolist()

In [199]:
model = SentenceTransformer('thenlper/gte-large')

In [200]:
embeddings_fb_gte = model.encode(fb_text_list, convert_to_tensor=True)
embeddings_x_gte = model.encode(x_text_list, convert_to_tensor=True)

In [201]:
embeddings_fb_np = embeddings_fb_gte.cpu().detach().numpy()
embeddings_x_np = embeddings_x_gte.cpu().detach().numpy()

In [202]:
cosine_scores = util.cos_sim(embeddings_x_np, embeddings_fb_np)

In [203]:
#cosine_scores_tensor = torch.tensor(cosine_scores)
cosine_scores = cosine_scores.clone().detach()
max_values, max_indices = torch.max(cosine_scores, dim=1)

In [204]:
df_x_gte

Unnamed: 0,username,text
0,50cent,cyber monday sale storewid sale end tonight s...
1,@jason,mikesalguero think awesom share calendar revie...
2,A Lord,grate it suppos wow excel kind weekend stay sa...
3,A.R.Rahman,happi birthday udhaystalin glad part mamannan ...
4,Adam Levine,ojala maluma therudeboyz offici video stream...
...,...,...
270,tyler oakley (parody),oh wow shaniatwain lakestreetd forget that a...
271,ye,first time maralago rain traffic can't beli...
272,zayn,ask collabor celebr jimihendrix birthday unex...
273,zooey deschanel,love entir convers im donat join let know re...


In [205]:
df_fb_gte

Unnamed: 0,username,text
0,1 9 9 6,cameron diaz mask popular actor chang s...
1,1997.,angelina joli angelina joli leonardo dicapri...
2,"44th POTUS, his story made history.",queen elizabeth alexandra mari ii majesti made...
3,5 Seconds of Summer,merch grab feel love amazon music silver ...
4,5 Seconds of Summer Fam,nice meet guy hometim your done peopl keep ...
...,...,...
1524,𝑱𝒖𝒔𝒕𝒊𝒏 𝑩𝒊𝒆𝒃𝒆𝒓 𝑭𝒂𝒏𝒔,“a gentl word kind look goodnatur smile work w...
1525,𝓩𝓮𝓷𝓭𝓪𝔂𝓪 𝓕𝓪𝓷𝓼,wikiofwitchercomdemi moor look ...
1526,𝙄𝙘𝙤𝙣𝙨,bella hadid sprayon dress coperni show alexa...
1527,𝚅 𝙴 𝚁 𝚂 𝙴 𝚁 𝙴 𝙿 𝚄 𝙱 𝙻 𝙸 𝙲,somewher knowkeanelyricsi walk across empti la...


In [206]:
df_x_gte = df_x_gte.assign(cosine=max_values, position=max_indices)

In [207]:
df_x_gte["fb_username"] = df_x_gte["position"].apply(lambda x: df_fb_gte["username"][x])

In [208]:
df_x_gte

Unnamed: 0,username,text,cosine,position,fb_username
0,50cent,cyber monday sale storewid sale end tonight s...,0.890400,1212,Shaq
1,@jason,mikesalguero think awesom share calendar revie...,0.874581,200,CNBC Make It
2,A Lord,grate it suppos wow excel kind weekend stay sa...,0.900617,790,Lewis_Hamilton
3,A.R.Rahman,happi birthday udhaystalin glad part mamannan ...,0.898668,18,A.R.Rahman
4,Adam Levine,ojala maluma therudeboyz offici video stream...,0.952626,34,Adam Levine
...,...,...,...,...,...
270,tyler oakley (parody),oh wow shaniatwain lakestreetd forget that a...,0.885632,734,Khloé Kardashian
271,ye,first time maralago rain traffic can't beli...,0.889856,150,Best quotes by Steve Harvey
272,zayn,ask collabor celebr jimihendrix birthday unex...,0.895589,1335,The Pillowtalker Zayn Malik
273,zooey deschanel,love entir convers im donat join let know re...,0.911454,1505,Zooey Deschanel


In [209]:
result_gte = df_x_gte[df_x_gte["username"] == df_x_gte["fb_username"]]

In [210]:
fbb = df_fb_em.drop_duplicates("username")

In [211]:
xx = df_x_em.drop_duplicates("username")

In [212]:
dc = pd.merge(fbb, xx, right_on="username", left_on='username', how='inner')

In [213]:
dc

Unnamed: 0,username,user_id_x,text_x,file_name_1,file_name_2,user_id_y,text_y
0,Luke Hemmings,100044531975401,don't think lip ring work eye mask certain th...,5SOS,5SOS_facebook_2.json,luke5sos,fave video we'v ever done
1,A.R.Rahman,696457343797395,,arrahman,arrahman_facebook_3.json,arrahman,happi birthday udhaystalin glad part mamannan ...
2,Ashton Irwin,100044587322642,year anniversari superbloom kick back video c...,Ashton5SOS,Ashton5SOS_facebook_1.json,ashton5sos,hotel room power stanc see tonight gold coast ...
3,Angel Locsin,100044605471738,top rate phenomen tatak angel locsin,143redangel,143redangel_facebook_3.json,143redangel,new vlog drop theangelandneilchannel today s...
4,Neil Patrick Harris,100044619390816,oh hi here sip whiski scotland complex delici ...,ActuallyNPH,ActuallyNPH_facebook_1.json,actuallynph,tomorrow anniversari wondercad onedercad chee...
...,...,...,...,...,...,...,...
210,Jonathan Ross,539314996150555,,wossy,wossy_facebook_3.json,wossy,stephenk bullshit hate said si
211,Christina Aguilera,100044245098301,la reina billboard latinth queen billboard l...,xtina,xtina_facebook_1.json,xtina,aftermath
212,Yeng Constantino,100044560950813,vote excellenceoctob pm ·last chanc cast vo...,YengPLUGGEDin,YengPLUGGEDin_facebook_1.json,yengpluggedin,love yengster
213,Zac Efron,112576674764564,death majesti queen elizabeth profound sad mom...,ZacEfron,ZacEfron_facebook_2.json,zacefron,let fix keep make beauti movi togeth love guy ...


#### 3.2. Accuracy 1: gte-large

In [214]:
print("Accuracy of gte-large = {0}%".format(round((result_gte.shape[0] / dc.shape[0]) * 100, 5)))

Accuracy of gte-large = 53.95349%


#### 3.3. Embeddings 2: e5-small-v2

In [178]:
model_e = SentenceTransformer('intfloat/e5-small-v2')

In [179]:
df_fb_e = df_fb_em.groupby("username").head(100)

In [180]:
df_x_e = df_x_em.groupby("username").head(100)

In [181]:
df_fb_e = df_fb_e.groupby('username')['text'].agg(lambda x: ' '.join(x)).reset_index()

In [182]:
df_x_e = df_x_e.groupby('username')['text'].agg(lambda x: ' '.join(x)).reset_index()

In [183]:
fb_text_list_e = df_fb_e["text"].tolist()

In [184]:
x_text_list_e = df_x_e["text"].tolist()

In [185]:
embeddings_x_e = model_e.encode(x_text_list_e, normalize_embeddings=True, convert_to_tensor=True)
embeddings_fb_e = model_e.encode(fb_text_list_e, normalize_embeddings=True, convert_to_tensor=True)

In [186]:
embeddings_fb_e = embeddings_fb_e.cpu().detach().numpy()
embeddings_x_e = embeddings_x_e.cpu().detach().numpy()

In [187]:
cosine_scores_e = util.cos_sim(embeddings_x_e, embeddings_fb_e)

In [188]:
cosine_scores_e = cosine_scores_e.clone().detach()
max_values_e, max_indices_e = torch.max(cosine_scores_e, dim=1)

In [189]:
df_x_e = df_x_e.assign(cosine=max_values_e, position=max_indices_e)

In [190]:
df_x_e["fb_username"] = df_x_e["position"].apply(lambda x: df_fb_e["username"][x])

In [191]:
result_e = df_x_e[df_x_e["username"] == df_x_e["fb_username"]]

#### 3.4. Accuracy 2: e5-small-v2

In [192]:
print("Accuracy of e5-small-v2 = {0}%".format(round(result_e.shape[0] / 215 * 100, 5)))

Accuracy of e5-small-v2 = 41.86047%


#### 3.5. Embeddings 3: GIST-small-Embedding-v0

In [None]:
model_gist = SentenceTransformer("avsolatorio/GIST-small-Embedding-v0", revision=None)

In [159]:
df_fb_gist_cf = df_fb_em.groupby("username").head(100)

In [160]:
df_x_gist_cf = df_x_em.groupby("username").head(100)

In [161]:
df_fb_gist_cf = df_fb_gist_cf.groupby('username')['text'].agg(lambda x: ' '.join(x)).reset_index()

In [162]:
df_x_gist_cf = df_x_gist_cf.groupby('username')['text'].agg(lambda x: ' '.join(x)).reset_index()

In [163]:
fb_gist_list_cf = df_fb_gist_cf["text"].tolist()

In [166]:
x_gist_list_cf = df_x_gist_cf["text"].tolist()

In [167]:
embeddings_x_gist = model_gist.encode(x_gist_list_cf, convert_to_tensor=True)
embeddings_fb_gist = model_gist.encode(fb_gist_list_cf, convert_to_tensor=True)

In [168]:
embeddings_fb_gist = embeddings_fb_gist.cpu().detach().numpy()
embeddings_x_gist = embeddings_x_gist.cpu().detach().numpy()

In [170]:
cosine_scores_gist = util.cos_sim(embeddings_x_gist, embeddings_fb_gist)

In [171]:
cosine_scores_gist = cosine_scores_gist.clone().detach()
max_values_gist, max_indices_gist = torch.max(cosine_scores_gist, dim=1)

In [172]:
df_x_gist_cf = df_x_gist_cf.assign(cosine=max_values_gist, position=max_indices_gist)

In [173]:
df_x_gist_cf["fb_username"] = df_x_gist_cf["position"].apply(lambda x: df_fb_gist_cf["username"][x])

In [174]:
result_gist = df_x_gist_cf[df_x_gist_cf["username"] == df_x_gist_cf["fb_username"]]
result_gist

Unnamed: 0,username,text,cosine,position,fb_username
4,Adam Levine,ojala maluma therudeboyz offici video stream...,0.943540,34,Adam Levine
5,Adele,weekend dress nine nye would love come woul...,0.923433,39,Adele
7,Alia Bhatt,travel ab badal gaya haibadloapnaexperi ad...,0.934073,49,Alia Bhatt
10,Andrés Iniesta,mucha felicidad fcbarcelona vamo españa fifa...,0.856171,75,Andrés Iniesta
14,Anthony Bourdain,song score tonight hong kong partsunknowncnn g...,0.880438,96,Anthony Bourdain
...,...,...,...,...,...
243,Tyler Perry,tonight episod sistasonbet air believ episod...,0.936446,1413,Tyler Perry
246,Victoria Beckham,new musthav bag holiday seasonth fun size mini...,0.884670,1433,Victoria Beckham
247,Victoria Justice,ain't parti like diwali parti cuz diwali par...,0.928102,1438,Victoria Justice
248,Virat Kohli,earn cr peer period year invest upskil zero...,0.903246,1445,Virat Kohli


#### 3.6. Accuracy 3: GIST-small-Embedding-v0

In [177]:
print("Accuracy of GIST-small-Embedding-v0 model is {0}%".format(round(result_gist.shape[0] / 215 * 100,5)))

Accuracy of GIST-small-Embedding-v0 model is 45.11628%


## 4. Compare file_name

#### 4.1. Embeddings 1: e5-small-v2

In [21]:
model_e5 = SentenceTransformer('intfloat/e5-small-v2')

In [37]:
df_fb_gist = df_fb_em.groupby('user_id').agg({
    'username': 'first',  # Chọn giá trị đầu tiên cho 'username'
    'text': lambda x: ' '.join(x),  # Kết hợp các giá trị trong 'text'
    'file_name_1': 'first',
    'file_name_2' : 'first'# Chọn giá trị đầu tiên cho 'file_name'
}).reset_index()
df_fb_gist

Unnamed: 0,user_id,username,text,file_name_1,file_name_2
0,719210943,Farah Khan Ali,year begin high note thank jaipurjewellerysho...,TheFarahKhan,TheFarahKhan_facebook_6.json
1,100024590681364,Shraddha kapoor The Adorable Actress 2,shraddhakapoor shraddhakapoorfan shraddhakapo...,ShraddhaKapoor,ShraddhaKapoor_facebook_4.json
2,100024879207187,One Direction News,harri style olivia wild arriv don't worri darl...,NiallOfficial,NiallOfficial_facebook_5.json
3,100028312309284,Skandalous Talk,streamspaceseptemb pm · rock kevin hart lik...,SnoopDogg,SnoopDogg_facebook_4.json
4,100028409966696,A DAY ON THE GREEN,sat mar adam shine light,johngreen,johngreen_facebook_6.json
...,...,...,...,...,...
2269,9899376497,John Cena,john cena vs batista alexa bliss vs bayley rel...,JohnCena,JohnCena_facebook_1.json
2270,9934379006,Miranda Cosgrove,witch tonight today la let come togeth reth...,MirandaCosgrove,MirandaCosgrove_facebook_1.json
2271,99423386070,Florida Gators,still danc congrat gatorsvb advanc round home...,official_flo,official_flo_facebook_4.json
2272,9964154115,Channing Tatum,get back speed ride year still need get knee b...,channingtatum,channingtatum_facebook_1.json


In [38]:
df_x_gist = df_x_em.groupby('user_id').agg({
    'username': 'first',
    'text': lambda x: ' '.join(x)
}).reset_index()
df_x_gist

Unnamed: 0,user_id,username,text
0,10ronaldinho,Ronaldinho,mai um rolê bruxo agora com meus parceiro da p...
1,143redangel,Angel Locsin,new vlog drop theangelandneilchannel today s...
2,50cent,50cent,cyber monday sale storewid sale end tonight s...
3,actuallynph,Neil Patrick Harris,tomorrow anniversari wondercad onedercad chee...
4,adamlevine,Adam Levine,ojala maluma therudeboyz offici video stream...
...,...,...,...
270,zacefron,Zac Efron,let fix keep make beauti movi togeth love guy ...
271,zaynmalik,zayn,ask collabor celebr jimihendrix birthday unex...
272,zedd,Zedd,couragejd proud buddi thankyoux transientlab ...
273,zendaya,Zendaya,luxurylaw loeweoffici step away music quit...


In [79]:
fb_text_list = df_fb_gist["text"].tolist()
x_text_list = df_x_gist["text"].tolist()

In [81]:
embeddings_fb_e5 = model_e5.encode(fb_text_list, convert_to_tensor=True)
embeddings_x_e5 = model_e5.encode(x_text_list, convert_to_tensor=True)

In [82]:
embeddings_fb_e5 = embeddings_fb_e5.cpu().detach().numpy()
embeddings_x_e5 = embeddings_x_e5.cpu().detach().numpy()

In [88]:
cc = embeddings_fb_e5.tolist()
gg = embeddings_x_e5.tolist()

In [90]:
df_fb_gist["text_vector"] = cc

In [91]:
df_fb_gist

Unnamed: 0,user_id,username,text,file_name_1,file_name_2,text_vector
0,719210943,Farah Khan Ali,year begin high note thank jaipurjewellerysho...,TheFarahKhan,TheFarahKhan_facebook_6.json,"[-0.08209969848394394, 0.06346770375967026, 0...."
1,100024590681364,Shraddha kapoor The Adorable Actress 2,shraddhakapoor shraddhakapoorfan shraddhakapo...,ShraddhaKapoor,ShraddhaKapoor_facebook_4.json,"[-0.11407114565372467, 0.04150240495800972, -0..."
2,100024879207187,One Direction News,harri style olivia wild arriv don't worri darl...,NiallOfficial,NiallOfficial_facebook_5.json,"[-0.0811256393790245, 0.025075891986489296, 0...."
3,100028312309284,Skandalous Talk,streamspaceseptemb pm · rock kevin hart lik...,SnoopDogg,SnoopDogg_facebook_4.json,"[-0.07153625786304474, 0.050333671271800995, 0..."
4,100028409966696,A DAY ON THE GREEN,sat mar adam shine light,johngreen,johngreen_facebook_6.json,"[-0.09362127631902695, 0.017025548964738846, 0..."
...,...,...,...,...,...,...
2269,9899376497,John Cena,john cena vs batista alexa bliss vs bayley rel...,JohnCena,JohnCena_facebook_1.json,"[-0.14030547440052032, 0.033722955733537674, 0..."
2270,9934379006,Miranda Cosgrove,witch tonight today la let come togeth reth...,MirandaCosgrove,MirandaCosgrove_facebook_1.json,"[-0.06961727887392044, 0.06061193719506264, 0...."
2271,99423386070,Florida Gators,still danc congrat gatorsvb advanc round home...,official_flo,official_flo_facebook_4.json,"[-0.12481772154569626, 0.029234565794467926, 0..."
2272,9964154115,Channing Tatum,get back speed ride year still need get knee b...,channingtatum,channingtatum_facebook_1.json,"[-0.08227701485157013, 0.056930433958768845, 0..."


In [92]:
df_x_gist["text_vector"] = gg

In [93]:
df_x_gist

Unnamed: 0,user_id,username,text,text_vector
0,10ronaldinho,Ronaldinho,mai um rolê bruxo agora com meus parceiro da p...,"[-0.11690832674503326, 0.06458992511034012, 0...."
1,143redangel,Angel Locsin,new vlog drop theangelandneilchannel today s...,"[-0.06213437765836716, 0.038958705961704254, 0..."
2,50cent,50cent,cyber monday sale storewid sale end tonight s...,"[-0.09256376326084137, 0.028269054368138313, 0..."
3,actuallynph,Neil Patrick Harris,tomorrow anniversari wondercad onedercad chee...,"[-0.09586158394813538, 0.03052220307290554, 0...."
4,adamlevine,Adam Levine,ojala maluma therudeboyz offici video stream...,"[-0.06409592181444168, 0.06178886815905571, 0...."
...,...,...,...,...
270,zacefron,Zac Efron,let fix keep make beauti movi togeth love guy ...,"[-0.09526766091585159, 0.037829067558050156, 0..."
271,zaynmalik,zayn,ask collabor celebr jimihendrix birthday unex...,"[-0.09214373677968979, 0.04666445031762123, 0...."
272,zedd,Zedd,couragejd proud buddi thankyoux transientlab ...,"[-0.08149664103984833, 0.049187108874320984, 0..."
273,zendaya,Zendaya,luxurylaw loeweoffici step away music quit...,"[-0.09616198390722275, 0.06231871619820595, 0...."


In [113]:
data = []
for x in range(len(df_x_gist)):
    for fb in range(len(df_fb_gist)):
        if df_x_gist["user_id"][x] == df_fb_gist["file_name_1"][fb].lower():
            cosine = util.cos_sim(df_x_gist["text_vector"][x], df_fb_gist["text_vector"][fb]).item()
            data.append({
                "fb_username": df_fb_gist["username"][fb],
                "x_username": df_x_gist["username"][x],
                "x_userid": df_x_gist["user_id"][x],
                "file_name": df_fb_gist["file_name_2"][fb],
                "cosine": cosine
            })

In [114]:
data_finish = pd.DataFrame(data)

In [115]:
data_finish

Unnamed: 0,fb_username,x_username,x_userid,file_name,cosine
0,Futrj Football Channel,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_6.json,0.917860
1,Ronaldinho Gaúcho,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_1.json,0.933636
2,"Trener Dryblingu Piłkarskiego - Piotr ""Olo"" Ol...",Ronaldinho,10ronaldinho,10Ronaldinho_facebook_4.json,0.853782
3,Raphinha,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_2.json,0.790712
4,Raphinha,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_2.json,0.917504
...,...,...,...,...,...
2012,Zooey Deschanel,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_1.json,0.936490
2013,Jonathan Silver Scott,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_3.json,0.935158
2014,I Love you Zooey Deschanel,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_4.json,0.908975
2015,Emily Deschanel Fanpage En/Sk,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_6.json,0.901184


In [120]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

data = []

# Vectorize text data for both df_x_gist and df_fb_gist
vectorizer = TfidfVectorizer()
tfidf_matrix_x = vectorizer.fit_transform(df_x_gist["text"])
tfidf_matrix_fb = vectorizer.transform(df_fb_gist["text"])

# Loop through each pair of texts
for x in range(len(df_x_gist)):
    for fb in range(len(df_fb_gist)):
        if df_x_gist["user_id"][x] == df_fb_gist["file_name_1"][fb].lower():
            # Calculate cosine similarity
            cosine = cosine_similarity(tfidf_matrix_x[x], tfidf_matrix_fb[fb])[0][0]
            data.append({
                "fb_username": df_fb_gist["username"][fb],
                "x_username": df_x_gist["username"][x],
                "x_userid": df_x_gist["user_id"][x],
                "file_name": df_fb_gist["file_name_2"][fb],
                "cosine": cosine
            })

In [116]:
data_finish = data_finish.groupby("x_userid").agg({
    "fb_username" : "first",
    "cosine" : "max",
    "file_name" : "first"
})

In [117]:
data_finish

Unnamed: 0_level_0,fb_username,cosine,file_name
x_userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10ronaldinho,Futrj Football Channel,0.933636,10Ronaldinho_facebook_6.json
143redangel,Angel Locsin Supporters,0.918688,143redangel_facebook_4.json
50cent,50 Cent,0.909899,50cent_facebook_1.json
actuallynph,Neil Patrick Harris,0.937589,ActuallyNPH_facebook_1.json
adamlevine,Maroon 5,0.945554,adamlevine_facebook_3.json
...,...,...,...
zacefron,Zac Efron,0.940685,ZacEfron_facebook_1.json
zaynmalik,The Pillowtalker Zayn Malik,0.913868,zaynmalik_facebook_4.json
zedd,Zedd,0.912794,Zedd_facebook_1.json
zendaya,Celebrity Ladies,0.931439,Zendaya_facebook_4.json


In [119]:
true_e5 = data_finish[data_finish["file_name"].str[-6] == "1"]

#### 4.2. Accuracy 1: e5-small-v2

In [122]:
print("Accuracy of e5-small-v2 model is {0}%".format(round(true_e5.shape[0] / data_finish.shape[0] * 100, 5)))

Accuracy of e5-small-v2 model is 63.56877%


#### 4.3. Embeddings 2: GIST-small-Embedding-v0

In [110]:
model_gist = SentenceTransformer("avsolatorio/GIST-small-Embedding-v0", revision=None)

In [123]:
df_fb_gist = df_fb_em.groupby('user_id').agg({
    'username': 'first',
    'text': lambda x: ' '.join(x),
    'file_name_1': 'first',
    'file_name_2' : 'first'
}).reset_index()
df_fb_gist

Unnamed: 0,user_id,username,text,file_name_1,file_name_2
0,719210943,Farah Khan Ali,year begin high note thank jaipurjewellerysho...,TheFarahKhan,TheFarahKhan_facebook_6.json
1,100024590681364,Shraddha kapoor The Adorable Actress 2,shraddhakapoor shraddhakapoorfan shraddhakapo...,ShraddhaKapoor,ShraddhaKapoor_facebook_4.json
2,100024879207187,One Direction News,harri style olivia wild arriv don't worri darl...,NiallOfficial,NiallOfficial_facebook_5.json
3,100028312309284,Skandalous Talk,streamspaceseptemb pm · rock kevin hart lik...,SnoopDogg,SnoopDogg_facebook_4.json
4,100028409966696,A DAY ON THE GREEN,sat mar adam shine light,johngreen,johngreen_facebook_6.json
...,...,...,...,...,...
2269,9899376497,John Cena,john cena vs batista alexa bliss vs bayley rel...,JohnCena,JohnCena_facebook_1.json
2270,9934379006,Miranda Cosgrove,witch tonight today la let come togeth reth...,MirandaCosgrove,MirandaCosgrove_facebook_1.json
2271,99423386070,Florida Gators,still danc congrat gatorsvb advanc round home...,official_flo,official_flo_facebook_4.json
2272,9964154115,Channing Tatum,get back speed ride year still need get knee b...,channingtatum,channingtatum_facebook_1.json


In [124]:
df_x_gist = df_x_em.groupby('user_id').agg({
    'username': 'first',
    'text': lambda x: ' '.join(x)
}).reset_index()
df_x_gist

Unnamed: 0,user_id,username,text
0,10ronaldinho,Ronaldinho,mai um rolê bruxo agora com meus parceiro da p...
1,143redangel,Angel Locsin,new vlog drop theangelandneilchannel today s...
2,50cent,50cent,cyber monday sale storewid sale end tonight s...
3,actuallynph,Neil Patrick Harris,tomorrow anniversari wondercad onedercad chee...
4,adamlevine,Adam Levine,ojala maluma therudeboyz offici video stream...
...,...,...,...
270,zacefron,Zac Efron,let fix keep make beauti movi togeth love guy ...
271,zaynmalik,zayn,ask collabor celebr jimihendrix birthday unex...
272,zedd,Zedd,couragejd proud buddi thankyoux transientlab ...
273,zendaya,Zendaya,luxurylaw loeweoffici step away music quit...


In [125]:
fb_text_list_gist = df_fb_gist["text"].tolist()
x_text_list_gist = df_x_gist["text"].tolist()

In [147]:
embeddings_fb_gist = model_gist.encode(fb_text_list_gist, convert_to_tensor=True)
embeddings_x_gist = model_gist.encode(x_text_list_gist, convert_to_tensor=True)

In [148]:
embeddings_fb_gist = embeddings_fb_gist.cpu().detach().numpy()
embeddings_x_gist = embeddings_x_gist.cpu().detach().numpy()

In [149]:
gist1 = embeddings_fb_gist.tolist()
gist2 = embeddings_x_gist.tolist()

In [150]:
df_fb_gist["text_vector"] = gist1

In [151]:
df_x_gist["text_vector"] = gist2

In [152]:
data = []
for x in range(len(df_x_gist)):
    for fb in range(len(df_fb_gist)):
        if df_x_gist["user_id"][x] == df_fb_gist["file_name_1"][fb].lower():
            scores = F.cosine_similarity(torch.tensor(df_x_gist["text_vector"][x]), torch.tensor(df_fb_gist["text_vector"][fb]), dim=-1)
            data.append({
                "fb_username": df_fb_gist["username"][fb],
                "x_username": df_x_gist["username"][x],
                "x_userid": df_x_gist["user_id"][x],
                "file_name": df_fb_gist["file_name_2"][fb],
                "cosine": cosine
            })

In [153]:
data_finish_gist = pd.DataFrame(data)

In [154]:
data_finish_gist

Unnamed: 0,fb_username,x_username,x_userid,file_name,cosine
0,Futrj Football Channel,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_6.json,0.898605
1,Ronaldinho Gaúcho,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_1.json,0.898605
2,"Trener Dryblingu Piłkarskiego - Piotr ""Olo"" Ol...",Ronaldinho,10ronaldinho,10Ronaldinho_facebook_4.json,0.898605
3,Raphinha,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_2.json,0.898605
4,Raphinha,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_2.json,0.898605
...,...,...,...,...,...
2012,Zooey Deschanel,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_1.json,0.898605
2013,Jonathan Silver Scott,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_3.json,0.898605
2014,I Love you Zooey Deschanel,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_4.json,0.898605
2015,Emily Deschanel Fanpage En/Sk,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_6.json,0.898605


In [155]:
data_finish_gist = data_finish.groupby("x_userid").agg({
    "fb_username" : "first",
    "cosine" : "max",
    "file_name" : "first"
})

In [156]:
data_finish_gist

Unnamed: 0_level_0,fb_username,cosine,file_name
x_userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10ronaldinho,Futrj Football Channel,0.933636,10Ronaldinho_facebook_6.json
143redangel,Angel Locsin Supporters,0.918688,143redangel_facebook_4.json
50cent,50 Cent,0.909899,50cent_facebook_1.json
actuallynph,Neil Patrick Harris,0.937589,ActuallyNPH_facebook_1.json
adamlevine,Maroon 5,0.945554,adamlevine_facebook_3.json
...,...,...,...
zacefron,Zac Efron,0.940685,ZacEfron_facebook_1.json
zaynmalik,The Pillowtalker Zayn Malik,0.913868,zaynmalik_facebook_4.json
zedd,Zedd,0.912794,Zedd_facebook_1.json
zendaya,Celebrity Ladies,0.931439,Zendaya_facebook_4.json


In [157]:
true_gist = data_finish[data_finish["file_name"].str[-6] == "1"]

#### 4.4. Accuracy 2: GIST-small-Embedding-v0

In [158]:
print("Accuracy of GIST-small-Embedding-v0 model is {0}%".format(round(true_gist.shape[0] / data_finish_gist.shape[0] * 100, 5)))

Accuracy of GIST-small-Embedding-v0 model is 63.56877%


# METHOD 3: KEYWORD EMBEDDINGS

## 1. Positionrank

In [31]:
df_fb_pr = pd.DataFrame(df_fb_em)
df_x_pr = pd.DataFrame(df_x_em)

In [33]:
df_fb_pr = df_fb_pr.groupby(['username', "file_name_2"])['text'].agg(lambda x: ' '.join(x)).reset_index()
df_x_pr = df_x_pr.groupby(['user_id','username'])['text'].agg(lambda x: ' '.join(x)).reset_index()

In [34]:
data_pr_x = []
for i in range(0,len(df_x_pr)):
    pos = {'NOUN', 'PROPN', 'ADJ'}
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
    extractor = pke.unsupervised.PositionRank()
    extractor.load_document(input=str(df_x_pr['text'][i]),
                            language='en',
                            normalization='stemming')
    extractor.candidate_selection(grammar=grammar,
                              maximum_word_number=10)
    extractor.candidate_weighting(window=10,
                              pos=pos)
    x_keyphrase=[]
    x_score=[]
    for (keyphrase, score) in extractor.get_n_best(n=1, stemming=True):
        x_keyphrase.append(keyphrase)
        x_score.append(score)
        data_pr_x.append({
                    'x_username': df_x_pr['username'][i],
                    'x_userid' : df_x_pr['username'][i],
                    'x_text': df_x_pr['text'][i],
                    'x_keyphrase': x_keyphrase,
                    'x_score': x_score
                })

In [94]:
df_x_pr

Unnamed: 0,user_id,username,text
0,10ronaldinho,Ronaldinho,mai um rolê bruxo agora com meus parceiro da p...
1,143redangel,Angel Locsin,new vlog drop theangelandneilchannel today s...
2,50cent,50cent,cyber monday sale storewid sale end tonight s...
3,actuallynph,Neil Patrick Harris,tomorrow anniversari wondercad onedercad chee...
4,adamlevine,Adam Levine,ojala maluma therudeboyz offici video stream...
...,...,...,...
270,zacefron,Zac Efron,let fix keep make beauti movi togeth love guy ...
271,zaynmalik,zayn,ask collabor celebr jimihendrix birthday unex...
272,zedd,Zedd,couragejd proud buddi thankyoux transientlab ...
273,zendaya,Zendaya,luxurylaw loeweoffici step away music quit...


In [35]:
data_pr_fb = []
for i in range(0,len(df_fb_pr)):
    pos = {'NOUN', 'PROPN', 'ADJ'}
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
    extractor = pke.unsupervised.PositionRank()
    extractor.load_document(input=str(df_fb_pr['text'][i]),
                            language='en',
                            normalization='stemming')
    extractor.candidate_selection(grammar=grammar,
                              maximum_word_number=10)
    extractor.candidate_weighting(window=10,
                              pos=pos)
    fb_keyphrase=[]
    fb_score=[]
    for (keyphrase, score) in extractor.get_n_best(n=1, stemming=True):
        fb_keyphrase.append(keyphrase)
        fb_score.append(score)
        data_pr_fb.append({
                    'fb_username': df_fb_pr['username'][i],
                    'fb_text': df_fb_pr['text'][i],
                    'fb_keyphrase': fb_keyphrase,
                    'fb_score': fb_score,
                    'file_name_1': df_fb_pr['file_name_1'][i],
                    'file_name_2': df_fb_pr['file_name_2'][i]
                })

KeyError: 'file_name_1'

In [63]:
data_pr_x = pd.DataFrame(data_pr_x)
data_pr_x

Unnamed: 0,x_username,x_text,x_keyphrase,x_score
0,50cent,cyber monday sale storewid sale end tonight s...,[lol green light gang bransoncognac lechemindu...,[0.1407137229320425]
1,@jason,mikesalguero think awesom share calendar revie...,[someth peopl love amp respect laugh amp],[0.04722670383624279]
2,A Lord,grate it suppos wow excel kind weekend stay sa...,[love day tri happi choic mani realiz believ k...,[0.3024742845936213]
3,A.R.Rahman,happi birthday udhaystalin glad part mamannan ...,[happi birthday udhaystalin glad part mamannan...,[0.07252516022626031]
4,Adam Levine,ojala maluma therudeboyz offici video stream...,[iamdamienmu vote vote vote tonight dial],[0.06373378268493492]
...,...,...,...,...
270,tyler oakley (parody),oh wow shaniatwain lakestreetd forget that a...,[amp love amp amp gosh brain big amp wrink],[0.14264694105909392]
271,ye,first time maralago rain traffic can't beli...,[first time maralago],[0.04579993064643104]
272,zayn,ask collabor celebr jimihendrix birthday unex...,[memyselfandi bestcoversong iheartaward thank ...,[0.05985025202156105]
273,zooey deschanel,love entir convers im donat join let know re...,[love flower love love happi valentin day],[0.03877024906434363]


In [65]:
model_gist = SentenceTransformer("avsolatorio/GIST-small-Embedding-v0", revision=None)

In [97]:
fb_pr_list_gist = data_pr_fb["fb_keyphrase"].tolist()
x_pr_list_gist = data_pr_x["x_keyphrase"].tolist()

In [98]:
converted_list_pr_fb = [item[0] for item in fb_pr_list_gist]
converted_list_pr_x = [item[0] for item in x_pr_list_gist]

In [99]:
embeddings_fb_pr_gist = model_gist.encode(converted_list_pr_fb, convert_to_tensor=True)
embeddings_x_pr_gist = model_gist.encode(converted_list_pr_x, convert_to_tensor=True)

In [100]:
embeddings_fb_pr_gist = embeddings_fb_pr_gist.cpu().detach().numpy()
embeddings_x_pr_gist = embeddings_x_pr_gist.cpu().detach().numpy()

In [101]:
gist_pr_fb = embeddings_fb_pr_gist.tolist()
gist_pr_x = embeddings_x_pr_gist.tolist()

In [102]:
data_pr_fb["text_vector"] = gist_pr_fb
data_pr_x["text_vector"] = gist_pr_x

In [104]:
data_pr_fb

Unnamed: 0,fb_username,fb_text,fb_keyphrase,fb_score,file_name_2,text_vector
0,1 9 9 6,cameron diaz mask popular actor chang s...,[cameron diaz cann film festiv],[0.05983370040697931],LeoDiCaprio_facebook_3.json,"[-0.03853299096226692, -0.008082009851932526, ..."
1,1 9 9 6,cameron diaz mask popular actor chang ic...,[icon actor movi fouzia hattabiseptemb],[0.0632936598281836],Zendaya_facebook_3.json,"[-0.0396171510219574, 0.05737358704209328, 0.0..."
2,1997.,angelina joli angelina joli leonardo dicapri...,[leonardo dicaprio kate winslet set titan],[0.2530076599834736],LeoDiCaprio_facebook_2.json,"[-0.04441084340214729, 0.03955598548054695, 0...."
3,"44th POTUS, his story made history.",queen elizabeth alexandra mari ii majesti made...,[celebr happi birthday presid biden happi birt...,[0.17357329099664737],MichelleObama_facebook_5.json,"[-0.12910056114196777, 0.05880581587553024, -0..."
4,"44th POTUS, his story made history.",queen elizabeth alexandra mari ii majesti made...,[celebr happi birthday presid biden happi birt...,[0.17357329099664737],barackobama_facebook_6.json,"[-0.12910056114196777, 0.05880581587553024, -0..."
...,...,...,...,...,...,...
1800,𝑱𝒖𝒔𝒕𝒊𝒏 𝑩𝒊𝒆𝒃𝒆𝒓 𝑭𝒂𝒏𝒔,“a gentl word kind look goodnatur smile work w...,[justin bieber 𝗝𝗼𝗶𝗻 𝗧𝗵𝗶𝘀 𝗚𝗿𝗼𝘂𝗽 justin bieber b...,[0.5298012734056233],justinbieber_facebook_2.json,"[-0.03818676993250847, -0.018508898094296455, ..."
1801,𝓩𝓮𝓷𝓭𝓪𝔂𝓪 𝓕𝓪𝓷𝓼,wikiofwitchercomdemi moor look ...,[loung bikini wikiofwitchercomsofia vergara fl...,[0.18062534490329424],Zendaya_facebook_2.json,"[-0.043392378836870193, -0.047976549714803696,..."
1802,𝙄𝙘𝙤𝙣𝙨,bella hadid sprayon dress coperni show alexa...,[kyli jenner kim kardashian kendal jenner kend...,[0.3238240785428218],LeoDiCaprio_facebook_4.json,"[-0.0299391932785511, -0.0015602795174345374, ..."
1803,𝚅 𝙴 𝚁 𝚂 𝙴 𝚁 𝙴 𝙿 𝚄 𝙱 𝙻 𝙸 𝙲,somewher knowkeanelyricsi walk across empti la...,[followyoutub channel facebook page songslyr o...,[0.05431443673922178],jason_mraz_facebook_6.json,"[-0.045686908066272736, -0.05799580737948418, ..."


In [103]:
data_pr_x

Unnamed: 0,x_userid,x_username,x_text,x_keyphrase,x_score,text_vector
0,10ronaldinho,Ronaldinho,mai um rolê bruxo agora com meus parceiro da p...,[e jogar com ídolo e grand amigo em doi],[0.09020405427678241],"[-0.013693093322217464, 0.03607881814241409, 0..."
1,143redangel,Angel Locsin,new vlog drop theangelandneilchannel today s...,[thank sa mga nakipagchikahan sa amin sa pina ...,[0.14146163636709397],"[-0.048325881361961365, 0.058305948972702026, ..."
2,50cent,50cent,cyber monday sale storewid sale end tonight s...,[lol green light gang bransoncognac lechemindu...,[0.1407137229320425],"[-0.04142908379435539, -0.001193844829685986, ..."
3,actuallynph,Neil Patrick Harris,tomorrow anniversari wondercad onedercad chee...,[countri first newslett drop tomorrow wondercad],[0.04758505772657545],"[-0.0208944920450449, -0.034327443689107895, 0..."
4,adamlevine,Adam Levine,ojala maluma therudeboyz offici video stream...,[iamdamienmu vote vote vote tonight dial],[0.06373378268493492],"[-0.0789669081568718, 0.0036763264797627926, 0..."
...,...,...,...,...,...,...
270,zacefron,Zac Efron,let fix keep make beauti movi togeth love guy ...,[beauti movi togeth love guy],[0.05258830201341473],"[-0.04326127469539642, 0.07125957310199738, 0...."
271,zaynmalik,zayn,ask collabor celebr jimihendrix birthday unex...,[memyselfandi bestcoversong iheartaward thank ...,[0.05985025202156105],"[-0.05527028068900108, 0.022582782432436943, 0..."
272,zedd,Zedd,couragejd proud buddi thankyoux transientlab ...,[couragejd love u buddi happi birthday],[0.06304944028766526],"[-0.09585268795490265, 0.03892474249005318, 0...."
273,zendaya,Zendaya,luxurylaw loeweoffici step away music quit...,[brunomar luxurylaw tommyhilfig thank man appr...,[0.06641876186675881],"[-0.04071503505110741, 0.10008018463850021, 0...."


In [108]:
data = []
for x in range(len(data_pr_x)):
    for fb in range(len(data_pr_fb)):
        if data_pr_x["x_userid"][x] == data_pr_fb["file_name_2"][fb].lower()[:-16]:
            cosine = F.cosine_similarity(torch.tensor(data_pr_x["text_vector"][x]), torch.tensor(data_pr_fb["text_vector"][fb]), dim=-1).item()
            data.append({
                "fb_username": data_pr_fb["fb_username"][fb],
                "x_username": data_pr_x["x_username"][x],
                "x_userid": data_pr_x["x_userid"][x],
                "fb_file_name": data_pr_fb["file_name_2"][fb],
                "cosine": cosine
            })

In [109]:
data_finish_pr_gist = pd.DataFrame(data)
data_finish_pr_gist

Unnamed: 0,fb_username,x_username,x_userid,fb_file_name,cosine
0,Brazil ; The Home Of Legends,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_3.json,0.676023
1,Futrj Football Channel,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_6.json,0.668486
2,Raphinha,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_2.json,0.766764
3,Ronaldinho Gaúcho,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_1.json,0.814049
4,Ronaldinho: A Legend To Be Remembered,Ronaldinho,10ronaldinho,10Ronaldinho_facebook_5.json,0.746551
...,...,...,...,...,...
1608,I Love you Zooey Deschanel,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_4.json,0.638112
1609,Jonathan Silver Scott,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_3.json,0.687953
1610,Sarah Nelson Makeup,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_5.json,0.708288
1611,Zooey Deschanel,zooey deschanel,zooeydeschanel,ZooeyDeschanel_facebook_1.json,1.000000


In [112]:
data_finish_pr_gist = data_finish_pr_gist.groupby("x_userid").agg({
    "fb_username" : "first",
    "cosine" : "max",
    "fb_file_name" : "first"
})

In [117]:
data_finish_pr_gist

Unnamed: 0_level_0,fb_username,cosine,fb_file_name
x_userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10ronaldinho,Brazil ; The Home Of Legends,0.814049,10Ronaldinho_facebook_3.json
143redangel,Alyanna Angeles,0.767110,143redangel_facebook_5.json
50cent,50 Cent,0.797390,50cent_facebook_1.json
actuallynph,Best Time Ever With Neil Patrick Harris,0.800524,ActuallyNPH_facebook_4.json
adamlevine,Adam Levine,0.712481,adamlevine_facebook_1.json
...,...,...,...
zacefron,Jessica Alba,0.699006,ZacEfron_facebook_4.json
zaynmalik,Free-Fire,0.772427,zaynmalik_facebook_2.json
zedd,Cosmos Music,0.681543,Zedd_facebook_5.json
zendaya,1 9 9 6,0.733082,Zendaya_facebook_3.json


In [116]:
true_gist = data_finish_pr_gist[data_finish_pr_gist["fb_file_name"].str[-6] == "1"]
true_gist

Unnamed: 0_level_0,fb_username,cosine,fb_file_name
x_userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50cent,50 Cent,0.797390,50cent_facebook_1.json
adamlevine,Adam Levine,0.712481,adamlevine_facebook_1.json
adele,Adele,0.891185,Adele_facebook_1.json
akshaykumar,Akshay Kumar,0.684917,akshaykumar_facebook_1.json
aliaa08,Alia Bhatt,0.637424,aliaa08_facebook_1.json
...,...,...,...
thefarahkhan,Farah Khan,0.720068,TheFarahKhan_facebook_1.json
tomhanks,Tom Hanks,0.778263,tomhanks_facebook_1.json
tripleh,"Paul ""Triple H"" Levesque",0.780488,TripleH_facebook_1.json
xtina,Christina Aguilera,0.771833,xtina_facebook_1.json


### 1.1. Accuracy

In [118]:
print("Accuracy of Position rank by GIST-small-Embedding-v0 model is {0}%".format(round(true_gist.shape[0] / data_finish_pr_gist.shape[0] * 100, 5)))

Accuracy of Position rank by GIST-small-Embedding-v0 model is 26.2963%


## 2. Yake

In [122]:
df_fb_yk = pd.DataFrame(df_fb_em)
df_x_yk = pd.DataFrame(df_x_em)

In [123]:
df_fb_yk

Unnamed: 0,username,user_id,text,file_name_1,file_name_2
0,Forever 5secondsofsummer,587701401348302,what whos excit amnesia videolukesmyba,5SOS,5SOS_facebook_4.json
1,Forever 5secondsofsummer,587701401348302,hello im new admin ill wrote imagin peopl want...,5SOS,5SOS_facebook_4.json
2,Forever 5secondsofsummer,587701401348302,admin contest question must answer want becom ...,5SOS,5SOS_facebook_4.json
3,Forever 5secondsofsummer,587701401348302,,5SOS,5SOS_facebook_4.json
4,Ronaldinho: A Legend To Be Remembered,258244370945844,run wing fifa think ronaldo,10Ronaldinho,10Ronaldinho_facebook_5.json
...,...,...,...,...,...
716645,YENGSTERS,100044199167866,want win exclus item prelov yeng constantino s...,YengPLUGGEDin,YengPLUGGEDin_facebook_2.json
716646,YENGSTERS,100044199167866,tonight yeng constantinoyoutubecomgood time g...,YengPLUGGEDin,YengPLUGGEDin_facebook_2.json
716647,YENGSTERS,100044199167866,new vlog see yeng constantinoyoutubecomgumawa...,YengPLUGGEDin,YengPLUGGEDin_facebook_2.json
716648,YENGSTERS,100044199167866,thank teacher dan yeng constantinodanvib cebu...,YengPLUGGEDin,YengPLUGGEDin_facebook_2.json


In [124]:
df_x_yk

Unnamed: 0,username,user_id,text
0,Bill Gates,billgates,order solv world sanit crisi need smarter toil...
1,Bill Gates,billgates,collabor open access data made seem imposs hea...
2,Bill Gates,billgates,decreas matern mortal lower rate vaccinepreven...
3,Bill Gates,billgates,africa alway heart foundat mission today annou...
4,Bill Gates,billgates,amrefworldwid great thing health within kenya ...
...,...,...,...
265188,arjun rampal,rampalarjun,thank much glad u enjoy
265189,arjun rampal,rampalarjun,ty khaleejtim one didnt run daddi dubai
265190,arjun rampal,rampalarjun,yay u guy best
265191,arjun rampal,rampalarjun,thank pragya darl


In [125]:
df_fb_yk = df_fb_yk.groupby(['username', "file_name_2"])['text'].agg(lambda x: ' '.join(x)).reset_index()
df_x_yk = df_x_yk.groupby(['user_id','username'])['text'].agg(lambda x: ' '.join(x)).reset_index()

In [126]:
data_yk_x = []
for i in range(0, len(df_x_yk)):
    extractor = pke.unsupervised.YAKE()
    stoplist = stopwords.get('english')
    extractor.load_document(input = str(df_x_yk['text'][i]),
                            language = 'en',
                            stoplist = stoplist,
                            normalization = None)

    extractor.candidate_selection(n = 3)
    window = 2
    use_stems = False
    ex = extractor.candidate_weighting(window = window,
                              use_stems = use_stems)
    threshold = 0.8
    keyphrases = extractor.get_n_best(n=10, threshold=threshold)
    
    x_keyphrase=[]
    x_score=[]
    for (keyphrase, score) in extractor.get_n_best(n=1, stemming=True):
        x_keyphrase.append(keyphrase)
        x_score.append(score)
        data_yk_x.append({
                    'x_username': df_x_yk['username'][i],
                    'x_userid': df_x_yk['user_id'][i],
                    'x_text': df_x_yk['text'][i],
                    'x_keyphrase': x_keyphrase,
                    'x_score': x_score
                })

In [127]:
data_yk_fb = []
for i in range(0, len(df_fb_yk)):
    extractor = pke.unsupervised.YAKE()
    stoplist = stopwords.get('english')
    extractor.load_document(input = str(df_fb_yk['text'][i]),
                            language = 'en',
                            stoplist = stoplist,
                            normalization = None)

    extractor.candidate_selection(n = 3)
    window = 2
    use_stems = False
    ex = extractor.candidate_weighting(window = window,
                              use_stems = use_stems)
    threshold = 0.8
    keyphrases = extractor.get_n_best(n=10, threshold=threshold)
    
    fb_keyphrase=[]
    fb_score=[]
    for (keyphrase, score) in extractor.get_n_best(n=1, stemming=True):
        fb_keyphrase.append(keyphrase)
        fb_score.append(score)
        data_yk_fb.append({
                    'fb_username': df_fb_yk['username'][i],
                    'fb_text': df_fb_yk['text'][i],
                    'fb_file_name': df_fb_yk['file_name_2'][i],
                    'fb_keyphrase': fb_keyphrase,
                    'fb_score': fb_score
                })

In [128]:
data_yk_fb = pd.DataFrame(data_yk_fb)
data_yk_fb

Unnamed: 0,fb_username,fb_text,fb_file_name,fb_keyphrase,fb_score
0,1 9 9 6,cameron diaz mask popular actor chang s...,LeoDiCaprio_facebook_3.json,[icon actor moviesseptemb],[1.2534113736167603e-06]
1,1 9 9 6,cameron diaz mask popular actor chang ic...,Zendaya_facebook_3.json,[icon actor moviesseptemb],[1.484367470896525e-06]
2,1997.,angelina joli angelina joli leonardo dicapri...,LeoDiCaprio_facebook_2.json,[leonardo dicaprio kate],[9.148561488142332e-07]
3,"44th POTUS, his story made history.",queen elizabeth alexandra mari ii majesti made...,MichelleObama_facebook_5.json,[barack obama obama],[8.85730527616699e-07]
4,"44th POTUS, his story made history.",queen elizabeth alexandra mari ii majesti made...,barackobama_facebook_6.json,[barack obama obama],[8.85730527616699e-07]
...,...,...,...,...,...
1806,𝑱𝒖𝒔𝒕𝒊𝒏 𝑩𝒊𝒆𝒃𝒆𝒓 𝑭𝒂𝒏𝒔,“a gentl word kind look goodnatur smile work w...,justinbieber_facebook_2.json,[𝗝𝗢𝗜𝗡 𝗧𝗛𝗜𝗦 𝗚𝗥𝗢𝗨𝗣],[2.0291174702729002e-08]
1807,𝓩𝓮𝓷𝓭𝓪𝔂𝓪 𝓕𝓪𝓷𝓼,wikiofwitchercomdemi moor look ...,Zendaya_facebook_2.json,[wikiofwitchercomelizabeth hurley look],[2.3288434011263026e-05]
1808,𝙄𝙘𝙤𝙣𝙨,bella hadid sprayon dress coperni show alexa...,LeoDiCaprio_facebook_4.json,[jenner kyli jenner],[2.5626265032966025e-06]
1809,𝚅 𝙴 𝚁 𝚂 𝙴 𝚁 𝙴 𝙿 𝚄 𝙱 𝙻 𝙸 𝙲,somewher knowkeanelyricsi walk across empti la...,jason_mraz_facebook_6.json,[channel facebook page],[7.098779724666373e-10]


In [129]:
data_yk_x = pd.DataFrame(data_yk_x)
data_yk_x

Unnamed: 0,x_username,x_userid,x_text,x_keyphrase,x_score
0,Ronaldinho,10ronaldinho,mai um rolê bruxo agora com meus parceiro da p...,[com meu parceiro],[2.5823788524862365e-05]
1,Angel Locsin,143redangel,new vlog drop theangelandneilchannel today s...,[post photo],[3.3980306889807365e-06]
2,50cent,50cent,cyber monday sale storewid sale end tonight s...,[gang bransoncognac lecheminduroi],[5.176929645322569e-08]
3,Neil Patrick Harris,actuallynph,tomorrow anniversari wondercad onedercad chee...,[happi new year],[6.891414361150879e-06]
4,Adam Levine,adamlevine,ojala maluma therudeboyz offici video stream...,[vote vote vote],[4.6054359543475463e-07]
...,...,...,...,...,...
270,Zac Efron,zacefron,let fix keep make beauti movi togeth love guy ...,[thank birthday love],[1.0801062395036388e-05]
271,zayn,zaynmalik,ask collabor celebr jimihendrix birthday unex...,[thank support love],[1.4136759071548722e-05]
272,Zedd,zedd,couragejd proud buddi thankyoux transientlab ...,[yes yes yes],[3.5595428178318704e-06]
273,Zendaya,zendaya,luxurylaw loeweoffici step away music quit...,[thank thank thank],[9.108413177042713e-06]


In [148]:
fb_yk_list_gist = data_yk_fb["fb_keyphrase"].tolist()
x_yk_list_gist = data_yk_x["x_keyphrase"].tolist()

In [149]:
converted_list_yk_fb = [item[0] for item in fb_yk_list_gist]
converted_list_yk_x = [item[0] for item in x_yk_list_gist]

In [150]:
embeddings_fb_yk_gist = model_gist.encode(converted_list_yk_fb, convert_to_tensor=True)
embeddings_x_yk_gist = model_gist.encode(converted_list_yk_x, convert_to_tensor=True)

In [151]:
gist_yk_fb = embeddings_fb_yk_gist.tolist()
gist_yk_x = embeddings_x_yk_gist.tolist()

In [152]:
data_yk_fb["text_vector"] = gist_yk_fb
data_yk_x["text_vector"] = gist_yk_x

In [159]:
data_yk = []
for x in range(len(data_yk_x)):
    for fb in range(len(data_yk_fb)):
        if data_yk_x["x_userid"][x] == data_yk_fb["fb_file_name"][fb].lower()[:-16]:
            cosine = F.cosine_similarity(torch.tensor(data_yk_x["text_vector"][x]), torch.tensor(data_yk_fb["text_vector"][fb]), dim=-1).item()
            data_yk.append({
                "fb_username": data_yk_fb["fb_username"][fb],
                "x_username": data_yk_x["x_username"][x],
                "x_userid": data_yk_x["x_userid"][x],
                "fb_file_name": data_yk_fb["fb_file_name"][fb],
                "cosine": cosine
            })

In [161]:
data_yk = pd.DataFrame(data_yk)

In [162]:
data_finish_yk_gist = data_yk.groupby("x_userid").agg({
    "fb_username" : "first",
    "cosine" : "max",
    "fb_file_name" : "first"
})

In [163]:
true_yk_gist = data_finish_yk_gist[data_finish_pr_gist["fb_file_name"].str[-6] == "1"]
true_yk_gist

Unnamed: 0_level_0,fb_username,cosine,fb_file_name
x_userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50cent,50 Cent,0.912364,50cent_facebook_1.json
adamlevine,Adam Levine,0.699378,adamlevine_facebook_1.json
adele,Adele,1.000000,Adele_facebook_1.json
akshaykumar,Akshay Kumar,0.684729,akshaykumar_facebook_1.json
aliaa08,Alia Bhatt,0.651843,aliaa08_facebook_1.json
...,...,...,...
thefarahkhan,Farah Khan,1.000000,TheFarahKhan_facebook_1.json
tomhanks,Tom Hanks,0.811401,tomhanks_facebook_1.json
tripleh,"Paul ""Triple H"" Levesque",0.731107,TripleH_facebook_1.json
xtina,Christina Aguilera,0.800555,xtina_facebook_1.json


### 2.1. Accuracy

In [164]:
print("Accuracy of Yake by GIST-small-Embedding-v0 model is {0}%".format(round(true_yk_gist.shape[0] / data_finish_yk_gist.shape[0] * 100, 5)))

Accuracy of Yake by GIST-small-Embedding-v0 model is 26.2963%
