In [32]:
import json
import pickle
import pandas as pd
import unicodedata
pd.set_option('display.max_colwidth', None)
import numpy as np
import random
from sentence_transformers import SentenceTransformer

In [33]:
def name_to_keep_ind(groups):
    """Return 1 if no entry of ``groups`` is an excluded script, otherwise 0.

    ``groups`` is an iterable of script labels such as those produced by
    ``group_non_latin_characters`` (the first word of a character's Unicode
    name, e.g. ``'CJK'`` or ``'CYRILLIC'``).
    """
    excluded_scripts = [
        'HIRAGANA', 'CJK', 'KATAKANA', 'ARABIC', 'HANGUL', 'THAI',
        'DEVANAGARI', 'BENGALI', 'THAANA', 'GUJARATI', 'CYRILLIC',
    ]

    # A single excluded script is enough to reject the name.
    for script in groups:
        if script in excluded_scripts:
            return 0
    return 1
    
def group_non_latin_characters(text):
    """Summarise which Unicode scripts occur in ``text``.

    Periods and spaces are removed before inspection. For every remaining
    character the first word of its Unicode name is used as its script label
    (e.g. ``'LATIN'``, ``'CJK'``, ``'DIGIT'``).

    Args:
        text: The string to inspect.

    Returns:
        A tuple ``(groups, latin_count)`` where ``groups`` is the list of
        distinct non-``'LATIN'`` labels in order of first appearance (with
        ``'UNK'`` standing for characters that have no Unicode name) and
        ``latin_count`` is the number of characters labelled ``'LATIN'``.
    """
    groups = []
    latin_count = 0
    for char in text.replace(".", "").replace(" ", ""):
        # unicodedata.name() raises ValueError for unnamed characters (for
        # example control characters); supplying a default handles exactly
        # that case without a blanket ``except`` that would also hide
        # unrelated errors or a KeyboardInterrupt.
        char_name = unicodedata.name(char, None)
        script = char_name.split(" ")[0] if char_name else "UNK"
        if script == 'LATIN':
            latin_count += 1
        elif script not in groups:
            groups.append(script)
    return groups, latin_count

def check_for_non_latin_characters(text):
    """Return 1 if ``text`` passes the script check, otherwise 0.

    ``text`` is converted with ``str()`` first. It passes when
    ``name_to_keep_ind`` accepts its script groups, or, failing that, when it
    still contains more than 20 characters labelled LATIN.
    """
    scripts, latin_count = group_non_latin_characters(str(text))
    scripts_accepted = name_to_keep_ind(scripts) == 1
    return 1 if (scripts_accepted or latin_count > 20) else 0

In [34]:
def get_journal_emb(journal_name, emb):
    """Return ``emb`` when the journal name passes the script check.

    Otherwise a 384-element float32 NumPy array of zeros is returned in its
    place.
    """
    name_passes = check_for_non_latin_characters(journal_name) == 1
    return emb if name_passes else np.zeros(384, dtype=np.float32)

In [4]:
def save_pickle(dictionary, file_path):
    """Pickle ``dictionary`` and write it to ``file_path``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(file_path, 'wb') as out_file:
        pickle.Pickler(out_file).dump(dictionary)

In [46]:
# Sentence-embedding model; the identifier is kept in a constant so it is
# easy to spot and change.
EMB_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
emb_model = SentenceTransformer(EMB_MODEL_NAME)

In [16]:
# Load the journal table, keep only the id and display-name columns, and drop
# rows where either is missing.
# NOTE(review): the path looks like a placeholder — point it at the real CSV.
journal_data = (
    pd.read_csv("{data_which_contains_all_openalex-journal_ids_and_name}")
    [['journal_id', 'display_name']]
    .dropna()
)
journal_data.shape

  journal_data = pd.read_csv("journal_202312261335.csv")[['journal_id','display_name']].dropna()


(250179, 2)

#### Preprocessing the journal names

In [29]:
# Trim surrounding whitespace from every journal name.
stripped_names = journal_data['display_name'].apply(lambda name: name.strip())
journal_data['display_name'] = stripped_names

# Drop entries whose name contains 'eBooks'.
is_ebook = journal_data['display_name'].str.contains('eBooks')
journal_data = journal_data.loc[~is_ebook].copy()

# Flag each name with check_for_non_latin_characters (1 = passes the script
# check) and keep only the rows that pass.
journal_data['non_latin'] = journal_data['display_name'].apply(check_for_non_latin_characters)
passes_script_check = journal_data['non_latin'] == 1
journal_data = journal_data.loc[passes_script_check].copy()

##### Using all-MiniLM-L6-v2 to create an embedding for each journal

In [49]:
# Encode all remaining journal names in a single call to the model.
journal_names = journal_data['display_name'].tolist()
journal_embs = emb_model.encode(journal_names)

In [50]:
# Store the encoder output in an 'emb' column after converting it with
# .tolist().
# NOTE(review): presumably one embedding per row, in the same order as the
# names passed to encode() — confirm.
journal_data['emb'] = journal_embs.tolist()

In [None]:
# Re-derive 'emb' row by row through get_journal_emb, which returns the
# existing embedding when the name passes check_for_non_latin_characters and
# a 384-element zero vector otherwise.
# NOTE(review): rows failing that same check were already removed when
# filtering on 'non_latin' == 1, so this step appears to leave every
# embedding unchanged — confirm whether it is still needed.
journal_data['emb'] = journal_data.apply(lambda x: get_journal_emb(x.display_name, x.emb), axis=1)

In [12]:
# Build a journal_id -> embedding lookup; if a journal_id occurs more than
# once, the value from the last occurrence is the one kept.
journal_ids = journal_data['journal_id'].tolist()
embeddings = journal_data['emb'].tolist()
journal_embs = dict(zip(journal_ids, embeddings))

In [13]:
# Write the journal_embs dictionary to ./journal_embs.pkl.
# save_pickle returns None, so there is nothing to capture; binding the result
# to `_` would only shadow IPython's automatic last-output variable.
save_pickle(journal_embs, './journal_embs.pkl')