In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm

In [2]:
def extract_ref(ref:dict):
    info = ref["ref-info"]
    if "ref-title" in info.keys():
        if "ref-publicationyear" in info.keys():
            year = info["ref-publicationyear"]["@first"]
        else:
            year = "unknow"
        return year,info["ref-title"]["ref-titletext"]
    return (None,None)

In [83]:
def extract_pub(data:dict):
    abstracts = data["head"]["abstracts"]
    title = data["head"]["citation-title"]
    sourcetitle = data["head"]["source"]["sourcetitle"]
    return title,sourcetitle,abstracts

def extract_affiliation(data:dict):
    def get_affiname(affiliation:list[dict]|dict,col:str):
        if isinstance(affiliation,dict):
            return [affiliation[col] if col in affiliation.keys() else None]
        return [i[col] if col in i.keys() else None for i in affiliation] 
    
    data = data["abstracts-retrieval-response"]["affiliation"]
    return get_affiname(data,"affilname"), get_affiname(data,"affiliation-city"),get_affiname(data,"affiliation-country"),

def extract_json_file(file_name:str):
    list_ref = []
    
    with open(file_name,encoding="utf8") as f:
        try:
            data = json.load(f)
        except:
            print(file_name)
            return None,None
        affilname,affilcity,affilcountry = extract_affiliation(data)
        data = data["abstracts-retrieval-response"]["item"]["bibrecord"]
        title,sourcetitle,abstracts = extract_pub(data)
        new_pub = (title,sourcetitle,abstracts,affilname,affilcity,affilcountry)
        if data["tail"] is not None:
            list_referece = data["tail"]["bibliography"]["reference"]
            if isinstance(list_referece,dict):
                year_ref,title_ref = extract_ref(list_referece)
                if year_ref != None and title_ref != None:
                    list_ref.append((title,sourcetitle,year_ref,title_ref))
            else:
                for i in list_referece:
                    year_ref,title_ref = extract_ref(i)
                    if year_ref != None and title_ref != None:
                        list_ref.append((title,sourcetitle,year_ref,title_ref))
        return new_pub,list_ref

In [84]:
list_ref = []
list_pub = []

for pub_year in os.listdir(r"./Project"):
    for pub_name in tqdm(os.listdir(r"./Project/" + pub_year)):
        file_name = r"./Project/" + pub_year + "/"+ pub_name
        new_pub,list_single_ref = extract_json_file(file_name)
        list_ref.extend(list_single_ref)
        list_pub.append(new_pub)

100%|██████████| 2792/2792 [00:31<00:00, 90.05it/s] 
100%|██████████| 3082/3082 [00:24<00:00, 127.16it/s]
100%|██████████| 3393/3393 [00:10<00:00, 334.02it/s]
100%|██████████| 3815/3815 [00:10<00:00, 351.39it/s]
100%|██████████| 4244/4244 [00:10<00:00, 387.26it/s]
100%|██████████| 2890/2890 [00:06<00:00, 414.24it/s]


In [146]:
df_ref_raw = pd.DataFrame(list_ref,columns=["title","sourcetitle","year","title_ref"])
df_pub_raw = pd.DataFrame(list_pub,columns=["title","sourcetitle","abstracts","affilname","affilcity","affilcountry"])
df_pub_raw["link"] = None

In [147]:
df_pub_raw_explode = df_pub_raw.explode(["affilname","affilcity","affilcountry"])
affiliate_df = df_pub_raw_explode[["affilname","affilcity","affilcountry"]].drop_duplicates().reset_index(drop=True).reset_index(names="id")
df_pub_raw_explode = df_pub_raw_explode.merge(affiliate_df,how="left",on=["affilname","affilcity","affilcountry"])
df_pub_merge_aff = df_pub_raw_explode.groupby("title")["id"].apply(list).to_frame().reset_index()
df_pub_raw = df_pub_raw.merge(df_pub_merge_aff,on="title")[["title","abstracts","id","link"]].rename(columns={"id":"affiliation_id"})
df_pub_raw = df_pub_raw.reset_index(names="id")
df_pub_raw.to_csv("./data/pub.csv",index=False)

In [149]:
lat_lon_ciry = pd.DataFrame(columns=["affilcity","lat","lon"])
affiliate_df.merge(lat_lon_ciry,on="affilcity",how="left").to_csv("./data/affiliation.csv",index=False)

In [87]:
import pandas as pd
df_ref = df_ref_raw
df_pub = df_pub_raw

# Autocomplete

In [88]:
df_pub = df_pub[~df_pub["abstracts"].isna()]
df_pub = df_pub[~df_pub["title"].isna()]
df_pub["len"] = df_pub["title"].str.len()

In [89]:
def auto_complete(query:str):
    df_contain = df_pub[df_pub["title"].str.lower().str.contains(query)]
    df_contain.loc[:,"ind"] = df_contain["title"].str.lower().str.find(query)
    return df_contain.sort_values(["ind","len"]).head(10)

In [90]:
result = auto_complete("machin")
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_contain.loc[:,"ind"] = df_contain["title"].str.lower().str.find(query)


Unnamed: 0,id,title,sourcetitle,abstracts,affiliation_id,len,ind
13960,13960,Machine Learning Based Design of Railway Prest...,Applied Sciences (Switzerland),© 2022 by the authors.The state-of-the-art des...,"[1, 2254, 1957]",70,0
9074,9074,"Machine hermeneutics, postphenomenology, and f...",AI and Society,"© 2020, Springer-Verlag London Ltd., part of S...",[1],74,0
19092,19092,Machine learning and statistical analysis for ...,Bioresource Technology,© 2022 Elsevier LtdTorrefaction is a remarkabl...,"[334, 1, 179, 53, 3505, 6073, 1311]",76,0
16272,16272,Machine Learning Techniques to Detect Failure ...,6th International Conference on Information Te...,© 2022 IEEE.This paper presents machine learni...,[1],77,0
3596,3596,Machine learning to understand the immune-infl...,International Journal of Molecular Sciences,"© 2019 by the authors. Licensee MDPI, Basel, S...","[3020, 549, 1, 2245, 3021, 2534, 3022]",79,0
15165,15165,Machine Reading Comprehension Using Multi-Pass...,ECTI Transactions on Computer and Information ...,"© 2022, ECTI Association. All rights reserved....",[1],84,0
16970,16970,Machine-learning-based automated quantificatio...,PeerJ Computer Science,© Copyright 2022 Phanomchoeng et al.The plaque...,"[1, 1964]",87,0
19857,19857,Machine Learning applications for Data Quality...,Journal of Physics: Conference Series,© Published under licence by IOP Publishing Lt...,"[1, 75]",91,0
23,23,Machine Learning system mimicking student's ch...,Journal of Physics: Conference Series,© Published under licence by IOP Publishing Lt...,[1],96,0
11967,11967,Machine Learning-Driven and Smartphone-Based F...,ACS Omega,© 2021 The Authors. Published by American Chem...,"[1, 531, 522, 7254]",103,0


# Node

In [100]:
df_ref["title_ref"] = df_ref["title_ref"].str.lower()
df_count = df_ref.groupby("title_ref").count()
df_ref_new = df_ref[df_ref["title_ref"].isin(df_count[df_count["title"] > 1].index)]
df_ref_new = df_ref_new.drop(columns=["sourcetitle"])

In [101]:
df_merge = df_ref_new.merge(df_ref_new[["title_ref","title"]],on=["title_ref"],how="left",suffixes=('','_2'))
df_merge = df_merge[df_merge["title"] != df_merge["title_2"]]
df_merge_group = df_merge.groupby("title").count().sort_values("title_ref",ascending=False)

In [102]:
group_fre = df_merge.groupby(["title","title_2"]).count().sort_values("title_ref",ascending=False).reset_index()
group_fre.columns = ["title","title_2","count","count_2"]
group_fre = group_fre.drop(columns=["count_2"])

In [103]:
df_dropdup = df_merge.drop_duplicates(subset=["title","title_2"])
df_dropdup = df_dropdup.merge(group_fre,on=["title","title_2"],how="left")

In [104]:
df_dropdup

Unnamed: 0,title,year,title_ref,title_2,count
0,Public health and international epidemiology f...,2016,"predicting the future - big data, machine lear...",AI-Assisted Diagnosis of Dyssynergic Defecatio...,1.0
1,Public health and international epidemiology f...,2011,global cancer statistics,Ethnic variation of colonic polyps,1.0
2,Public health and international epidemiology f...,2011,global cancer statistics,A phase II study of the efficacy and safety of...,1.0
3,Public health and international epidemiology f...,2011,global cancer statistics,Bimodal chromoendoscopy with confocal laser en...,1.0
4,Public health and international epidemiology f...,2011,global cancer statistics,Awareness level about breast cancer risk facto...,1.0
...,...,...,...,...,...
754040,Effects of remittances on household poverty an...,1979,sample selection bias as a specification error,Audit Partner Industry Specialization and the ...,1.0
754041,Effects of remittances on household poverty an...,1979,sample selection bias as a specification error,Pressure to perform: female CEOs and firm comm...,1.0
754042,Effects of remittances on household poverty an...,1979,sample selection bias as a specification error,Does goodwill pressure drive business restruct...,1.0
754043,Effects of remittances on household poverty an...,1993,theories of international migration: a review ...,Migration of older persons seeking care in Tha...,1.0


# Concat REf and PUB

In [130]:
df_ref_merge_id = df_dropdup.merge(df_pub[["title","id"]],on="title",how="left")
df_ref_merge_id = df_ref_merge_id[df_ref_merge_id.id.notna()]
df_ref_merge_id = df_ref_merge_id.merge(df_pub[["title","id"]],left_on="title_2",right_on="title",how="inner",suffixes=('','_3'))
df_ref_merge_id = df_ref_merge_id[["id","id_3","count","year"]]
df_ref_merge_id = df_ref_merge_id.astype({"id":"int","id_3":"int","count":"int"})
df_ref_merge_id.columns = ["source","target","weight","year"]

In [131]:
df_ref_merge_id.to_csv("ref.csv",index=False)

# Emb

In [109]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    
    # Remove text between parentheses
    text = re.sub(r'\([^)]*\)', ' ', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    

    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

In [110]:
df_pub["abstracts_preprocess"] = df_pub["abstracts"].apply(clean_text)

In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_pub["abstracts_preprocess"].to_list())

In [112]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
def create_similarity_matrix(list_abstracts:list[str]):
    # Create TF-IDF representation
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(list_abstracts)
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return similarity_matrix

def recommend_publications(pub_id:int, similarity_matrix, page_size:int=10, page:int=1):
    # Get similarity scores
    similarity_scores = similarity_matrix[pub_id]
    
    # Get indices of publications similar to the target publication
    similar_pub_indices = similarity_scores.argsort()[::-1]
    similar_pub_indices = similar_pub_indices[similar_pub_indices != pub_id]
    similar_pub_indices = similar_pub_indices[(page-1)*page_size:page*page_size]
    return similar_pub_indices

In [113]:
similiar_matrix = create_similarity_matrix(df_pub["abstracts_preprocess"].to_list())

In [114]:
import numpy as np
with open('similiar_metrix.npy', 'wb') as f:
    np.save(f, similiar_matrix)

# TATA preprocess

In [None]:
dftp = pd.read_csv(r"C:\codepluem\CU\datasci_proj\webscraping\research_csv\06D05M2024Y\paper_info.csv")
dftp.drop(columns=["Unnamed: 0"],inplace=True)

In [None]:
import json
json.loads(str(dftp["affiliation"].iloc[0].replace("\'","\"")))

[{'@_fa': 'true',
  'affilname': 'European Investment Bank',
  'affiliation-city': 'Luxembourg',
  'affiliation-country': 'Luxembourg'}]

In [None]:
dftp["affiliation_decode"] = dftp["affiliation"].apply(decode_affiliation)

[{"@_fa": "true", "affilname": "Translators Association of China (TAC)", "affiliation-city': None, 'affiliation-country": "China"}]
'affiliation-city'
[{"@_fa": "true", "affilname": "National Research Institute", "affiliation-city': None, 'affiliation-country': None}]
Unterminated string starting at: line 1 column 63 (char 62)
[{"@_fa": "true", "affilname': "Université d'Orléans", 'affiliation-city": "Orleans", "affiliation-country": "France"}]
Expecting ':' delimiter: line 1 column 33 (char 32)
[{"@_fa": "true", "affilname': "Xi'an Polytechnic University", 'affiliation-city': "Xi'an", 'affiliation-country": "China"}]
Expecting ':' delimiter: line 1 column 33 (char 32)
[{"@_fa": "true", "affilname': "Università Ca' Foscari Venezia", 'affiliation-city": "Venice", "affiliation-country": "Italy"}]
Expecting ':' delimiter: line 1 column 33 (char 32)
[{"@_fa": "true", "affilname': "Vanderbilt University's Peabody College", 'affiliation-city": "Nashville", "affiliation-country": "United Stat