In [1]:
import requests
import json
import os
import gzip
import shutil
import pandas as pd
import re
import numpy as np
import multiprocessing as mp
import time
import concurrent.futures
import nltk
import spacy
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from spellchecker import SpellChecker
from num2words import num2words
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from langdetect import detect
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from transformers import BertTokenizer, BertModel
os.chdir('/mnt/scratch/pandavis/Semantic_Scholar_Data')

2023-10-11 14:16:50.676010: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-11 14:16:52.655440: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
paper_abstracts=json.load(open('/mnt/ufs18/home-230/pandavis/Capstone_CSE890/cse_abstracts_without_representations','r'))

In [14]:
len(paper_abstracts)

37045

In [15]:
english_abstracts_list=[abst_list['abstract'] for abst_list in paper_abstracts]
english_corpus_ids=[abst_list['corpusid'] for abst_list in paper_abstracts]

In [16]:
df_abstracts = pd.DataFrame({
    'corpusid': english_corpus_ids,
    'abstract': english_abstracts_list
})

In [17]:
df_abstracts.shape

(37045, 2)

In [18]:
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_abstracts(abstract):
    abstract = re.sub(r'<.*?>', '', abstract)
    abstract = re.sub(r'http\S+|www\S+|https\S+', '', abstract, flags=re.MULTILINE)
    abstract = re.sub(r'[^a-zA-Z\s]', '', abstract)
    abstract = abstract.lower()
    tokens = nltk.word_tokenize(abstract)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    abstract = ' '.join(tokens)
    return abstract

[nltk_data] Downloading package punkt to
[nltk_data]     /mnt/home/pandavis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/home/pandavis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
english_abstracts_list = [preprocess_abstracts(abst) for abst in english_abstracts_list]

In [21]:
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer()
representation_model = KeyBERTInspired()
topic_model = BERTopic(
  embedding_model=embedding_model,          
  umap_model=umap_model,                    
  hdbscan_model=hdbscan_model,              
  vectorizer_model=vectorizer_model,        
  ctfidf_model=ctfidf_model,                
  representation_model=representation_model
)

In [22]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
topics,prob=topic_model.fit_transform(english_abstracts_list)

In [23]:
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,18114,-1_features_analysis_models_technology,"[features, analysis, models, technology, digit...",[abstract mobile phones become new common tool...
1,0,1235,0_machine_device_invention_equipment,"[machine, device, invention, equipment, appara...",[invention provides fuel cell distributed cont...
2,1,807,1_traffic_cars_vehicles_roads,"[traffic, cars, vehicles, roads, drivers, driv...",[invention discloses monocular vision vehicle ...
3,2,679,2_linguistics_languages_linguistic_vocabulary,"[linguistics, languages, linguistic, vocabular...",[objective study identify words level equivale...
4,3,671,3_learners_classroom_educational_learning,"[learners, classroom, educational, learning, e...",[aalborg universitys thirty years experience p...


In [24]:
topic_model.update_topics(english_abstracts_list,n_gram_range=(2,3))

In [25]:
doc_topics_df=topic_model.get_document_info(english_abstracts_list,df_abstracts)
doc_topics_df.head()

Unnamed: 0,corpusid,abstract,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,196018249,FIELD: medicine. SUBSTANCE: group of invention...,field medicine substance group inventions refe...,0,0_present invention_invention relates_inventio...,"[present invention, invention relates, inventi...",[invention provides fuel cell distributed cont...,present invention - invention relates - invent...,0.925711,False
1,124436019,This paper presents core-agent based clusterin...,paper presents coreagent based clustering cbc ...,17,17_wireless sensor_sensor networks_wireless se...,"[wireless sensor, sensor networks, wireless se...",[energy conservation critical design issue wir...,wireless sensor - sensor networks - wireless s...,0.44424,False
2,63412102,The interior management structure means the re...,interior management structure means relationsh...,6,6_supply chain_knowledge management_business p...,"[supply chain, knowledge management, business ...",[modern transportation enterprises working com...,supply chain - knowledge management - business...,1.0,False
3,112848277,The electromagnetic frequency spectrum charact...,electromagnetic frequency spectrum characteris...,-1,-1_results show_paper presents_experimental re...,"[results show, paper presents, experimental re...",[abstract mobile phones become new common tool...,results show - paper presents - experimental r...,0.0,False
4,60034015,Some of the potentially most significant uses ...,potentially significant uses microcomputers tr...,-1,-1_results show_paper presents_experimental re...,"[results show, paper presents, experimental re...",[abstract mobile phones become new common tool...,results show - paper presents - experimental r...,0.0,False


In [26]:
doc_topics_df.shape

(37045, 10)

In [28]:
topic_confident_df=doc_topics_df[doc_topics_df['Probability']==1]

In [29]:
cse_abstract_list = []
for index, row in topic_confident_df.iterrows():
    data = {
        'corpusid': row['corpusid'],
        'abstract': row['abstract'],
        'Representation': row['Representation']
    }
    cse_abstract_list.append(data)

In [30]:
cse_abstract_list[0:10]

[{'corpusid': 63412102,
  'abstract': "The interior management structure means the relationships and combinations of different power organizations in Chinese universities.At present,the phenomena of administration,failure to separate the Party work from the government work,inbalance of scientific power and administrative power are popular in Chinese universities and colleges.In order to reform and improve the interior management structure,this article suggests that it should persist in president's responsibility system under the Party committee leading,deal with the relationship between the scientific power and administrative power correctly,call for ideas of professor academic leaders,educationist administrative governors.We should pay more attention to functions of democratic management and democratic supervision of Teachers Representative Committee System and formulation of university regulations.",
  'Representation': ['supply chain',
   'knowledge management',
   'business process

In [31]:
len(cse_abstract_list)

8338

In [32]:
#predicting instances from financial data
company_data=pd.read_excel('/mnt/scratch/pandavis/Refinitive_Data/Company/2010_Company.xlsx')

In [33]:
company_desc_list = company_data['Investee Company Long Business Description\n(\'|\')'].dropna().tolist()
company_desc_list=[preprocess_abstracts(desc) for desc in company_desc_list]

In [34]:
company_topics,company_prob=topic_model.transform(company_desc_list)

In [35]:
company_with_representation = pd.DataFrame({
    'Description': company_desc_list,
    'Topic': company_topics,
    'Probability': company_prob,
    'Representations':[[topic[0] for topic in topic_model.get_topic(index)] for index in company_topics]
})
company_with_representation=company_with_representation[company_with_representation['Probability']==1]

In [37]:
company_representation_list=[]
for index,row in company_with_representation.iterrows():
    data={
        'Description':row['Description'],
        'Representations':row['Representations']
    }
    company_representation_list.append(data)

In [40]:
company_representation_list

[{'Description': 'abertis infraestructuras sa spainbased company primarily engaged management highway infrastructure companys activities divided two business segments toll roads telecommunications toll roads division focuses construction maintenance operation highways located spain france brazil chile united states canada among others telecommunications division manages operates satellite infrastructure hispasat transmission towers mobile telephony audiovisual broadcasting cellnex telecom company controls numerous subsidiaries abertis autopistas espana sa abertis motorways uk ltd autopistas metropolitanas de puerto rico abertis telecom satelites sa highways infrastructure construction',
  'Representations': ['traffic flow',
   'road network',
   'traffic congestion',
   'travel time',
   'license plate',
   'traffic control',
   'traffic management',
   'autonomous driving',
   'intelligent transportation',
   'traffic information']},
 {'Description': 'healthscope pty ltd australiabase

In [41]:
#json.dump(cse_abstract_list,open('/mnt/ufs18/home-230/pandavis/Capstone_CSE890/cse_abstracts_with_bigram_representation','w'))

In [42]:
#json.dump(company_representation_list,open('/mnt/ufs18/home-230/pandavis/Capstone_CSE890/company_representation_list','w'))

In [63]:
#json.dump(abstract_list,open('/mnt/ufs18/home-230/pandavis/Capstone_CSE890/abstract_with_representation','w'))

In [58]:
#topic_model.save('/mnt/ufs18/home-230/pandavis/Capstone_CSE890/minilm_topic_model.pkl')

In [2]:
#abstract_list=json.load(open('/mnt/ufs18/home-230/pandavis/Capstone_CSE890/abstract_with_representation','r'))

In [53]:
dbpedia_lookup='https://lookup.dbpedia.org/api/search?format=JSON&query='
cse_abstract_list_dbpedia=[]
for abstract in cse_abstract_list[0:5]:
    keywords=abstract['Representation']
    label_cleaned = []
    
    for keyword in keywords:
        search_query = dbpedia_lookup + keyword
        query_result = requests.get(search_query).json()
        if query_result['docs']:
            redirectlabel = query_result['docs'][0]['label']
            cleaned = [label.replace("<B>", "").replace("</B>", "") for label in redirectlabel]
            label_cleaned.append(cleaned)
    abstract['dbpedia_query'] = label_cleaned
    cse_abstract_list_dbpedia.append(abstract)

In [55]:
cse_abstract_list_dbpedia

[{'corpusid': 63412102,
  'abstract': "The interior management structure means the relationships and combinations of different power organizations in Chinese universities.At present,the phenomena of administration,failure to separate the Party work from the government work,inbalance of scientific power and administrative power are popular in Chinese universities and colleges.In order to reform and improve the interior management structure,this article suggests that it should persist in president's responsibility system under the Party committee leading,deal with the relationship between the scientific power and administrative power correctly,call for ideas of professor academic leaders,educationist administrative governors.We should pay more attention to functions of democratic management and democratic supervision of Teachers Representative Committee System and formulation of university regulations.",
  'Representation': ['supply chain',
   'knowledge management',
   'business process

In [57]:
#json.dump(cse_abstract_list_dbpedia,open('/mnt/ufs18/home-230/pandavis/Capstone_CSE890/abstract_with_dbpedia_representation','w'))

In [49]:
query_result['docs'][0]['label']

['<B>IJCAI</B> Computers and Thought <B>Award</B>']