In [0]:
import pickle
import boto3
import re
import json
import random
import pandas as pd
import unicodedata
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, StringType, FloatType, ArrayType, DoubleType, StructType, StructField,LongType

In [0]:
# Root locations of the parquet tables read by this notebook.
# NOTE(review): both values look like unfilled template placeholders (plain
# strings, not f-strings) -- replace them with real paths before running.
# Table names are appended directly to these strings elsewhere in the notebook
# (e.g. f"{base_save_path}static_works"), so each path must end with its
# path separator.
base_save_path = "{save_path_for_openalex_tables}"
iteration_save_path = "{save_path_for_most_data}"

### Getting all data

In [0]:
# Topic label table stored under the iteration path; the columns displayed
# later in the notebook are micro_cluster_id, short_label, long_label,
# keywords, summary and wikipedia_url.
labels_location = f'{iteration_save_path}topic_labels_data_from_cwts_new'
classification_labels = spark.read.parquet(labels_location)

# cache() only marks the frame for caching; the count() action that follows
# triggers evaluation and shows the number of rows as the cell output.
classification_labels.cache()
classification_labels.count()

4521

In [0]:
# Case-sensitive substring match on the long label; show() prints up to 20 rows.
mentions_machine_learning = F.col('long_label').contains('Machine Learning')
classification_labels.where(mentions_machine_learning).show()

+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|micro_cluster_id|         short_label|          long_label|            keywords|             summary|       wikipedia_url|
+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            1490|Hydrological Mode...|Hydrological Mode...|Machine Learning;...|This cluster of p...|https://en.wikipe...|
|            1276|   Solar Forecasting|Machine Learning ...|Solar Radiation; ...|This cluster of p...|https://en.wikipe...|
|            1612|    Machine Learning|Optimization Meth...|Stochastic Gradie...|This cluster of p...|https://en.wikipe...|
|            1975| Genetic Programming|Application of Ge...|Genetic Programmi...|This cluster of p...|https://en.wikipe...|
|            1598|Internet Traffic ...|Machine Learning ...|Machine Learning;...|This cluster of p...|https://en.wikipe...|
|       

In [0]:
# Paper -> cluster assignments as stored on disk.
raw_topic_assignments = spark.read.parquet(f'{iteration_save_path}topics_data_from_cwts_new')

# Keep only the id columns, renaming work_id to paper_id and casting each id
# to an integral type.
typed_topic_assignments = raw_topic_assignments.select(
    F.col('work_id').cast(LongType()).alias('paper_id'),
    F.col('macro_cluster_id').cast(IntegerType()),
    F.col('meso_cluster_id').cast(IntegerType()),
    F.col('micro_cluster_id').cast(IntegerType()),
)

# A row is usable only if all four ids are non-null after the cast.
has_every_id = (
    F.col('paper_id').isNotNull()
    & F.col('macro_cluster_id').isNotNull()
    & F.col('meso_cluster_id').isNotNull()
    & F.col('micro_cluster_id').isNotNull()
)

# Attach the label metadata; the inner join drops assignments whose
# micro_cluster_id has no entry in classification_labels.
new_topic_labels = typed_topic_assignments \
    .where(has_every_id) \
    .join(classification_labels, on='micro_cluster_id', how='inner')

# Mark for caching, then run count() to materialize and display the row count.
new_topic_labels.cache()
new_topic_labels.count()

70674439

In [0]:
# Number of distinct clusters at each level, printed from finest (micro) to
# coarsest (macro).
for cluster_level in ('micro_cluster_id', 'meso_cluster_id', 'macro_cluster_id'):
    print(new_topic_labels.select(cluster_level).distinct().count())

4521
917
20


In [0]:
# Largest paper_id among the labeled papers.
largest_paper_id = F.max('paper_id')
new_topic_labels.select(largest_paper_id).show()

+-------------+
|max(paper_id)|
+-------------+
|   4388814839|
+-------------+



In [0]:
# Works table from the base path, reduced to a single row per paper_id.
works_location = f"{base_save_path}static_works"
works = spark.read.parquet(works_location).drop_duplicates(['paper_id'])

# Mark for caching, then run count() to materialize and display the row count.
works.cache()
works.count()

245979831

In [0]:
# Case-insensitive search: lower-case the long label before matching 'social'.
long_label_lowercase = F.lower(F.col('long_label'))
classification_labels.where(long_label_lowercase.contains('social')).show(n=20)

+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|micro_cluster_id|         short_label|          long_label|            keywords|             summary|       wikipedia_url|
+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             314|Intergroup Relations|Intergroup Relati...|Intergroup Contac...|This cluster of p...|https://en.wikipe...|
|            4198|          Philosophy|Philosophy and So...|Philosophy; Socia...|This cluster of p...|https://en.wikipe...|
|            1795|  Humor and Laughter|Psychological and...|Humor Styles Ques...|This cluster of p...|https://en.wikipe...|
|            3162|  African Governance|Governance and So...|Africa; governanc...|This cluster of p...|https://en.wikipe...|
|            3425|     Rural Education|Rural Education a...|Rural Education; ...|This cluster of p...|https://en.wikipe...|
|       

### Looking at samples from either a specific, randomly chosen topic or from all labeled data

In [0]:
# Upper bound of the id range to draw from; equals the number of rows in the
# topic label table.
# NOTE(review): assumes micro_cluster_id values run contiguously from 1 to this
# number -- confirm against the label table.
NUM_MICRO_CLUSTERS = 4521

# Draw one candidate micro-cluster id uniformly from 1..NUM_MICRO_CLUSTERS
# (both ends inclusive). randint avoids building a throwaway list of every id
# just to pick a single element. No seed is set, so each run prints a new id.
random_num = random.randint(1, NUM_MICRO_CLUSTERS)
print(random_num)

3562


In [0]:
# Inputs to the join, trimmed to the columns needed for inspection.
paper_titles = works.select('paper_id', 'original_title')
paper_to_cluster = new_topic_labels.select('paper_id', 'micro_cluster_id')
cluster_descriptions = classification_labels.select('micro_cluster_id', 'short_label', 'long_label', 'keywords')

# Title + labels for every labeled paper, then keep a single (arbitrary) paper
# per micro cluster.
one_paper_per_cluster = paper_titles \
    .join(paper_to_cluster, on='paper_id', how='inner') \
    .join(cluster_descriptions, on='micro_cluster_id', how='inner') \
    .drop_duplicates(['micro_cluster_id'])

# Pull roughly 1% of those rows into pandas. The sample is unseeded, so the
# rows (and the row count) vary between runs.
explore_clusters = one_paper_per_cluster.sample(fraction=0.01).toPandas()

explore_clusters.shape

(44, 6)

In [0]:
# Micro cluster to inspect. This id is hard-coded; the `random_num` value drawn
# earlier in the notebook is not used here.
target_cluster_id = 4126

# Inputs to the join, trimmed to the columns needed for inspection.
titles_by_paper = works.select('paper_id', 'original_title')
clusters_by_paper = new_topic_labels.select('paper_id', 'micro_cluster_id')
labels_by_cluster = classification_labels.select('micro_cluster_id', 'short_label', 'long_label', 'keywords')

# Title + labels for the papers assigned to the chosen cluster.
papers_in_target_cluster = titles_by_paper \
    .join(clusters_by_paper, on='paper_id', how='inner') \
    .join(labels_by_cluster, on='micro_cluster_id', how='inner') \
    .where(F.col('micro_cluster_id') == target_cluster_id)

# Pull roughly 10% of those papers into pandas (unseeded, so it varies per run).
# This rebinds `explore_clusters`, replacing any frame previously stored under
# that name.
explore_clusters = papers_in_target_cluster.sample(fraction=0.1).toPandas()

explore_clusters.shape

(135, 6)

In [0]:
# Display 20 randomly chosen rows of whichever `explore_clusters` frame was
# built most recently (pandas raises if the frame has fewer than 20 rows).
explore_clusters.sample(n=20)

Unnamed: 0,micro_cluster_id,paper_id,original_title,short_label,long_label,keywords
26,97,64757461,Ba 3 B P 3 O 12 : Eu 2 + —A potential scintillation material,Upconversion Nanoparticles,Advances in Upconversion Nanoparticles Research,Upconversion Nanoparticles; Luminescent Materials; Nanocrystal Synthesis; Biological Imaging; Theranostics; Solar Cell Efficiency; Temperature Sensing; Rare Earth Ions; Photon Upconversion; Solid-State Lighting
33,2594,49993750,Virtual Organizations in Manufacturing: Trends and challenges,Collaborative Networks,Collaborative Networks in Manufacturing and Organizations,Collaborative Networks; Agile Manufacturing; Virtual Enterprise; Interpretive Structural Model; Workforce Agility; Organizational Agility; Total Interpretive Structural Modeling; Enterprise Collaboration; Strategic Framework; Partner Selection
41,4265,2967074392,The effects of varying dilution levels of wastewater on the cultivation of Spirulina sp.,Social Activity,Social Activity and Health Research,Social Activity; Health Research; Youth; Cancer; Ethnography; Physical Culture; Digital Learning; Socioeconomic Factors; Redox Homeostasis; Socio-Psychological Adaptation
8,2038,1981379734,Self-concentration effects in preparative SEC of mineral electrolytes using nanoporous neutral polymeric sorbents,Porous Organic Frameworks,Porous Crystalline Organic Frameworks for Energy and Separation Applications,Porous; Crystalline; Organic Frameworks; Covalent; Hydrogen Storage; Methane Storage; Carbon Dioxide Capture; Catalysis; Membrane Separations; Chemical Stability
28,1393,39054441,IDENTIFICATION METHODS | Immunoassay,Paper-based Diagnostics,Advancements in Paper-Based Diagnostic Devices,Paper-Based Microfluidics; Loop-Mediated Isothermal Amplification; Point-of-Care Diagnostics; Biosensors; Microfluidic Devices; Low-Cost; Portable Bioassays; Nanoparticle-Based Sensors; Infectious Disease Detection; Lab-on-a-Chip
31,801,6757332,Frequency Effects in Pol-InSAR Forest Height Estimation,SAR Interferometry,Synthetic Aperture Radar Interferometry,SAR Interferometry; Surface Deformation Monitoring; Persistent Scatterers; Digital Elevation Models; InSAR Technique; Land Subsidence; Polarimetric SAR; Forest Biomass Estimation; Urban Development Monitoring; Groundwater Extraction
38,14,66458591,Environmentally-Induced Oxidative Stress and Its Signaling,Abiotic Stress,Molecular Responses to Abiotic Stress in Plants,Reactive Oxygen Species; Antioxidants; Abiotic Stress; Signal Transduction; Oxidative Stress; Salt Tolerance; Drought Resistance; Plant Responses; Gene Expression; Stress Signaling
34,2233,887195630,Evaluation of simplifications od 2D models of soil-steel shell bridges,Seismic Tunnel Analysis,Seismic Design and Analysis of Underground Structures,Seismic; Tunnel; Analysis; Earthquake; Damage; Pipeline; Fault; Soil-Structure Interaction; Centrifuge Modeling; Liquefaction
23,1000,79768915,Infektionspräventions-Check-in und Infektionspräventions-Check-out zur Prävention nosokomialer Infektionen,Infection Control,Infection Control in Healthcare Settings,Infection Control; Hand Hygiene; Healthcare-Associated Infections; Environmental Contamination; Hospital Surfaces; Nosocomial Pathogens; Prevention Strategies; Healthcare Workers; Guidelines; Patient Safety
25,3714,184345243,Las cuevas con ocupación romana en el noroeste murciano: definición e interpretación,Rural Landscapes,Rural Landscapes in Medieval Iberia,Rural Landscapes; Medieval Architecture; Iberian Peninsula; Early Medieval Churches; Archaeology of Agriculture; Social Organization; Historical Landscape Formation; Hydraulic Systems; Peasant Villages; Commons and Environmental Regulation
