In [40]:
import pandas as pd
import random 
import matplotlib.pyplot as plt
from tqdm import tqdm

### Read the CSV file

## Load the file SUBRELOBJ.csv from https://zenodo.org/records/10022727

In [2]:
SUBRELOBJ = pd.read_csv('SUBRELOBJ.csv')

Analyzing the DataFrame

In [3]:
print("Number of rows: {}".format(SUBRELOBJ.shape))
print("Columns : {}".format(SUBRELOBJ.columns.tolist()))

Number of rows: (5411478, 4)
Columns : ['Subject', 'Object', 'Rel', 'Count']


Total unique entities

In [4]:
unique_entities = list(set(SUBRELOBJ['Subject'].tolist() + SUBRELOBJ['Object'].tolist()))
print("Number of unique entities : {}".format(len(unique_entities)))

Number of unique entities : 70002


Let's see a couple of random samples

In [5]:
SUBRELOBJ.sample(5)

Unnamed: 0,Subject,Object,Rel,Count
4567549,Sr2+,HRTEM,CHM-CMT,107
1693259,Electrostatic Interaction,Amino Acid,PRO-CHM,111
1234587,Current Density,D2O,PRO-CHM,35
1607970,Electrochemical Impedance,Discharge Charge Profile,PRO-PRO,30
2409907,Hydrogen Electrode,Energy Density,APL-PRO,54


### Plotting the distribution of relationships

In [6]:
rel_count = SUBRELOBJ.groupby('Rel').size().reset_index(name = 'count').sort_values('count', ascending = False)
rel_count.head(25)

Unnamed: 0,Rel,count
32,PRO-PRO,513780
8,CHM-CHM,502596
29,PRO-CHM,338940
11,CHM-PRO,338940
25,DSC-PRO,223370
31,PRO-DSC,223370
18,CMT-PRO,211489
30,PRO-CMT,211489
9,CHM-CMT,209937
15,CMT-CHM,209937


In [35]:
rel_count.tail(5)

Unnamed: 0,Rel,count
48,SPL-SPL,5002
41,SMT-SPL,4271
47,SPL-SMT,4271
6,APL-SPL,3005
42,SPL-APL,3005


### Querying CSV File

In [15]:
def query_subrelobj(query, rel, k=10):
    
    def get_relevant_entities(query):
        entities = []
        for ent in unique_entities:
            if query.lower() in ent.lower():
                entities.append(ent)
        return entities
    
    def query_df(entities, rel):
        filtered_df = SUBRELOBJ[(SUBRELOBJ['Subject'].isin(entities)) & (SUBRELOBJ['Rel'] == rel)]
        grouped_df = filtered_df.groupby('Object')['Count'].sum().reset_index()
        sorted_df = grouped_df.sort_values(by='Count', ascending=False)
        objects_and_counts = sorted_df.set_index('Object')['Count']
        num = min(k, len(objects_and_counts))
        return objects_and_counts[:num]
    
    relevant_entities = get_relevant_entities(query)
    
    if len(relevant_entities)<1:
        print("No relevant query results")
        return None
    
    results = query_df(relevant_entities, rel)
    return results

#### Finding all applications of a TiO2

In [16]:
results = query_subrelobj('TiO2', 'CHM-APL')
results

Object
Catalyst                      14533
Electrode                      8492
Photocatalyst                  7286
Photocatalysts                 7013
Electrodes                     6626
Coatings                       5558
Solar Cell                     4795
Electrolyte                    4328
Photocatalytic Degradation     4257
Photoanode                     3935
Name: Count, dtype: int64

#### Finding all Symmetry Phase Labels of TiO2

In [17]:
results = query_subrelobj('TiO2', 'CHM-SPL')
results

Object
Anatase        17657
Rutile         10694
Perovskites     3635
Cubic           1086
Hexagonal        997
Brookite         890
Tetragonal       864
T                830
Spinels          807
Garnet           602
Name: Count, dtype: int64

#### Finding all applications of CdTe

In [19]:
results = query_subrelobj('CdTe', 'CHM-APL')
results

Object
Solar Cell              1099
Electrode                334
Cells                    305
Cell                     294
Detector                 262
Thin Film Solar Cell     237
Electrodes               207
Back Contact             197
Sensor                   180
Contacts                 166
Name: Count, dtype: int64

#### Finding all Properties of CdTe

In [20]:
results = query_subrelobj('CdTe', 'CHM-PRO')
results

Object
Semiconductor         758
Efficiency            528
Structure             472
Compositions          419
Band Gap              366
Optical Properties    344
Fluorescence          330
Stability             309
Morphology            301
Bandgap               280
Name: Count, dtype: int64

#### Creating a NetworkX Graph

In [36]:
import networkx as nx
import matplotlib.pyplot as plt

In [42]:
# Create an empty directed graph
G = nx.DiGraph()

In [43]:
# Add edges to the graph
for index, row in tqdm(SUBRELOBJ.iterrows(), total = SUBRELOBJ.shape[0]):
    G.add_edge(row['Subject'], row['Object'], relationship=row['Rel'], weight=row['Count'])

100%|██████████| 5411478/5411478 [03:56<00:00, 22849.17it/s]


In [45]:
# Calculate average degree
average_degree = sum(dict(G.degree()).values()) / float(len(G))
print(f"Average Degree: {average_degree}")

Average Degree: 140.3048484329019


In [47]:
# Corrected calculation of weighted average degree
weighted_degree = sum(attr['weight'] for _, _, attr in G.edges(data=True)) / float(len(G))

print(f"Weighted Average Degree: {weighted_degree}")

Weighted Average Degree: 8997.688151767092
