In [1]:
import os
import pandas as pd
import csv
import glob
import concurrent.futures
from tqdm import tqdm

def detect_delimiter(file_path):
    """Guess the field delimiter of a delimited text file.

    The first 1024 characters of ``file_path`` (decoded as UTF-8) are handed
    to ``csv.Sniffer`` and the delimiter of the dialect it infers is returned.
    """
    with open(file_path, mode='r', encoding='utf-8', newline='') as handle:
        sample = handle.read(1024)
    return csv.Sniffer().sniff(sample).delimiter

# Sniff the separator first, then parse the ERIH-PLUS export with it.
erih_plus_path = 'ERIHPLUSapprovedJournals.csv'
delimiter = detect_delimiter(erih_plus_path)
erih_plus_df = pd.read_csv(erih_plus_path, sep=delimiter)


In [2]:
def process_meta_csv(chunk, erih_plus_df):
    """Map OpenCitations Meta rows to ERIH-PLUS journals through their ISSN.

    Parameters
    ----------
    chunk : pandas.DataFrame or path
        Rows of an OpenCitations Meta CSV; the 'id' and 'venue' columns are
        used. Anything that is not a DataFrame is passed to ``pd.read_csv``,
        so a file path works as well. The input frame is not modified.
    erih_plus_df : pandas.DataFrame
        ERIH-PLUS journal list; the 'Journal ID', 'Print ISSN' and
        'Online ISSN' columns are used.

    Returns
    -------
    pandas.DataFrame
        One row per (meta row, journal) match with the columns 'OC_OMID',
        'OC_ISSN', 'EP_ID' and 'EP_ISSN'. Matches on the print ISSN come
        first, followed by matches on the online ISSN. 'EP_ISSN' is the
        journal's print ISSN, or its online ISSN when it has no print ISSN.
    """
    source = chunk if isinstance(chunk, pd.DataFrame) else pd.read_csv(chunk)

    # Build a private two-column frame instead of adding columns to the
    # caller's DataFrame.
    # NOTE(review): only the first 'issn:' token of a venue is extracted, so a
    # venue listing several ISSNs is matched on the first one only.
    meta_data = pd.DataFrame({
        # Extract the identifier (OMID) from the 'id' column
        'id': source['id'].str.extract(r'(meta:[^ ]*)', expand=False),
        'issn': source['venue'].astype(str).str.extract(r'issn:(\d{4}-\d{3}[\dX])', expand=False),
    })

    # pandas joins null keys to each other, so rows without an ISSN must be
    # removed on both sides *before* merging; otherwise every journal lacking
    # an ISSN is paired with every meta row lacking one.
    meta_data = meta_data.dropna(subset=['issn'])
    erih_keys = erih_plus_df[['Journal ID', 'Print ISSN', 'Online ISSN']]

    matches = [
        erih_keys.dropna(subset=[issn_column]).merge(
            meta_data, left_on=issn_column, right_on='issn', how='inner'
        )
        for issn_column in ('Print ISSN', 'Online ISSN')
    ]
    merged_data = pd.concat(matches, ignore_index=True)

    # Keep only the relevant columns for the mapping dataframe
    merged_data = merged_data[['id', 'issn', 'Journal ID', 'Print ISSN', 'Online ISSN']].rename(
        columns={
            'id': 'OC_OMID',
            'issn': 'OC_ISSN',
            'Journal ID': 'EP_ID',
            'Print ISSN': 'EP_Print_ISSN',
            'Online ISSN': 'EP_Online_ISSN',
        }
    )

    # Create the 'EP_ISSN' column: print ISSN, falling back to the online one
    merged_data['EP_ISSN'] = merged_data['EP_Print_ISSN'].combine_first(merged_data['EP_Online_ISSN'])

    # Drop the 'EP_Print_ISSN' and 'EP_Online_ISSN' columns
    merged_data = merged_data.drop(columns=['EP_Print_ISSN', 'EP_Online_ISSN'])

    return merged_data.reset_index(drop=True)

In [7]:

def process_file(input_file, erih_plus_df, chunksize=5 * 10 ** 3):
    """Run ``process_meta_csv`` over one CSV file, chunk by chunk.

    Parameters
    ----------
    input_file : path of the CSV file to read.
    erih_plus_df : ERIH-PLUS journal DataFrame, forwarded to
        ``process_meta_csv`` for every chunk.
    chunksize : number of rows read per chunk (default 5000).

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated with a fresh index. When the
        reader yields no chunk at all, an empty frame with the mapping
        columns is returned instead of letting ``pd.concat`` raise.
    """
    # Read the input_file in chunks and process each chunk
    with pd.read_csv(input_file, chunksize=chunksize) as reader:
        processed_chunks = [process_meta_csv(chunk, erih_plus_df) for chunk in reader]

    if not processed_chunks:
        return pd.DataFrame(columns=['OC_OMID', 'OC_ISSN', 'EP_ID', 'EP_ISSN'])

    # Combine the processed chunks into a single DataFrame
    return pd.concat(processed_chunks, ignore_index=True)

input_directory = "I:\\open-sci\\dump-files\\opencitations-meta\\partial_dump"
files = glob.glob(os.path.join(input_directory, "*.csv"))

# Number of files to process at once
batch_size = 100

all_results = []

# Threads are used instead of processes: worker processes must be able to
# import the function they run, which is not possible for functions defined
# in notebook cells on platforms that spawn workers (e.g. Windows).
with tqdm(total=len(files), desc="Batches") as pbar:
    # One pool serves every batch; re-creating it per batch only adds start-up cost
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Process files in batches
        for i in range(0, len(files), batch_size):
            # Get the current batch of files
            batch_files = files[i:i + batch_size]

            results = executor.map(process_file, batch_files, [erih_plus_df] * len(batch_files))
            # Consuming the iterator waits for the batch and re-raises worker errors
            all_results.extend(results)

            # Update the progress bar for each batch
            pbar.update(len(batch_files))

# Combine the results from all batches into a single DataFrame
if all_results:
    final_df = pd.concat(all_results, ignore_index=True)
else:
    # No CSV file was found: pd.concat would raise on an empty list
    final_df = pd.DataFrame(columns=['OC_OMID', 'OC_ISSN', 'EP_ID', 'EP_ISSN'])


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

## 1. Retrieve the OpenCitations Meta publications and journals that are registered in the ERIH-PLUS index

Starting from the ERIH-PLUS index of approved Social Sciences and Humanities journals (`ERIHPLUSapprovedJournals.csv`, downloaded 27/04/2023), we want to retrieve all the publications included in the OpenCitations Meta database (https://opencitations.net/meta) that belong to one of those journals.

### 1.1 

In order to fulfill this task, we intend to download the data dump and process it in chunks (for example by reading each CSV with pandas and a `chunksize` parameter, by iterating over the files of the folder with the `os` library, or by reading the compressed file directly with the `gzip` library).
Note that the OpenCitations Meta data dump has one row for each entity, which is either a publication or a venue. At the moment we do not need the publication information, so we need to cut the dataset down to the venue information only.

In [1]:
import os
import pandas as pd
import csv

def detect_delimiter(file_path):
    """Return the delimiter ``csv.Sniffer`` infers for ``file_path``.

    Only the first 1024 characters of the file, read as UTF-8, are inspected.
    """
    sniffer = csv.Sniffer()
    with open(file_path, mode='r', encoding='utf-8', newline='') as source:
        head = source.read(1024)
    inferred = sniffer.sniff(head)
    return inferred.delimiter

# Detect the separator of the ERIH-PLUS export, then load it with that separator.
erih_plus_path = 'ERIHPLUSapprovedJournals.csv'
delimiter = detect_delimiter(erih_plus_path)
erih_plus_df = pd.read_csv(erih_plus_path, sep=delimiter)

In [2]:
# Show the first row to check that the file was split into the expected columns
erih_plus_df.head(1)

Unnamed: 0,Journal ID,Print ISSN,Online ISSN,Original Title,International Title,Country of Publication,ERIH PLUS Disciplines,OECD Classifications,[Last Updated]
0,486254,1989-3477,,@tic.revista d'innovació educativa,@tic.revista d'innovació educativa,Spain,Interdisciplinary research in the Social Scien...,Educational Sciences; Other Social Sciences,2015-06-25 13:48:26


In [3]:
def process_meta_csv(file_path, erih_plus_df):
    """Map OpenCitations Meta rows to ERIH-PLUS journals through their ISSN.

    Parameters
    ----------
    file_path : path or pandas.DataFrame
        Path of an OpenCitations Meta CSV file (read with ``pd.read_csv``).
        An already loaded DataFrame (e.g. one chunk of such a file) is
        accepted too and is left unmodified. The 'id' and 'venue' columns
        are used.
    erih_plus_df : pandas.DataFrame
        ERIH-PLUS journal list; the 'Journal ID', 'Print ISSN' and
        'Online ISSN' columns are used.

    Returns
    -------
    pandas.DataFrame
        One row per (meta row, journal) match with the columns 'OC_OMID',
        'OC_ISSN', 'EP_ID' and 'EP_ISSN'. Matches on the print ISSN come
        first, followed by matches on the online ISSN. 'EP_ISSN' is the
        journal's print ISSN, or its online ISSN when it has no print ISSN.
    """
    # Callers in this notebook pass both file paths and DataFrame chunks
    if isinstance(file_path, pd.DataFrame):
        source = file_path
    else:
        source = pd.read_csv(file_path)

    # NOTE(review): only the first 'issn:' token of a venue is extracted, so a
    # venue listing several ISSNs is matched on the first one only.
    meta_data = pd.DataFrame({
        # Extract the identifier (OMID) from the 'id' column
        'id': source['id'].str.extract(r'(meta:[^ ]*)', expand=False),
        'issn': source['venue'].astype(str).str.extract(r'issn:(\d{4}-\d{3}[\dX])', expand=False),
    })

    # pandas joins null keys to each other, so rows without an ISSN must be
    # removed on both sides *before* merging; otherwise every journal lacking
    # an ISSN is paired with every meta row lacking one.
    meta_data = meta_data.dropna(subset=['issn'])
    erih_keys = erih_plus_df[['Journal ID', 'Print ISSN', 'Online ISSN']]

    merged_data_print = erih_keys.dropna(subset=['Print ISSN']).merge(
        meta_data, left_on='Print ISSN', right_on='issn', how='inner'
    )
    merged_data_online = erih_keys.dropna(subset=['Online ISSN']).merge(
        meta_data, left_on='Online ISSN', right_on='issn', how='inner'
    )
    merged_data = pd.concat([merged_data_print, merged_data_online], ignore_index=True)

    # Keep only the relevant columns for the mapping dataframe
    merged_data = merged_data[['id', 'issn', 'Journal ID', 'Print ISSN', 'Online ISSN']].rename(
        columns={
            'id': 'OC_OMID',
            'issn': 'OC_ISSN',
            'Journal ID': 'EP_ID',
            'Print ISSN': 'EP_Print_ISSN',
            'Online ISSN': 'EP_Online_ISSN',
        }
    )

    # Create the 'EP_ISSN' column: print ISSN, falling back to the online one
    merged_data['EP_ISSN'] = merged_data['EP_Print_ISSN'].combine_first(merged_data['EP_Online_ISSN'])

    # Drop the 'EP_Print_ISSN' and 'EP_Online_ISSN' columns
    merged_data = merged_data.drop(columns=['EP_Print_ISSN', 'EP_Online_ISSN'])

    return merged_data.reset_index(drop=True)



In [4]:
import os
import pandas as pd
import glob
import concurrent.futures
from tqdm import tqdm

def process_file(input_file, erih_plus_df):
    """Apply ``process_meta_csv`` to ``input_file`` in chunks of 3000 rows.

    Returns the per-chunk results concatenated into one DataFrame with a
    fresh index.
    """
    rows_per_chunk = 3 * 10 ** 3

    # Stream the file so it is never loaded as a whole
    with pd.read_csv(input_file, chunksize=rows_per_chunk) as reader:
        partial_frames = [process_meta_csv(piece, erih_plus_df) for piece in reader]

    # Stitch the partial results together
    return pd.concat(partial_frames, ignore_index=True)

input_directory = "I:\\open-sci\\dump-files\\opencitations-meta\\partial_dump"
files = glob.glob(os.path.join(input_directory, "*.csv"))

# Number of files to process at once
batch_size = 10

all_results = []

# Threads are used instead of processes: worker processes must be able to
# import the function they run, which is not possible for functions defined
# in notebook cells on platforms that spawn workers (e.g. Windows).
with tqdm(total=len(files), desc="Batches") as pbar:
    # One pool serves every batch; re-creating it per batch only adds start-up cost
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Process files in batches
        for i in range(0, len(files), batch_size):
            # Get the current batch of files
            batch_files = files[i:i + batch_size]

            results = executor.map(process_file, batch_files, [erih_plus_df] * len(batch_files))
            # Consuming the iterator waits for the batch and re-raises worker errors
            all_results.extend(results)

            # Update the progress bar for each batch
            pbar.update(len(batch_files))

# Combine the results from all batches into a single DataFrame
if all_results:
    final_df = pd.concat(all_results, ignore_index=True)
else:
    # No CSV file was found: pd.concat would raise on an empty list
    final_df = pd.DataFrame(columns=['OC_OMID', 'OC_ISSN', 'EP_ID', 'EP_ISSN'])


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [None]:
import os
import pandas as pd
import glob
import concurrent.futures
from tqdm import tqdm

def process_file(input_file, erih_plus_df):
    """Process ``input_file`` with ``process_meta_csv`` 3000 rows at a time
    and return the concatenated per-chunk results."""
    frames = []
    with pd.read_csv(input_file, chunksize=3 * 10 ** 3) as reader:
        # Each chunk is mapped independently; results are merged afterwards
        frames.extend(process_meta_csv(block, erih_plus_df) for block in reader)

    return pd.concat(frames, ignore_index=True)

input_directory = "I:\\open-sci\\dump-files\\opencitations-meta\\partial_dump"
files = glob.glob(os.path.join(input_directory, "*.csv"))

# Number of files to process at once
batch_size = 10

all_results = []

# Threads, not processes: functions defined in notebook cells cannot be
# imported by spawned worker processes (e.g. on Windows).
with tqdm(total=len(files), desc="Batches") as pbar:
    # A single pool is shared by all batches
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for i in range(0, len(files), batch_size):
            batch_files = files[i:i + batch_size]

            results = executor.map(process_file, batch_files, [erih_plus_df] * len(batch_files))
            # Draining the iterator waits for the batch and surfaces worker errors
            all_results.extend(results)

            # Update the progress bar for each batch
            pbar.update(len(batch_files))

if all_results:
    final_df = pd.concat(all_results, ignore_index=True)
else:
    # pd.concat raises on an empty list, i.e. when no CSV file was found
    final_df = pd.DataFrame(columns=['OC_OMID', 'OC_ISSN', 'EP_ID', 'EP_ISSN'])


In [4]:
import os
import pandas as pd
import glob
import concurrent.futures

def process_file(input_file, erih_plus_df):
    """Read ``input_file`` in 3000-row chunks, run ``process_meta_csv`` on
    each chunk and return all results as a single DataFrame."""
    chunk_rows = 3 * 10 ** 3

    with pd.read_csv(input_file, chunksize=chunk_rows) as reader:
        mapped = [process_meta_csv(part, erih_plus_df) for part in reader]

    combined = pd.concat(mapped, ignore_index=True)
    return combined

input_directory = "I:\\open-sci\\dump-files\\opencitations-meta\\partial_dump"
files = glob.glob(os.path.join(input_directory, "*.csv"))

# Number of files to process at once
batch_size = 10

all_results = []

# Threads, not processes: functions defined in notebook cells cannot be
# imported by spawned worker processes (e.g. on Windows).
with concurrent.futures.ThreadPoolExecutor() as executor:
    for i in range(0, len(files), batch_size):
        batch_files = files[i:i + batch_size]

        results = executor.map(process_file, batch_files, [erih_plus_df] * len(batch_files))
        # Draining the iterator waits for the batch and surfaces worker errors
        all_results.extend(results)

if all_results:
    final_df = pd.concat(all_results, ignore_index=True)
else:
    # pd.concat raises on an empty list, i.e. when no CSV file was found
    final_df = pd.DataFrame(columns=['OC_OMID', 'OC_ISSN', 'EP_ID', 'EP_ISSN'])


In [39]:
import os
import concurrent.futures
from tqdm import tqdm
import pandas as pd

def process_meta_csv_wrapper(args):
    """Call ``process_meta_csv`` with the items of ``args`` as positional
    arguments, so that it can be mapped over a single iterable of tuples."""
    positional = tuple(args)
    return process_meta_csv(*positional)

csv_directory = 'I:\\open-sci\\dump-files\\opencitations-meta\\partial_dump'
csv_files = [os.path.join(csv_directory, f) for f in os.listdir(csv_directory) if f.endswith('.csv')]

batch_size = 100
num_batches = len(csv_files) // batch_size + (1 if len(csv_files) % batch_size > 0 else 0)

temp_files = []

try:
    with tqdm(total=len(csv_files)) as pbar:
        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, len(csv_files))
            batch_files = csv_files[start_idx:end_idx]

            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                results = list(executor.map(process_meta_csv_wrapper, [(file, erih_plus_df) for file in batch_files]))

            # Write each batch to a temporary CSV in the working directory
            batch_data = pd.concat(results, ignore_index=True)
            temp_file = f"temp_merged_data_{batch_idx}.csv"
            # Register the file before writing so a failed write is cleaned up too
            temp_files.append(temp_file)
            batch_data.to_csv(temp_file, index=False)

            # Update progress bar for each file in the batch
            pbar.update(len(batch_files))

    # Load temporary files and concatenate them
    if temp_files:
        merged_data = pd.concat([pd.read_csv(temp_file) for temp_file in temp_files], ignore_index=True)
    else:
        # No CSV file in the directory: pd.concat would raise on an empty list
        merged_data = pd.DataFrame(columns=['OC_OMID', 'OC_ISSN', 'EP_ID', 'EP_ISSN'])
finally:
    # Remove temporary files, also when a batch failed half-way
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)


100%|██████████| 5/5 [00:08<00:00,  1.61s/it]


In [None]:
""" import os
import concurrent.futures
from tqdm import tqdm
import pandas as pd

def process_meta_csv_wrapper(args):
    return process_meta_csv(*args)

csv_directory = 'I:\\open-sci\\dump-files\\opencitations-meta\\partial_dump'
csv_files = [os.path.join(csv_directory, f) for f in os.listdir(csv_directory) if f.endswith('.csv')]

batch_size = 100
num_batches = len(csv_files) // batch_size + (1 if len(csv_files) % batch_size > 0 else 0)

temp_files = []

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(csv_files))
    batch_files = csv_files[start_idx:end_idx]

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        results = list(tqdm(executor.map(process_meta_csv_wrapper, [(file, erih_plus_df) for file in batch_files]), total=len(batch_files)))

    batch_data = pd.concat(results, ignore_index=True)
    temp_file = f"temp_merged_data_{batch_idx}.csv"
    batch_data.to_csv(temp_file, index=False)
    temp_files.append(temp_file)

# Load temporary files and concatenate them
merged_data = pd.concat([pd.read_csv(temp_file) for temp_file in temp_files], ignore_index=True)

# Remove temporary files
for temp_file in temp_files:
    os.remove(temp_file) """


In [4]:
csv_directory = 'I:\\open-sci\\dump-files\\opencitations-meta\\solo_one'

# Collect the per-file results and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration and starts from an empty frame.
per_file_results = []

for file_name in os.listdir(csv_directory):
    if file_name.endswith('.csv'):
        file_path = os.path.join(csv_directory, file_name)
        merged_data_file = process_meta_csv(file_path, erih_plus_df)
        per_file_results.append(merged_data_file)

if per_file_results:
    merged_data = pd.concat(per_file_results, ignore_index=True)
else:
    # Same result as before when the directory holds no CSV file
    merged_data = pd.DataFrame()

In [40]:
# Show the first mapped row (OMID, matched ISSN, ERIH-PLUS journal id and ISSN)
merged_data.head(1)

Unnamed: 0,OC_OMID,OC_ISSN,EP_ID,EP_ISSN
0,meta:br/0601646,0172-6404,471777,0172-6404


In [42]:
# Display the whole mapping produced from the files processed above
merged_data

Unnamed: 0,OC_OMID,OC_ISSN,EP_ID,EP_ISSN
0,meta:br/0601646,0172-6404,471777,0172-6404
1,meta:br/0601638,0172-6404,471777,0172-6404
2,meta:br/0601645,0172-6404,471777,0172-6404
3,meta:br/0601643,0172-6404,471777,0172-6404
4,meta:br/0601640,0172-6404,471777,0172-6404
5,meta:br/0601642,0172-6404,471777,0172-6404
6,meta:br/0601644,0172-6404,471777,0172-6404
7,meta:br/0601648,0172-6404,471777,0172-6404
8,meta:br/0601647,0172-6404,471777,0172-6404
9,meta:br/0601637,0172-6404,471777,0172-6404


In [24]:
""" new_merged_data = merged_data.dropna(subset=['OC_ISSN']).reset_index(drop=True)
new_merged_data.head(2) """

" new_merged_data = merged_data.dropna(subset=['OC_ISSN']).reset_index(drop=True)\nnew_merged_data.head(2) "

### 1.2

**TODO:** Here we need a step that adds open-access information to the dataframe we just created, so that the OMIDs are directly connected to the information about the accessibility of the journal.

In [25]:
# Load DOAJ CSV file into a DataFrame
# (relative path: the file is looked up in the current working directory)
doaj_file_path = 'journalcsv__doaj.csv'
doaj_df = pd.read_csv(doaj_file_path, encoding="UTF-8")

In [26]:
# List the DOAJ columns with their positions, non-null counts and dtypes
doaj_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19278 entries, 0 to 19277
Data columns (total 54 columns):
 #   Column                                                                       Non-Null Count  Dtype  
---  ------                                                                       --------------  -----  
 0   Journal title                                                                19278 non-null  object 
 1   Journal URL                                                                  19278 non-null  object 
 2   URL in DOAJ                                                                  19278 non-null  object 
 3   When did the journal start to publish all content using an open license?     19277 non-null  float64
 4   Alternative title                                                            7485 non-null   object 
 5   Journal ISSN (print version)                                                 11148 non-null  object 
 6   Journal EISSN (online version)        

In [27]:
# Keep only the DOAJ columns needed for the open-access lookup, selected by
# name so the cell does not depend on the column order of the export.
# All rows are kept: the header line was already consumed by read_csv, so
# row 0 is a real journal and must not be skipped.
new_doaj = doaj_df.loc[:, [
    'Journal ISSN (print version)',
    'Journal EISSN (online version)',
    'Country of publisher',
]]
new_doaj.columns

Index(['Journal ISSN (print version)', 'Journal EISSN (online version)',
       'Country of publisher'],
      dtype='object')

In [28]:
# Create a dictionary of Open Access ISSNs: every print or online ISSN listed
# in DOAJ maps to True. Missing ISSNs are skipped so that NaN never becomes a
# key of the lookup.
open_access_dict = {}

for issn_column in ('Journal ISSN (print version)', 'Journal EISSN (online version)'):
    open_access_dict.update(dict.fromkeys(new_doaj[issn_column].dropna(), True))


In [29]:
# Merge Open Access information with the main dataframe: ISSNs that are keys
# of open_access_dict get True, all other rows are left as NaN by Series.map
#new_merged_data['Open Access'] = new_merged_data['OC_ISSN'].map(open_access_dict)
merged_data['Open Access'] = merged_data['OC_ISSN'].map(open_access_dict)



In [30]:
# Fill missing Open Access information with 'Unknown'
# (rows whose ISSN was not found in the lookup; the column then holds either
# the boolean True or the string 'Unknown')
#new_merged_data['Open Access'] = new_merged_data['Open Access'].fillna('Unknown')
merged_data['Open Access'] = merged_data['Open Access'].fillna('Unknown')



In [32]:
#new_merged_data.head()
# Display the mapping together with the new 'Open Access' column
merged_data

Unnamed: 0,OC_OMID,OC_ISSN,EP_ID,EP_ISSN,Open Access
0,meta:br/0601646,0172-6404,471777,0172-6404,Unknown
1,meta:br/0601638,0172-6404,471777,0172-6404,Unknown
2,meta:br/0601645,0172-6404,471777,0172-6404,Unknown
3,meta:br/0601643,0172-6404,471777,0172-6404,Unknown
4,meta:br/0601640,0172-6404,471777,0172-6404,Unknown
5,meta:br/0601642,0172-6404,471777,0172-6404,Unknown
6,meta:br/0601644,0172-6404,471777,0172-6404,Unknown
7,meta:br/0601648,0172-6404,471777,0172-6404,Unknown
8,meta:br/0601647,0172-6404,471777,0172-6404,Unknown
9,meta:br/0601637,0172-6404,471777,0172-6404,Unknown


In [35]:
import os
import pandas as pd

csv_directory = 'I:\\open-sci\\dump-files\\opencitations-meta\\partial_dump'
csv_files = [
    os.path.join(csv_directory, f)
    for f in os.listdir(csv_directory)
    if f.endswith('.csv')
]

# Sanity check on a small sample: only the first 5 CSV files are inspected
files_to_check = csv_files[:5]

# Distinct ISSNs seen across the sampled files
unique_issns = set()

for file in files_to_check:
    df = pd.read_csv(file)
    # Cast to str so the .str accessor also works on rows without a venue
    df['venue'] = df['venue'].astype(str)
    df['issn'] = df['venue'].str.extract(r'issn:(\d{4}-\d{3}[\dX])')
    # Rows where no ISSN could be extracted are ignored
    unique_issns.update(df['issn'].dropna())

print(f"Total unique ISSNs found in {len(files_to_check)} files: {len(unique_issns)}")
print("Sample ISSNs:", list(unique_issns)[:10])


Total unique ISSNs found in 5 files: 54
Sample ISSNs: ['2222-1751', '1736-8723', '1738-1266', '0098-7921', '0198-8220', '1229-5949', '1806-3756', '1662-9779', '0753-3322', '1765-2952']
