In [None]:
# Convert the Crossref JSON dump DIRECTLY into the CSV needed for our purposes.
# Also include the on-the-fly conversion of DOIs into OCIs.

import zipfile
import gzip
import json
import csv
import os
import errno
from concurrent.futures import ThreadPoolExecutor

# Zip archive opened by the extraction loop later in this cell.
zip_filename = "chunk14.zip"

def process_json_data(compressed_data):
    """Return the peer-review records contained in one gzipped JSON payload.

    Args:
        compressed_data: gzip-compressed bytes of a UTF-8 JSON document that
            has a top-level ``"items"`` list.

    Returns:
        A list with every element of ``"items"`` whose ``"type"`` equals
        ``"peer-review"``, in their original order.
    """
    raw_text = gzip.decompress(compressed_data).decode('utf-8')
    document = json.loads(raw_text)

    selected = []
    for record in document['items']:
        # Keep only the peer-review records; everything else is discarded.
        if record.get('type') == 'peer-review':
            selected.append(record)
    return selected

# File names and constants used by this cell.
files = ["peer_review_items11.json"]  # NOTE(review): not referenced anywhere in the visible code - confirm it is still needed
output_filenames = ["provina-oci.csv"]  # CSV files produced by the export loop at the end of this cell
LOOKUP_CSV = 'lookup.csv'  # path of the lookup CSV; OciProcess assigns the same value to its own LOOKUP_CSV attribute
CROSSREF_CODE = '020'  # identifier prefix; OciProcess assigns the same value to its own CROSSREF_CODE attribute

def parse_date(date_parts):
    """Format a list of numeric date parts as a zero-padded, dash-separated date.

    Args:
        date_parts: a sequence such as ``[2020, 1, 5]``, ``[2020, 1]`` or
            ``[2020]`` (year first), or a falsy value when no date is known.

    Returns:
        ``"YYYY-MM-DD"``, ``"YYYY-MM"`` or ``"YYYY"`` depending on how many
        parts are given, or ``None`` when ``date_parts`` is empty/``None`` or
        contains a part that is not an integer value.
    """
    if not date_parts:
        return None
    year, *rest = date_parts
    try:
        # Converting through int() is what makes the exception handler
        # meaningful: a None or non-numeric part yields "no date" instead of
        # ending up in the output as the text "None".
        pieces = [f"{int(year):04d}"]
        pieces.extend(f"{int(part):02d}" for part in rest)
    except (TypeError, ValueError):
        return None
    return "-".join(pieces)

#handle oci
class OciProcess:
    """Convert DOI strings into numeric identifiers via a character lookup table.

    Every character of a DOI after its first three characters is replaced by
    the code stored for it in ``lookup_dic``; the concatenated codes are
    prefixed with ``CROSSREF_CODE``. A character that has no code yet is
    assigned the next one and appended to the ``LOOKUP_CSV`` file.

    The table starts empty; call ``init_lookup_dic()`` to load the codes
    already stored in ``LOOKUP_CSV``.
    """

    def __init__(self):
        # This method used to be spelled ``_init_`` (single underscores), so
        # Python never ran it and no instance attribute existed.
        self.lookup_code = 0  # last numeric code handed out
        self.lookup_dic = {}  # character -> code
        self.LOOKUP_CSV = 'lookup.csv'
        self.CROSSREF_CODE = '020'

    # Backward-compatible alias for the former, misspelled method name.
    _init_ = __init__

    def init_lookup_dic(self):
        """Load ``lookup_dic`` from ``LOOKUP_CSV`` (columns ``c`` and ``code``).

        ``lookup_code`` is set to the code of the last row read, or to -1 when
        the file has no data rows.
        """
        with open(self.LOOKUP_CSV, 'r', newline='') as lookupcsv:
            lookupcsv_reader = csv.DictReader(lookupcsv)
            code = -1
            for row in lookupcsv_reader:
                self.lookup_dic[row['c']] = row['code']
                code = int(row['code'])
            # last code used
            self.lookup_code = code

    def calc_next_lookup_code(self):
        """Advance ``lookup_code`` to the next code.

        The code is incremented by one; when the current code ends in 89 the
        incremented value is additionally multiplied by ten (89 -> 900).
        NOTE(review): presumably this keeps a leading 9 reserved as an
        "extended code" marker - confirm against the lookup-table rules.
        """
        rem = self.lookup_code % 100
        newcode = self.lookup_code + 1
        if rem == 89:
            newcode = newcode * 10
        self.lookup_code = newcode

    def update_lookup(self, c):
        """Assign the next code to character ``c`` if it has none yet.

        The new pair is stored in ``lookup_dic`` and appended to
        ``LOOKUP_CSV`` as a quoted ``"c","code"`` line.
        """
        if c not in self.lookup_dic:
            # define the code following the 9 rule ...
            self.calc_next_lookup_code()
            code = self.lookup_code
            self.lookup_dic[c] = code
            self.write_txtblock_on_csv(self.LOOKUP_CSV, f'\n"{c}","{code}"')

    def check_make_dirs(self, filename):
        """Create the parent directory of ``filename`` when it does not exist."""
        directory = os.path.dirname(filename)
        if directory and not os.path.exists(directory):
            # exist_ok covers the race where the directory appears between
            # the check above and this call.
            os.makedirs(directory, exist_ok=True)

    def write_txtblock_on_csv(self, csv_path, block_txt):
        """Append ``block_txt`` verbatim to ``csv_path``, creating parent dirs."""
        self.check_make_dirs(csv_path)
        with open(csv_path, 'a', newline='') as csvfile:
            csvfile.write(block_txt)

    def convert_doi_to_ci(self, doi_str):
        """Return ``CROSSREF_CODE`` followed by the lookup encoding of ``doi_str``."""
        return self.CROSSREF_CODE + self.match_str_to_lookup(doi_str)

    def match_str_to_lookup(self, str_val):
        """Encode ``str_val`` (minus its first three characters) with the lookup table.

        Characters without a code are registered through ``update_lookup``
        before being encoded.

        Returns:
            The concatenation of the codes of all encoded characters.
        """
        codes = []
        # The first three characters are skipped.
        for c in str_val[3:]:
            if c not in self.lookup_dic:
                self.update_lookup(c)
            codes.append(str(self.lookup_dic[c]))
        return "".join(codes)

# Read every gzipped JSON member of the chunk archive and collect its
# peer-review records; decompression and parsing run on a thread pool.
with zipfile.ZipFile(zip_filename, 'r') as archive, ThreadPoolExecutor() as pool:
    pending = []
    for member in archive.infolist():  # iterate over the files in the selected chunk (to be automated later)
        member_name = member.filename
        # Only gzipped JSON files stored under the chunk folder are processed.
        if not (member_name.startswith("chunk14/") and member_name.endswith(".json.gz")):
            continue
        with archive.open(member) as gz_member:
            # The raw bytes are read here; the worker does the heavy part.
            pending.append(pool.submit(process_json_data, gz_member.read()))

    # Gather the per-file results in submission order.
    peer_review_items = []
    for future in pending:
        peer_review_items.extend(future.result())

# Write one CSV row per (peer review, reviewed work) pair, including the OCI
# built from the two DOIs.
for output_filename in output_filenames:
    with open(output_filename, 'w', newline='', encoding='utf-8') as output_file:
        fieldnames = ["OCI", "DOI_peer", "DOI_article", "date_peer_review", "author_given_name", "author_family_name", "URL_peer_review"]
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()

        # A single converter is shared by citing and cited DOIs: two separate
        # instances keep independent tables, so the same character could get
        # different codes while both append to the same lookup file.
        oci_converter = OciProcess()
        # Start from the codes already stored on disk, when there are any.
        if os.path.exists(oci_converter.LOOKUP_CSV):
            oci_converter.init_lookup_dic()

        for element in peer_review_items:
            doi_p = element.get("DOI", "")
            if not doi_p:
                # No row can be written without the peer-review DOI.
                continue

            # Values that depend only on the peer review are computed once,
            # not once per reviewed work.
            url_p = element.get("URL", "")
            created_parts = element.get("created", {}).get("date-parts") or [[]]
            date_peer_review = parse_date(created_parts[0])
            # `or [{}]` also covers an "editor" key holding an empty list.
            author_info = (element.get("editor") or [{}])[0]
            given_name = author_info.get("given", "")
            family_name = author_info.get("family", "")
            citing_entity_local_id = oci_converter.convert_doi_to_ci(doi_p)

            # `or []`: records without an "is-review-of" relation are skipped
            # instead of failing on iteration over None.
            for reviewed in element.get("relation", {}).get("is-review-of") or []:
                doi_a = reviewed.get("id", "")
                if not doi_a:
                    continue
                cited_entity_local_id = oci_converter.convert_doi_to_ci(doi_a)
                writer.writerow({
                    "OCI": "oci:" + citing_entity_local_id + "-" + cited_entity_local_id,
                    "DOI_peer": doi_p,
                    "DOI_article": doi_a,
                    "date_peer_review": date_peer_review,
                    "author_given_name": given_name,
                    "author_family_name": family_name,
                    "URL_peer_review": url_p
                })

    print("URL pairs from peer review items saved to", output_filename)

Iterazione sui file zippati e creazione di un CSV con le informazioni delle chiavi che ci interessano senza distinzione di pubblicazione (escludiamo solo le peer review):

In [None]:
import zipfile
import gzip
import json
import csv

def extract_info_from_json(json_data):
    """Extract the relevant fields of every non-peer-review record.

    Args:
        json_data: parsed JSON document with a top-level ``'items'`` list.

    Returns:
        A list of dicts with the keys ``'DOI'``, ``'URL'``, ``'ISSN'``,
        ``'container-title'`` and ``'date-time'``. List-valued fields are
        joined with ``', '``; ``'date-time'`` holds the first ten characters
        of the record's ``created.date-time`` value. Missing fields become
        empty strings.
    """
    def summarize(record):
        created_at = str(record.get('created', {}).get('date-time', ''))
        return {
            'DOI': record.get('DOI', ''),
            'URL': record.get('URL', ''),
            'ISSN': ', '.join(record.get('ISSN', [])),
            'container-title': ', '.join(record.get('container-title', [])),
            'date-time': created_at[:10],
        }

    return [
        summarize(record)
        for record in json_data['items']
        if 'peer-review' not in record.get('type', [])
    ]

def process_zip_files(zip_filename, output_csv):
    """Export the non-peer-review records of a zip archive to a CSV file.

    Every member of ``zip_filename`` whose name ends with ``.json.gz`` is
    decompressed, parsed as JSON and passed to ``extract_info_from_json``;
    the resulting rows are written to ``output_csv`` (UTF-8, with header).

    Args:
        zip_filename: path of the zip archive to read.
        output_csv: path of the CSV file to create or overwrite.
    """
    columns = ['DOI', 'URL', 'ISSN', 'container-title', 'date-time']
    with zipfile.ZipFile(zip_filename, 'r') as archive, \
            open(output_csv, 'w', newline='', encoding='utf-8') as out_handle:
        csv_out = csv.DictWriter(out_handle, fieldnames=columns)
        csv_out.writeheader()

        for member in archive.infolist():
            if not member.filename.endswith(".json.gz"):
                continue
            with archive.open(member) as gz_stream:
                payload = gzip.decompress(gz_stream.read())
            csv_out.writerows(extract_info_from_json(json.loads(payload)))

# Example usage: export the non-peer-review records of one chunk archive.
zip_filename = "chunk9.zip"  # zip archive to read
output_csv = "info_non_peer_review9.csv"  # CSV file written by process_zip_files
process_zip_files(zip_filename, output_csv)

Iterazione sul CSV per trovare i match con i DOI in combined_file_final.csv

In [None]:
import csv

def find_matches_in_csv(input_csv, match_csv, output_csv):
    """Copy the rows of ``input_csv`` whose DOI is listed in ``match_csv``.

    Args:
        input_csv: CSV file with at least a ``DOI`` column.
        match_csv: CSV file whose ``DOI_article`` column lists the DOIs to
            look for.
        output_csv: CSV file to create or overwrite with the matching rows.

    The output uses the same columns as ``input_csv``. A fixed header that
    differs from the input's columns would make ``csv.DictWriter`` raise
    ``ValueError`` on the first matching row.
    """
    # DOIs to look for, kept in a set for constant-time membership tests.
    with open(match_csv, 'r', newline='', encoding='utf-8') as csvfile:
        dois_to_match = {row['DOI_article'] for row in csv.DictReader(csvfile)}

    with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        # Fall back to the historical header only when the input file is
        # completely empty and therefore has no header of its own.
        fieldnames = reader.fieldnames or ['DOI', 'URL', 'ISSN', 'container-title', 'timestamp']
        matched_info = [row for row in reader if row['DOI'] in dois_to_match]

    # Write the matching rows to a new CSV file.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(matched_info)

# Example usage: keep only the exported records whose DOI is a reviewed article.
input_csv = "info_non_peer_review.csv"  # rows to filter (must have a DOI column)
match_csv = "combined_file_final.csv"  # its DOI_article column lists the DOIs to keep
output_csv = "matched_info10.csv"  # CSV file written by find_matches_in_csv
find_matches_in_csv(input_csv, match_csv, output_csv)