- functions as classes
- call from the terminal

# 1. Peer Review

- change the chunk name as needed

In [None]:
! pip install polars

In [None]:
import zipfile
import gzip
import json
import csv
import os
import errno
from concurrent.futures import ThreadPoolExecutor
import polars as pl

# Archive to process; the member filter in the extraction loop below uses the matching "chunk6/" prefix.
zip_filename = "chunk6.zip"

def process_json_data(compressed_data):
    """Return the peer-review records found in one gzip-compressed JSON payload.

    Args:
        compressed_data: gzip-compressed bytes of a UTF-8 JSON document that
            has an ``items`` list of dictionaries.

    Returns:
        The dictionaries from ``items`` whose ``type`` equals ``'peer-review'``,
        in their original order.
    """
    payload = json.loads(gzip.decompress(compressed_data).decode('utf-8'))
    reviews = []
    for record in payload['items']:
        # Records without a 'type' key yield None here and are dropped
        if record.get('type') == 'peer-review':
            reviews.append(record)
    return reviews

# Destination CSV for the peer-review rows extracted from the archive.
output_filename = "info_peer_review6.csv"
# NOTE(review): OciProcess sets its own LOOKUP_CSV / CROSSREF_CODE attributes to the same
# values; these module-level copies are not read anywhere in the visible code.
LOOKUP_CSV = 'lookup.csv'
CROSSREF_CODE = '020'

# handle oci

class OciProcess:
    """Convert DOIs into the numeric half of an OCI through a character lookup table.

    The table is stored in a CSV file (columns ``c`` and ``code``). It is loaded
    when the object is created and extended on disk whenever a character without
    a code is met.

    NOTE: every instance keeps its own copy of the table and of the last used
    code. Use a single instance per lookup file, otherwise two instances can
    hand out the same code to different characters.
    """

    def __init__(self):
        self.lookup_code = 0
        self.lookup_dic = {}
        self.LOOKUP_CSV = 'lookup.csv'
        self.CROSSREF_CODE = '020'
        self.init_lookup_dic()  # Ensure lookup dictionary is initialized

    def init_lookup_dic(self):
        """Load the lookup CSV into ``lookup_dic`` and remember the last code read.

        Raises:
            FileNotFoundError: if the lookup CSV does not exist.
        """
        with open(self.LOOKUP_CSV, 'r', newline='', encoding='utf-8') as lookupcsv:
            code = -1  # an empty table makes the first generated code 0
            for row in csv.DictReader(lookupcsv):
                if row['c'] not in self.lookup_dic:  # first occurrence wins
                    self.lookup_dic[row['c']] = row['code']
                    code = int(row['code'])
            self.lookup_code = code

    def calc_next_lookup_code(self):
        """Advance ``lookup_code`` to the next code to assign.

        Codes whose last two digits reach 89 are followed by the next multiple
        of 100 (89 -> 100, 189 -> 200).

        NOTE(review): codes of different lengths are concatenated without a
        separator in match_str_to_lookup; confirm that consumers can still
        split the resulting string unambiguously.
        """
        if self.lookup_code % 100 == 89:
            self.lookup_code = (self.lookup_code // 100 + 1) * 100
        else:
            self.lookup_code += 1

    def update_lookup(self, c):
        """Assign the next free code to character *c* and append it to the CSV."""
        if c not in self.lookup_dic:
            self.calc_next_lookup_code()
            code = str(self.lookup_code).zfill(2)  # Ensure code has at least 2 digits
            self.lookup_dic[c] = code
            # The row is written as a quoted CSV record, so a literal double
            # quote must be doubled or the record is unreadable on reload.
            escaped = c.replace('"', '""')
            self.write_txtblock_on_csv(self.LOOKUP_CSV, '\n"%s","%s"' % (escaped, code))

    def check_make_dirs(self, filename):
        """Create the parent directory of *filename* if it has one and it is missing."""
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)

    def write_txtblock_on_csv(self, csv_path, block_txt):
        """Append *block_txt* verbatim to *csv_path*, creating directories as needed."""
        self.check_make_dirs(csv_path)
        with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
            csvfile.write(block_txt)

    def convert_doi_to_ci(self, doi_str):
        """Return ``CROSSREF_CODE`` followed by the encoded form of *doi_str*."""
        return self.CROSSREF_CODE + self.match_str_to_lookup(doi_str)

    def match_str_to_lookup(self, str_val):
        """Encode *str_val* character by character, skipping its first three characters.

        Characters missing from the table are added to it (and to the CSV file)
        on the fly.
        """
        codes = []
        for c in str_val[3:]:
            if c not in self.lookup_dic:
                self.update_lookup(c)
            codes.append(str(self.lookup_dic[c]))
        return "".join(codes)

with zipfile.ZipFile(zip_filename, 'r') as zip_file, ThreadPoolExecutor() as executor:
    # Submit one decoding task per matching archive member; the member
    # selection is still hard-coded to the chunk6 folder (automate later).
    results = []
    for file_info in zip_file.infolist():
        member_name = file_info.filename
        if not (member_name.startswith("chunk6/") and member_name.endswith(".json.gz")):
            continue
        with zip_file.open(file_info) as compressed_file:
            compressed_data = compressed_file.read()
        results.append(executor.submit(process_json_data, compressed_data))

    # Collect the filtered records in submission order
    peer_review_items = []
    for result in results:
        peer_review_items += result.result()

# Write one CSV row per (peer review, reviewed item) pair
with open(output_filename, 'w', newline='', encoding='utf-8') as output_file:
    fieldnames = ["oci", "citing_doi", "cited_doi", "citing_date", "citing_url"]
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    writer.writeheader()

    # A single converter must be used for both DOIs: each OciProcess keeps its
    # own "last code" counter while appending to the same lookup file, so two
    # instances could assign the same code to two different characters.
    uno = OciProcess()
    due = uno  # alias kept so that both names stay defined for later cells

    for element in peer_review_items:
        reviewed = element.get("relation", {}).get("is-review-of", [])
        if not reviewed:
            continue

        doi_p = element["DOI"]
        url_p = element["URL"]
        date_peer_review = str(element["created"]["date-time"])[:10]  # keep YYYY-MM-DD
        if not doi_p:
            continue
        citing_entity_local_id = uno.convert_doi_to_ci(doi_p)

        for i in reviewed:
            doi_a = i.get("id", "")
            if not doi_a:
                continue  # no reviewed DOI: nothing to link
            cited_entity_local_id = due.convert_doi_to_ci(doi_a)
            oci = "oci:" + citing_entity_local_id + "-" + cited_entity_local_id
            writer.writerow({
                "oci": oci,
                "citing_doi": doi_p,
                "cited_doi": doi_a,
                "citing_date": date_peer_review,
                "citing_url": url_p,
            })

print("Info peer review items saved to", output_filename)

input_filename = "info_peer_review6.csv"
output_filename_unique = "info_peer_review6_unique.csv"

# Load the CSV into a DataFrame
df = pl.read_csv(input_filename)

# Remove duplicate rows based on the "oci" column. The CSV header is written
# in lowercase, so asking for 'OCI' raises a column-not-found error.
# NOTE(review): unique() does not promise to keep the input row order; pass
# maintain_order=True if the order matters.
df_unique = df.unique(subset=['oci'])

# Save the unique DataFrame back to a new CSV
df_unique.write_csv(output_filename_unique)

print("Unique info peer review items saved to", output_filename_unique)

# 2. Concatenate Peer Review

In [None]:
def concatenate_csv_files(csv_files_string, output_filename):
    """Concatenate several CSV files row-wise into one CSV file.

    Args:
        csv_files_string: comma-separated list of CSV paths. Whitespace around
            each path is ignored and empty entries are skipped.
        output_filename: path of the CSV file to write (without index column).

    Raises:
        ValueError: if *csv_files_string* contains no path at all.
        FileNotFoundError: if one of the listed files does not exist.
    """
    # Imported here because this cell comes before the file's own pandas import.
    import pandas as pd

    # Split on the bare comma so that both "a.csv,b.csv" and "a.csv, b.csv" work
    csv_files_list = [name.strip() for name in csv_files_string.split(',') if name.strip()]
    if not csv_files_list:
        raise ValueError("csv_files_string does not contain any CSV filename")

    dataframes = [pd.read_csv(csv_file) for csv_file in csv_files_list]

    # ignore_index gives the combined rows a fresh 0..n-1 index
    concatenated_df = pd.concat(dataframes, ignore_index=True)
    concatenated_df.to_csv(output_filename, index=False)

# Example usage
# NOTE(review): this call runs every time the cell is executed and rebinds the
# module-level name `output_filename`; file1.csv, file2.csv and file3.csv look
# like placeholders — confirm before running.
csv_files_string = "file1.csv, file2.csv, file3.csv"
output_filename = "combined_output.csv"
concatenate_csv_files(csv_files_string, output_filename)

# 3. Non peer Review

In [None]:
import zipfile
import gzip
import json
import csv

def extract_info_from_json(json_data):
    """Extract the relevant fields of every item that is not a peer review.

    Args:
        json_data: decoded JSON document with an ``items`` list of dictionaries.

    Returns:
        A list of dictionaries with the keys ``DOI``, ``URL``, ``ISSN``,
        ``container-title`` and ``date-time``. List values are joined with
        ``', '``; missing or null values become empty strings; ``date-time``
        is cut to its first 10 characters.
    """
    info_list = []
    for item in json_data['items']:
        # Exact comparison: a substring test on the type string would crash on
        # a null type and match any type merely containing 'peer-review'.
        if item.get('type') == 'peer-review':
            continue
        created = item.get('created') or {}
        info_list.append({
            'DOI': item.get('DOI', ''),
            'URL': item.get('URL', ''),
            'ISSN': ', '.join(item.get('ISSN') or []),
            'container-title': ', '.join(item.get('container-title') or []),
            'date-time': str(created.get('date-time', ''))[:10],
        })
    return info_list

def process_zip_files(zip_filename, output_csv):
    """Write the non-peer-review records of every ``.json.gz`` archive member to a CSV.

    Args:
        zip_filename: path of the zip archive to scan.
        output_csv: path of the CSV file to create; it always receives the
            header row, followed by one row per extracted record.
    """
    columns = ['DOI', 'URL', 'ISSN', 'container-title', 'date-time']
    with zipfile.ZipFile(zip_filename, 'r') as zip_file, \
            open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()

        gz_members = (member for member in zip_file.infolist()
                      if member.filename.endswith(".json.gz"))
        for member in gz_members:
            with zip_file.open(member) as compressed_file:
                raw_bytes = compressed_file.read()
            records = extract_info_from_json(json.loads(gzip.decompress(raw_bytes)))
            writer.writerows(records)

# Run the extraction for one archive
# NOTE(review): the archive is chunk14 while the output name ends in 9 — confirm the pairing is intended.
zip_filename = "chunk14.zip"
output_csv = "info_non_peer_review9.csv"
process_zip_files(zip_filename, output_csv)

# 4. Merge tables and Provenance

In [None]:
import csv
from datetime import datetime
import pytz
import os

input_csv = 'combined_file_final.csv'
temp_csv = 'tempfile.csv'
prov_agent_url = "https://academictorrents.com/details/d9e554f4f0c3047d9f49e448a7004f7aa1701b69"
source_url = "https://doi.org/10.13003/8wx5k"

# Define the timezone (UTC in this example)
timezone = pytz.timezone("UTC")
# Current time as an ISO 8601 string with a colon-separated offset
# (e.g. 2024-05-01T12:00:00+00:00). strftime('%z') would give "+0000", which
# is not a valid xsd:dateTime offset.
current_timestamp = datetime.now(timezone).isoformat(timespec='seconds')

try:
    with open(input_csv, mode='r', newline='', encoding='utf-8') as infile, \
            open(temp_csv, mode='w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Read the header; an empty file has none
        header = next(reader, None)
        if header is None:
            raise ValueError(f"{input_csv} is empty: there is no header row to extend")
        # Add new column headers
        writer.writerow(header + ['prov_agent', 'source', 'prov_date'])

        # Every data row gets the same three provenance values
        provenance = [prov_agent_url, source_url, current_timestamp]
        writer.writerows(row + provenance for row in reader)

    # Replace the original file with the temporary file
    os.replace(temp_csv, input_csv)
    print(f"Updated CSV saved as {input_csv}")

except UnicodeDecodeError as e:
    print(f"Failed to decode using utf-8 encoding: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # After a successful os.replace the temp file no longer exists; after a
    # failure this removes the partially written copy.
    if os.path.exists(temp_csv):
        os.remove(temp_csv)

# 5. RDF creator

## Class creation

In [None]:
from rdflib import Graph, RDF, RDFS, XSD, URIRef, Literal
from dateutil.relativedelta import relativedelta
from dateutil.parser import parse
from datetime import datetime

class PeerReview(object):
    """A peer-review relation between two entities, serialisable as RDF.

    The object describes a CiTO citation, characterised as ``cito:reviews``,
    that goes from the citing entity (the review) to the cited entity (the
    reviewed work), optionally with PROV-O provenance statements.
    """

    # predicates citation
    __cito_base = "http://purl.org/spar/cito/"
    _reviews = URIRef(__cito_base + "reviews")
    _citation = URIRef(__cito_base + "Citation")
    _has_citation_creation_date = URIRef(__cito_base + "hasCitationCreationDate")
    _has_citation_time_span = URIRef(__cito_base + "hasCitationTimeSpan")
    _has_citing_entity = URIRef(__cito_base + "hasCitingEntity")
    _has_cited_entity = URIRef(__cito_base + "hasCitedEntity")
    _has_citation_characterization = URIRef(__cito_base + "hasCitationCharacterisation")

    # predicates provenance
    __prov_base = "http://www.w3.org/ns/prov#"
    _was_attributed_to = URIRef(__prov_base + "wasAttributedTo")
    _had_primary_source = URIRef(__prov_base + "hadPrimarySource")
    _generated_at_time = URIRef(__prov_base + "generatedAtTime")

    def __init__(self, oci, citing_url=None, cited_url=None, timespan=None, citing_date=None, prov_agent_url=None, source=None, prov_date=None):
        """Store the citation data.

        Args:
            oci: identifier whose first four characters are dropped
                (presumably the "oci:" prefix — confirm with callers).
            citing_url: URL of the citing entity.
            cited_url: URL of the cited entity.
            timespan: citation time span, used as an xsd:duration literal.
            citing_date: creation date as YYYY, YYYY-MM or YYYY-MM-DD.
            prov_agent_url: URL of the agent the citation is attributed to.
            source: URL of the primary source.
            prov_date: generation time, used as an xsd:dateTime literal.
        """
        self.oci = oci[4:]
        self.citing_url = citing_url
        self.citing_date = citing_date
        self.cited_url = cited_url
        self.timespan = timespan
        self.prov_agent_url = prov_agent_url
        self.source = source
        self.prov_date = prov_date

    def get_peer_review_rdf(self, baseurl, include_data=True, include_prov=True):
        """Build the RDF graph describing this citation.

        Args:
            baseurl: prefix of the citation URI; ``"ci/" + oci`` is appended.
            include_data: add type, characterisation, citing/cited entities
                and, when available, creation date and time span.
            include_prov: add attribution, primary source and generation time.

        Returns:
            An rdflib ``Graph`` containing the requested statements.
        """
        peer_review_graph = Graph()

        citation_corpus_id = "ci/" + self.oci
        citation = URIRef(baseurl + citation_corpus_id)

        # The predicates are class attributes with a single leading underscore.
        # A double underscore here would be name-mangled inside the class and
        # point to attributes that do not exist.
        if include_data:
            citing_br = URIRef(self.citing_url)
            cited_br = URIRef(self.cited_url)

            peer_review_graph.add((citation, RDF.type, self._citation))
            # Self-citation statements were removed on purpose.
            # Characterisation statement: this citation is a review.
            peer_review_graph.add((citation, self._has_citation_characterization, self._reviews))

            peer_review_graph.add((citation, self._has_citing_entity, citing_br))
            peer_review_graph.add((citation, self._has_cited_entity, cited_br))

            # Values read from CSV are strings, so an empty string means
            # "missing" just like None does.
            if self.citing_date:
                if PeerReview.contains_days(self.citing_date):
                    xsd_type = XSD.date
                elif PeerReview.contains_months(self.citing_date):
                    xsd_type = XSD.gYearMonth
                else:
                    xsd_type = XSD.gYear

                peer_review_graph.add((citation, self._has_citation_creation_date,
                                       Literal(self.citing_date, datatype=xsd_type, normalize=False)))
                if self.timespan:
                    peer_review_graph.add((citation, self._has_citation_time_span,
                                           Literal(self.timespan, datatype=XSD.duration)))

        if include_prov:
            peer_review_graph.add((citation, self._was_attributed_to, URIRef(self.prov_agent_url)))
            peer_review_graph.add((citation, self._had_primary_source, URIRef(self.source)))
            peer_review_graph.add((citation, self._generated_at_time, Literal(self.prov_date, datatype=XSD.dateTime)))

        return peer_review_graph

    @staticmethod
    def contains_years(date):
        """Return True when *date* is long enough to hold a year (YYYY)."""
        return date is not None and len(date) >= 4

    @staticmethod
    def contains_months(date):
        """Return True when *date* is long enough to hold a month (YYYY-MM)."""
        return date is not None and len(date) >= 7

    @staticmethod
    def contains_days(date):
        """Return True when *date* is long enough to hold a day (YYYY-MM-DD)."""
        return date is not None and len(date) >= 10

## Elements creation

In [None]:
import os
import errno
from argparse import ArgumentParser
# from citation import Citation
from rdflib.namespace import RDF, RDFS, SKOS
import csv
import urllib
from io import StringIO

# Function to populate RDF data from CSV input
import csv
from io import StringIO
from urllib.parse import quote
from datetime import datetime
from rdflib import XSD, Literal


def populate_data(csv_file, output_file):
    """Append the N-Triples serialisation of every CSV row to *output_file*.

    The CSV must provide the columns ``oci``, ``citing_url``, ``cited_url``,
    ``citing_date`` and ``timespan``. The module-level ``BASE_URL``,
    ``INCLUDE_DATA`` and ``INCLUDE_PROV`` settings control what is generated.

    Args:
        csv_file: path of the input CSV (read as UTF-8).
        output_file: path of the file the triples are appended to.
    """
    # The output is opened once for the whole run instead of once per row.
    with open(csv_file, 'r', newline='', encoding='utf-8') as file, \
            open(output_file, 'a', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(file, delimiter=','):
            citation = PeerReview(row['oci'],
                                  citing_url=row['citing_url'],
                                  cited_url=row['cited_url'],
                                  timespan=row['timespan'],
                                  citing_date=row['citing_date'])

            g = citation.get_peer_review_rdf(BASE_URL, include_data=INCLUDE_DATA, include_prov=INCLUDE_PROV)
            f.write(g.serialize(format='nt'))



 # Function to populate provenance RDF data from CSV input
def populate_prov(csv_file, output_file):
    """Append the provenance triples of every CSV row to *output_file*.

    The CSV must provide the columns ``oci``, ``source``, ``prov_date`` and the
    agent URL, read from ``prov_agent_url`` or, when that column is absent,
    from ``prov_agent``.
    NOTE(review): ``prov_agent`` is the header written by the provenance cell
    of this notebook — confirm which name the input file really uses.

    Args:
        csv_file: path of the input CSV (read as UTF-8).
        output_file: path of the file the triples are appended to; it is left
            untouched when nothing was generated.
    """
    serialized = []
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        for row in csv.DictReader(file, delimiter=','):
            agent_url = row['prov_agent_url'] if 'prov_agent_url' in row else row['prov_agent']

            citation = PeerReview(row['oci'],
                                  prov_agent_url=agent_url,
                                  source=row['source'],
                                  prov_date=row['prov_date'])

            # Only provenance fields are set on the object, so the data
            # statements must be skipped and the provenance ones requested
            # explicitly, regardless of the module-level flags.
            g = citation.get_peer_review_rdf(BASE_URL, include_data=False, include_prov=True)
            serialized.append(g.serialize(format='nt'))

    block_txt = ''.join(serialized)
    if block_txt:  # Write to output file if block_txt is not empty
        with open(output_file, 'a', newline='', encoding='utf-8') as f:
            f.write(block_txt)





# Run configuration for the RDF export.
# NOTE(review): INPUT_ROOT_DIR is not read by the visible code.
INPUT_ROOT_DIR = "."
# This first OUTPUT_FILE value is overwritten a few lines below.
OUTPUT_FILE = "dataset.ttl"
# Prefix of every citation URI; "ci/<oci>" is appended by PeerReview.
BASE_URL = "https://w3id.org/oc/index/coci/"
# Flags forwarded to PeerReview.get_peer_review_rdf by populate_data.
INCLUDE_DATA = True
INCLUDE_PROV = False
ENTRIES_PER_FILE = 100000
#BUFFER = 100000
# Effective output name: "100000.ttl". populate_data writes N-Triples
# (format='nt') into it.
OUTPUT_FILE = str(ENTRIES_PER_FILE)+'.ttl'


# NOTE(review): agent_url and prov_source are not read by the visible functions;
# populate_prov takes both values from its input CSV.
agent_url = "https://w3id.org/oc/index/coci/prov/pa/1"

prov_source = "https://api.crossref.org/works/"


csv_file = 'fakeassbitch.csv'
# NOTE(review): output_file is never passed on; the call below writes to OUTPUT_FILE.
output_file = 'ciaociaociao.ttl'
populate_data(csv_file, OUTPUT_FILE)
# populate_prov(csv_file, OUTPUT_FILE )

# 6. ISSN table

In [None]:
import pandas as pd

# Read the original CSV file; missing values become empty strings so that the
# string operations below work on every row
df = pd.read_csv("matched_info17.csv")
df = df.fillna('')

# Rows whose 'issn' value lists more than one ISSN (comma separated)
has_comma = df['issn'].str.contains(',')
comma_rows = df[has_comma].copy()  # copy avoids SettingWithCopyWarning

# Split on the first comma only, so a value with more than two ISSNs does not
# break the two-column layout, and strip the blanks left by ", " separators so
# that the parts compare equal to ISSNs listed on their own.
issn_parts = comma_rows['issn'].str.split(',', n=1)
comma_rows['issn1'] = issn_parts.str[0].str.strip()
comma_rows['issn2'] = issn_parts.str[1].str.strip()

# One row per ISSN pair: number of occurrences and the distinct titles
# (sorted, so the output is the same on every run)
grouped_df_comma = (
    comma_rows.groupby(['issn1', 'issn2'])
    .agg(count=('title', 'size'),
         title=('title', lambda titles: ', '.join(sorted(set(titles)))))
    .reset_index()
)

# Rows with a single ISSN
no_comma_rows = df[~has_comma]

# Map every ISSN to the first group that contains it, so each single-ISSN row
# is resolved with one dictionary lookup instead of a scan of the whole table.
group_records = grouped_df_comma.to_dict('records')
first_group = {}
for position, record in enumerate(group_records):
    first_group.setdefault(record['issn1'], position)
    first_group.setdefault(record['issn2'], position)

for issn, title in zip(no_comma_rows['issn'], no_comma_rows['title']):
    position = first_group.get(issn)
    if position is None:
        # First time this ISSN is seen: open a new group without second ISSN
        first_group[issn] = len(group_records)
        group_records.append({'issn1': issn, 'issn2': None, 'count': 1, 'title': title})
    else:
        group_records[position]['count'] += 1

grouped_df_comma = pd.DataFrame(group_records, columns=['issn1', 'issn2', 'count', 'title'])

# Save the updated grouped data to a new CSV file
grouped_df_comma.to_csv("grouped_updated.csv", index=False)

# 7. Meta counting

## to extract doi

In [None]:
import pandas as pd
import zipfile
import io

import csv

# Path to the zip file
zip_file_path = 'csv_openalex.zip'

# Path to the output CSV file
output_file_path = 'meta_doi.csv'


def _extract_doi(cell):
    """Return the text between the first 'doi:' and the next space in *cell*, or None."""
    if not isinstance(cell, str):
        return None  # empty (NaN) or numeric cells hold no identifier string
    marker = cell.find('doi:')
    if marker == -1:
        return None
    start_index = marker + len('doi:')
    end_index = cell.find(' ', start_index)
    # Without a following space the DOI runs to the end of the cell
    return cell[start_index:] if end_index == -1 else cell[start_index:end_index]


# The output file is created (or truncated) once and kept open for the whole
# run. csv.writer takes care of quoting DOIs that contain commas or quotes.
with open(output_file_path, 'w', newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file, lineterminator='\n')
    writer.writerow(['DOI'])

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            # Only CSV members are processed
            if not file_name.endswith('.csv'):
                continue
            with zip_ref.open(file_name) as csv_file:
                # Read in chunks to keep memory usage bounded
                chunks = pd.read_csv(io.TextIOWrapper(csv_file, encoding='utf-8'), chunksize=10000)
                for chunk in chunks:
                    # The identifiers are in the first column; empty DOIs are skipped
                    chunk_dois = [doi for doi in map(_extract_doi, chunk.iloc[:, 0]) if doi]
                    writer.writerows([doi] for doi in chunk_dois)

print('file created')

## to count lines in output file

In [None]:
#count line in meta-doi

import csv

def count_rows_in_csv(file_path):
    """Return the number of CSV records in *file_path*, header row included.

    Records are counted with the csv module, so a quoted field spanning several
    lines counts as one record.
    """
    total = 0
    with open(file_path, mode='r', newline='') as handle:
        for _ in csv.reader(handle):
            total += 1
    return total

# Example usage
file_path_2 = 'meta_doi.csv'
# meta_doi.csv starts with a 'DOI' header row, which must not be counted as a DOI
doi_count = max(count_rows_in_csv(file_path_2) - 1, 0)
print(f'The number of dois in Meta is: {doi_count}')

## Create the two lists of peer reviews and reviewed articles

In [None]:
import pandas as pd

import csv

# Paths to the input and output files
csv_file_path = 'combined_file_final.csv'
meta_file_path = 'meta_doi.csv'
meta_peer_file_path = 'meta_peer.csv'
meta_article_file_path = 'meta_article.csv'

# Load 'combined_file_final.csv' into a DataFrame
combined_df = pd.read_csv(csv_file_path)

# Sets give constant-time membership tests for the DOIs of both columns
doi_peer_set = set(combined_df['DOI_peer'])
doi_article_set = set(combined_df['DOI_article'])

# Both outputs are opened once and written through csv.writer. DOIs that
# contain a comma arrive unquoted from pandas and must be quoted again, or the
# resulting files cannot be parsed as single-column CSV.
chunk_size = 10000
with open(meta_peer_file_path, 'w', newline='', encoding='utf-8') as meta_peer_file, \
        open(meta_article_file_path, 'w', newline='', encoding='utf-8') as meta_article_file:
    peer_writer = csv.writer(meta_peer_file, lineterminator='\n')
    article_writer = csv.writer(meta_article_file, lineterminator='\n')
    peer_writer.writerow(['DOI'])
    article_writer.writerow(['DOI'])

    # Read 'meta_doi.csv' in chunks to keep memory usage bounded
    for chunk in pd.read_csv(meta_file_path, chunksize=chunk_size):
        dois = chunk.iloc[:, 0]  # the DOI is in the first column
        peer_writer.writerows([doi] for doi in dois if doi in doi_peer_set)
        article_writer.writerows([doi] for doi in dois if doi in doi_article_set)

print('files created')

## Count rows in lists

In [None]:
import pandas as pd

# Paths to the files whose data rows are counted below
meta_peer_file_path = 'meta_peer.csv'
meta_article_file_path = 'meta_article.csv'

def count_rows(file_path):
    """Return the number of lines in *file_path* that follow the header line.

    Physical lines are counted. An empty file, or one holding only the header,
    gives 0.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip the header line; the default keeps an empty file from raising
        next(file, None)
        # Count the remaining lines
        return sum(1 for _ in file)

# Count the data rows of both files first, then report them
meta_peer_row_count, meta_article_row_count = (
    count_rows(path) for path in (meta_peer_file_path, meta_article_file_path)
)

for path, total in ((meta_peer_file_path, meta_peer_row_count),
                    (meta_article_file_path, meta_article_row_count)):
    print(f'Number of rows in {path}: {total}')


# TODO: fix this so that duplicates are not picked up

## Drop duplicates

In [None]:
import pandas as pd

def drop_duplicates_and_save(input_file, output_file):
    """Copy *input_file* to *output_file* without its duplicated rows.

    The first occurrence of every row is kept, the index is not written, and a
    confirmation message is printed at the end.
    """
    deduplicated = pd.read_csv(input_file).drop_duplicates()
    deduplicated.to_csv(output_file, index=False)

    print("Duplicates removed and cleaned file saved successfully!")

# Clean both DOI lists, reporting the row count before and after each clean-up
for input_file, output_file in (
    ('meta_article.csv', 'meta_article_cleaned.csv'),
    ('meta_peer.csv', 'meta_peer_cleaned.csv'),
):
    input_file_count = count_rows(input_file)
    print(f'Number of rows in {input_file}: {input_file_count}')

    drop_duplicates_and_save(input_file, output_file)

    output_file_count = count_rows(output_file)
    print(f'Number of rows in {output_file}: {output_file_count}')