In [1]:
import pandas as pd
import os
import glob
import pprint

In [2]:
# Define the directory containing the TSV files
directory = "claude_3.5_sonnet_results/"

In [3]:
# Find all TSV files in the directory
tsv_files = glob.glob(os.path.join(directory, "*.tsv"))


In [4]:
pprint.pprint(tsv_files)

['claude_3.5_sonnet_results/JGR Biogeosciences - 2022 - Rooney - The Impact of '
 'Freeze‐Thaw History on Soil Carbon Response to Experimental Freeze‐Thaw.tsv',
 'claude_3.5_sonnet_results/s41467-023-36515-y.tsv',
 'claude_3.5_sonnet_results/Limnology   Oceanography - 2019 - Linz - '
 'Time‐series metatranscriptomes reveal conserved patterns between '
 'phototrophic and.tsv',
 'claude_3.5_sonnet_results/s41597-024-03069-7.tsv',
 'claude_3.5_sonnet_results/1-s2.0-S0016706121007540-main.tsv',
 'claude_3.5_sonnet_results/s41564-020-00861-0.tsv',
 'claude_3.5_sonnet_results/bell-et-al-2020-metatranscriptomic-sequencing-of-a-cyanobacterial-soil-surface-consortium-with-and-without-a-diverse.tsv',
 'claude_3.5_sonnet_results/alteio-et-al-2020-complementary-metagenomic-approaches-improve-reconstruction-of-microbial-diversity-in-a-forest-soil.tsv',
 'claude_3.5_sonnet_results/s41597-024-04013-5.tsv',
 'claude_3.5_sonnet_results/microorganisms-09-00357-v2.tsv',
 'claude_3.5_sonnet_results/s40168

In [5]:
# Initialize a list to store the DataFrames
all_dfs = list()

# Iterate through the TSV files
for file_path in tsv_files:
    try:
        print(f"Processing {file_path}")
        # Load the TSV file without treating "NA" as NaN
        df = pd.read_csv(file_path, sep='\t', keep_default_na=False)

        # Clean column names for easier access
        df.columns = ["Number", "Item", "Vote", "Comments"]

        # Add a filename column
        df['Filename'] = os.path.basename(file_path)

        # Append the DataFrame to the list
        all_dfs.append(df)

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

Processing claude_3.5_sonnet_results/JGR Biogeosciences - 2022 - Rooney - The Impact of Freeze‐Thaw History on Soil Carbon Response to Experimental Freeze‐Thaw.tsv
Processing claude_3.5_sonnet_results/s41467-023-36515-y.tsv
Processing claude_3.5_sonnet_results/Limnology   Oceanography - 2019 - Linz - Time‐series metatranscriptomes reveal conserved patterns between phototrophic and.tsv
Processing claude_3.5_sonnet_results/s41597-024-03069-7.tsv
Processing claude_3.5_sonnet_results/1-s2.0-S0016706121007540-main.tsv
Processing claude_3.5_sonnet_results/s41564-020-00861-0.tsv
Processing claude_3.5_sonnet_results/bell-et-al-2020-metatranscriptomic-sequencing-of-a-cyanobacterial-soil-surface-consortium-with-and-without-a-diverse.tsv
Processing claude_3.5_sonnet_results/alteio-et-al-2020-complementary-metagenomic-approaches-improve-reconstruction-of-microbial-diversity-in-a-forest-soil.tsv
Processing claude_3.5_sonnet_results/s41597-024-04013-5.tsv
Processing claude_3.5_sonnet_results/microor

In [6]:
# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(all_dfs, ignore_index=True)


In [7]:
combined_df

Unnamed: 0,Number,Item,Vote,Comments,Filename
0,1.0,Structured or unstructured abstract,Yes,Contains both an abstract and plain language s...,JGR Biogeosciences - 2022 - Rooney - The Impac...
1,1.1,Study design,Yes,States this is a laboratory-controlled freeze-...,JGR Biogeosciences - 2022 - Rooney - The Impac...
2,1.2,Environmental & sample information,Yes,Describes permafrost soils from two Alaskan si...,JGR Biogeosciences - 2022 - Rooney - The Impac...
3,1.3,Host information (if applicable),,Study does not involve host organisms.,JGR Biogeosciences - 2022 - Rooney - The Impac...
4,1.4,Experiments & omics methods,Yes,Describes use of FT-ICR-MS analysis to identif...,JGR Biogeosciences - 2022 - Rooney - The Impac...
...,...,...,...,...,...
1670,14.1,Funding,Yes,Detailed funding information provided,Molecular Ecology - 2023 - He - Diversity dis...
1671,15.0,Conflicts of interest,Yes,COI statement included,Molecular Ecology - 2023 - He - Diversity dis...
1672,16.0,Supplementary data & files,Yes,Supplementary tables and files referenced,Molecular Ecology - 2023 - He - Diversity dis...
1673,17.0,Sample & data availability,Yes,Data availability statement with accession num...,Molecular Ecology - 2023 - He - Diversity dis...


In [8]:
combined_df.to_csv("streams_summary.tsv", sep='\t', index=False)

In [9]:
# Create a file-wise summary
file_summary = combined_df.pivot_table(index='Filename', columns='Vote', aggfunc='count', fill_value=0)['Number']
file_summary = file_summary.reset_index()
file_summary.columns.name = None
file_summary = file_summary.rename(columns={'Filename': 'Filename', 'No': 'No Votes', 'Yes': 'Yes Votes', 'NA': 'NA Votes'})


In [10]:
file_summary

Unnamed: 0,Filename,NA Votes,No Votes,Yes Votes
0,1-s2.0-S0016706121007540-main.tsv,12,8,47
1,2022.12.12.520098v2.full.tsv,8,7,52
2,41564_2019_Article_449.tsv,6,6,55
3,BISS_article_20637.tsv,44,6,17
4,JGR Biogeosciences - 2022 - Rooney - The Impac...,10,6,51
5,Limnology Oceanography - 2019 - Linz - Time‐...,8,4,55
6,Molecular Ecology - 2023 - He - Diversity dis...,5,7,55
7,New Phytologist - 2018 - Sasse - Multilab EcoF...,6,5,56
8,Soil_Bacterial_Diversity_Is_Positively_Correla...,1,6,60
9,acp-23-15783-2023.tsv,11,9,47


In [11]:
file_summary.to_csv('streams_summary_by_pdf.tsv', index=False, sep='\t')

In [12]:
# Create a question-wise summary (using Number instead of Item)
question_summary = combined_df.pivot_table(index='Number', columns='Vote', aggfunc='count', fill_value=0)['Item']
question_summary = question_summary.reset_index()
question_summary.columns.name = None
question_summary = question_summary.rename(columns={'Number': 'Question Number', 'No': 'No Votes', 'Yes': 'Yes Votes', 'NA': 'NA Votes'})

# Create a mapping from Number to Item
number_to_item = combined_df.groupby('Number')['Item'].first().to_dict()

# Add the Item column to question_summary using the mapping
question_summary['Item'] = question_summary['Question Number'].map(number_to_item)

# Reorder columns to put 'Item' next to 'Question Number'
question_summary = question_summary[['Question Number', 'Item', 'Yes Votes', 'No Votes', 'NA Votes']]


In [13]:
question_summary

Unnamed: 0,Question Number,Item,Yes Votes,No Votes,NA Votes
0,1.0,Structured or unstructured abstract,25,0,0
1,1.1,Study design,25,0,0
2,1.2,Environmental & sample information,25,0,0
3,1.3,Host information (if applicable),9,0,16
4,1.4,Experiments & omics methods,25,0,0
...,...,...,...,...,...
62,14.1,Funding,24,1,0
63,15.0,Conflicts of interest,19,6,0
64,16.0,Supplementary data & files,21,4,0
65,17.0,Sample & data availability,24,0,1


In [14]:
question_summary.to_csv('streams_summary_by_question.tsv', index=False, sep='\t')