# Description

This notebook reads a pull request (PR) from a manuscript repository and pairs each original paragraph with its modified counterpart.

# Modules

In [1]:
from pathlib import Path

import pandas as pd
from github import Auth, Github
from IPython.display import display
from proj import conf
from proj.utils import process_paragraph

# Settings/paths

In [2]:
# Repository (owner/name) that hosts the manuscript under review.
REPO = "pivlab/manubot-ai-editor-code-test-ccc-manuscript"
# Pull request to analyse; only PR[0] (the PR number) is used in the cells shown here.
# NOTE(review): PR[1] looks like the name of the model that produced the revision — confirm.
PR = (2, "gpt-3.5-turbo")

# Placeholders for the output locations; real values are assigned in the "Parameters" cell.
OUTPUT_FILE_PATH = None
REVERSED_OUTPUT_FILE_PATH = None

In [3]:
# Parameters
# These values replace the None placeholders defined in the settings cell.
# NOTE(review): the paths are absolute and machine-specific; they look like values
# injected by a parameterised run — confirm before reusing this notebook elsewhere.
OUTPUT_FILE_PATH = "/home/miltondp/projects/others/manubot/manubot-ai-editor-code/base/results/paragraph_match/ccc-manuscript--gpt-3.5-turbo.pkl"
REVERSED_OUTPUT_FILE_PATH = "/home/miltondp/projects/others/manubot/manubot-ai-editor-code/base/results/paragraph_match/ccc-manuscript--gpt-3.5-turbo--reversed.pkl"


In [4]:
# Fail early with a clear message when no output path was supplied
# (Path(None) would otherwise raise an opaque TypeError).
assert OUTPUT_FILE_PATH is not None, "OUTPUT_FILE_PATH must be set (see the Parameters cell)"

# Normalise to an absolute path and make sure the destination folder exists.
OUTPUT_FILE_PATH = Path(OUTPUT_FILE_PATH).resolve()
OUTPUT_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
display(OUTPUT_FILE_PATH)

PosixPath('/home/miltondp/projects/others/manubot/manubot-ai-editor-code/base/results/paragraph_match/ccc-manuscript--gpt-3.5-turbo.pkl')

In [5]:
# Fail early with a clear message when no output path was supplied
# (Path(None) would otherwise raise an opaque TypeError).
assert (
    REVERSED_OUTPUT_FILE_PATH is not None
), "REVERSED_OUTPUT_FILE_PATH must be set (see the Parameters cell)"

# Normalise to an absolute path and make sure the destination folder exists.
REVERSED_OUTPUT_FILE_PATH = Path(REVERSED_OUTPUT_FILE_PATH).resolve()
REVERSED_OUTPUT_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
display(REVERSED_OUTPUT_FILE_PATH)

PosixPath('/home/miltondp/projects/others/manubot/manubot-ai-editor-code/base/results/paragraph_match/ccc-manuscript--gpt-3.5-turbo--reversed.pkl')

# Get Repo

In [6]:
# Token-based credentials; the token is read from the project config rather than written here.
auth = Auth.Token(conf.github.API_TOKEN)

In [7]:
# GitHub API client authenticated with `auth`.
g = Github(auth=auth)

In [8]:
# Handle to the manuscript repository named in REPO.
repo = g.get_repo(REPO)

# Get Pull Request

In [9]:
# Fetch the pull request by its number (first element of PR).
pr = repo.get_pull(PR[0])

In [10]:
# Show every file changed by the PR (unfiltered view).
list(pr.get_files())

[File(sha="4b1b8489a63dc51e4eba4d71f32b026169e901d1", filename="content/01.abstract.md"),
 File(sha="96b9d8fdf8b314d5156029ea5e5c0727408d6cee", filename="content/02.introduction.md"),
 File(sha="ad101146fd6476ed4bf1fd4bcbbe8fa9e174c2f0", filename="content/04.05.results_intro.md"),
 File(sha="74ea03690a400f1c576007aff728e41fc7fb0faf", filename="content/04.10.results_comp.md"),
 File(sha="5088c6fa5f7d04ffdf17fd2392cc27c9d917c7a0", filename="content/04.12.results_giant.md"),
 File(sha="e0be1fecd6158445a8102f098545b135abf07fed", filename="content/06.discussion.md"),
 File(sha="328b3aac9742ec34d4d0d0fcc42dca439952433d", filename="content/08.01.methods.ccc.md"),
 File(sha="0c5625cac78916c535fe5f1c404846e52c62b59a", filename="content/08.05.methods.data.md"),
 File(sha="16d0a39afea2c2c361967d01214f79d8bbf2d76d", filename="content/08.15.methods.giant.md"),
 File(sha="81b2c0d4c4fca81a3820bd2ac3bb46efdd7b2ca5", filename="content/08.20.methods.mic.md"),
 File(sha="176e79ecda4017d56f4808a244e70a278

In [11]:
# Materialise the PR's commits as a list so they can be indexed.
pr_commits = list(pr.get_commits())

In [12]:
# Parents of the first PR commit, i.e. the commit(s) the PR's changes start from.
pr_commits[0].parents

[Commit(sha="0adeb9d709cc9d66e52a325c114605655b1b4923")]

In [13]:
# Base revision: first parent of the first PR commit; used as the ref for the "original" text.
pr_prev = pr_commits[0].parents[0].sha
print(pr_prev)

0adeb9d709cc9d66e52a325c114605655b1b4923


In [14]:
# Revision holding the modified manuscript: the head of the PR branch.
# Taking the first PR commit instead would miss edits pushed in later commits;
# for a single-commit PR both refer to the same SHA.
pr_curr = pr.head.sha
print(pr_curr)

bdee3d136aa9e8b6d80b31e926069f9b96e1cac5


# Get file list

In [15]:
# Keep only the Markdown files touched by the PR; later cells pick sections by position in this list.
pr_files = list(
    filter(lambda changed: changed.filename.endswith(".md"), pr.get_files())
)
display(pr_files)

[File(sha="4b1b8489a63dc51e4eba4d71f32b026169e901d1", filename="content/01.abstract.md"),
 File(sha="96b9d8fdf8b314d5156029ea5e5c0727408d6cee", filename="content/02.introduction.md"),
 File(sha="ad101146fd6476ed4bf1fd4bcbbe8fa9e174c2f0", filename="content/04.05.results_intro.md"),
 File(sha="74ea03690a400f1c576007aff728e41fc7fb0faf", filename="content/04.10.results_comp.md"),
 File(sha="5088c6fa5f7d04ffdf17fd2392cc27c9d917c7a0", filename="content/04.12.results_giant.md"),
 File(sha="e0be1fecd6158445a8102f098545b135abf07fed", filename="content/06.discussion.md"),
 File(sha="328b3aac9742ec34d4d0d0fcc42dca439952433d", filename="content/08.01.methods.ccc.md"),
 File(sha="0c5625cac78916c535fe5f1c404846e52c62b59a", filename="content/08.05.methods.data.md"),
 File(sha="16d0a39afea2c2c361967d01214f79d8bbf2d76d", filename="content/08.15.methods.giant.md"),
 File(sha="81b2c0d4c4fca81a3820bd2ac3bb46efdd7b2ca5", filename="content/08.20.methods.mic.md"),
 File(sha="176e79ecda4017d56f4808a244e70a278

# Sections

In [16]:
# Accumulates one (section_name, original_paragraph, modified_paragraph) tuple per matched pair.
paragraph_matches = []

## Abstract

In [17]:
# Label stored with every pair from this section; also used to sanity-check the file name.
section_name = "abstract"

In [18]:
# The file is picked by position in pr_files; the assertion guards against the list order changing.
pr_filename = pr_files[0].filename
assert section_name in pr_filename
print(pr_filename)

content/01.abstract.md


### Original

In [19]:
# Load the file as of the base revision (pr_prev) and decode its raw bytes as UTF-8.
orig_section_content = str(
    repo.get_contents(pr_filename, pr_prev).decoded_content, "utf-8"
)
# Preview the first 50 characters to confirm the expected section was loaded.
print(orig_section_content[:50])

## Abstract {.page_break_before}

Correlation coef


In [20]:
# split by paragraph (chunks are separated by a blank line)
orig_section_paragraphs = orig_section_content.split("\n\n")
# The chunk count is shown so it can be compared with the modified version.
display(len(orig_section_paragraphs))

2

### Modified

In [21]:
# Load the same file as of the PR revision (pr_curr) and decode its raw bytes as UTF-8.
mod_section_content = str(
    repo.get_contents(pr_filename, pr_curr).decoded_content, "utf-8"
)
# Preview the first 50 characters to confirm the expected section was loaded.
print(mod_section_content[:50])

## Abstract {.page_break_before}

In transcriptomi


In [22]:
# split by paragraph (chunks are separated by a blank line)
mod_section_paragraphs = mod_section_content.split("\n\n")
# The chunk count is shown so it can be compared with the original version.
display(len(mod_section_paragraphs))

2

### Match

In [23]:
# Chunk 0 of the original; displayed to check it is the section heading.
orig_section_paragraphs[0]

'## Abstract {.page_break_before}'

In [24]:
# Chunk 0 of the modified version; should be the same heading as in the original.
mod_section_paragraphs[0]

'## Abstract {.page_break_before}'

####  Paragraph 00

In [25]:
# Original paragraph (chunk 1, right after the heading), run through the project's process_paragraph helper.
par0 = process_paragraph(orig_section_paragraphs[1])
print(par0)

Correlation coefficients are widely used to identify patterns in data that may be of particular interest. In transcriptomics, genes with correlated expression often share functions or are part of disease-relevant biological processes. Here we introduce the Clustermatch Correlation Coefficient (CCC), an efficient, easy-to-use and not-only-linear coefficient based on machine learning models. CCC reveals biologically meaningful linear and nonlinear patterns missed by standard, linear-only correlation coefficients. CCC captures general patterns in data by comparing clustering solutions while being much faster than state-of-the-art coefficients such as the Maximal Information Coefficient. When applied to human gene expression data, CCC identifies robust linear relationships while detecting nonlinear patterns associated, for example, with sex differences that are not captured by linear-only coefficients. Gene pairs highly ranked by CCC were enriched for interactions in integrated networks bu

In [26]:
# Modified counterpart (same chunk index), run through the same helper.
par1 = process_paragraph(mod_section_paragraphs[1])
print(par1)

In transcriptomics, identifying patterns in gene expression data is crucial for understanding biological processes. The traditional linear correlation coefficients may not capture all the complexities in gene relationships. To address this, we propose the Clustermatch Correlation Coefficient (CCC), a machine learning-based coefficient that can reveal both linear and nonlinear patterns efficiently. Compared to standard coefficients like the Maximal Information Coefficient, CCC is faster and can detect biologically meaningful patterns that are missed by linear-only methods. When applied to human gene expression data, CCC not only identifies robust linear relationships but also uncovers nonlinear patterns, such as sex differences. Additionally, CCC can detect functional relationships between genes that are overlooked by linear-only methods, as shown by enrichment in integrated networks. Overall, CCC is a versatile and efficient tool that can be applied to genome-scale data and various dat

In [27]:
# Record the (section, original, modified) triple for this paragraph pair.
paragraph_matches.append((section_name, par0, par1))

In [28]:
# Show the pair that was just recorded as a visual check.
display(paragraph_matches[-1])

('abstract',
 'Correlation coefficients are widely used to identify patterns in data that may be of particular interest. In transcriptomics, genes with correlated expression often share functions or are part of disease-relevant biological processes. Here we introduce the Clustermatch Correlation Coefficient (CCC), an efficient, easy-to-use and not-only-linear coefficient based on machine learning models. CCC reveals biologically meaningful linear and nonlinear patterns missed by standard, linear-only correlation coefficients. CCC captures general patterns in data by comparing clustering solutions while being much faster than state-of-the-art coefficients such as the Maximal Information Coefficient. When applied to human gene expression data, CCC identifies robust linear relationships while detecting nonlinear patterns associated, for example, with sex differences that are not captured by linear-only coefficients. Gene pairs highly ranked by CCC were enriched for interactions in integra

## Introduction

In [29]:
# Label stored with every pair from this section; also used to sanity-check the file name.
section_name = "introduction"

In [30]:
# The file is picked by position in pr_files; the assertion guards against the list order changing.
pr_filename = pr_files[1].filename
assert section_name in pr_filename
print(pr_filename)

content/02.introduction.md


### Original

In [31]:
# Load the file as of the base revision (pr_prev) and decode its raw bytes as UTF-8.
orig_section_content = str(
    repo.get_contents(pr_filename, pr_prev).decoded_content, "utf-8"
)
# Preview the first 50 characters to confirm the expected section was loaded.
print(orig_section_content[:50])

## Introduction

New technologies have vastly impr


In [32]:
# split by paragraph (chunks are separated by a blank line)
orig_section_paragraphs = orig_section_content.split("\n\n")
# The chunk count is shown so it can be compared with the modified version.
display(len(orig_section_paragraphs))

4

### Modified

In [33]:
# Load the same file as of the PR revision (pr_curr) and decode its raw bytes as UTF-8.
mod_section_content = str(
    repo.get_contents(pr_filename, pr_curr).decoded_content, "utf-8"
)
# Preview the first 50 characters to confirm the expected section was loaded.
print(mod_section_content[:50])

## Introduction

Recent advancements in data colle


In [34]:
# split by paragraph (chunks are separated by a blank line)
mod_section_paragraphs = mod_section_content.split("\n\n")
# The chunk count is shown so it can be compared with the original version.
display(len(mod_section_paragraphs))

4

### Match

In [35]:
# Chunk 0 of the original; displayed to check it is the section heading.
orig_section_paragraphs[0]

'## Introduction'

In [36]:
# Chunk 0 of the modified version; should be the same heading as in the original.
mod_section_paragraphs[0]

'## Introduction'

####  Paragraph 00

In [37]:
# Original paragraph (chunk 1, right after the heading), run through the project's process_paragraph helper.
par0 = process_paragraph(orig_section_paragraphs[1])
print(par0)

New technologies have vastly improved data collection, generating a deluge of information across different disciplines. This large amount of data provides new opportunities to address unanswered scientific questions, provided we have efficient tools capable of identifying multiple types of underlying patterns. Correlation analysis is an essential statistical technique for discovering relationships between variables [@pmid:21310971]. Correlation coefficients are often used in exploratory data mining techniques, such as clustering or community detection algorithms, to compute a similarity value between a pair of objects of interest such as genes [@pmid:27479844] or disease-relevant lifestyle factors [@doi:10.1073/pnas.1217269109]. Correlation methods are also used in supervised tasks, for example, for feature selection to improve prediction accuracy [@pmid:27006077; @pmid:33729976]. The Pearson correlation coefficient is ubiquitously deployed across application domains and diverse scient

In [38]:
# Modified counterpart (same chunk index), run through the same helper.
par1 = process_paragraph(mod_section_paragraphs[1])
print(par1)

Recent advancements in data collection have led to a significant increase in the amount of information available in various fields. This abundance of data presents an opportunity to explore unanswered scientific questions, but efficient tools are needed to uncover different patterns within the data. Correlation analysis, a fundamental statistical method, is commonly used to identify relationships between variables (Smith et al., 2011). Correlation coefficients play a crucial role in data mining approaches, such as clustering algorithms, to measure the similarity between objects of interest, such as genes (Jones et al., 2016) or lifestyle factors related to diseases (Brown et al., 2013). These methods are also applied in supervised tasks, including feature selection to enhance prediction accuracy (Johnson et al., 2017; White et al., 2021). The widespread use of the Pearson correlation coefficient spans various scientific fields and applications. Therefore, even slight improvements in th

In [39]:
# Record the (section, original, modified) triple for this paragraph pair.
paragraph_matches.append((section_name, par0, par1))

In [40]:
# Show the pair that was just recorded as a visual check.
display(paragraph_matches[-1])

('introduction',
 'New technologies have vastly improved data collection, generating a deluge of information across different disciplines. This large amount of data provides new opportunities to address unanswered scientific questions, provided we have efficient tools capable of identifying multiple types of underlying patterns. Correlation analysis is an essential statistical technique for discovering relationships between variables [@pmid:21310971]. Correlation coefficients are often used in exploratory data mining techniques, such as clustering or community detection algorithms, to compute a similarity value between a pair of objects of interest such as genes [@pmid:27479844] or disease-relevant lifestyle factors [@doi:10.1073/pnas.1217269109]. Correlation methods are also used in supervised tasks, for example, for feature selection to improve prediction accuracy [@pmid:27006077; @pmid:33729976]. The Pearson correlation coefficient is ubiquitously deployed across application domains

####  Paragraph 01

In [41]:
# Original paragraph at chunk 2, run through the project's process_paragraph helper.
par0 = process_paragraph(orig_section_paragraphs[2])
print(par0)

In transcriptomics, many analyses start with estimating the correlation between genes. More sophisticated approaches built on correlation analysis can suggest gene function [@pmid:21241896], aid in discovering common and cell lineage-specific regulatory networks [@pmid:25915600], and capture important interactions in a living organism that can uncover molecular mechanisms in other species [@pmid:21606319; @pmid:16968540]. The analysis of large RNA-seq datasets [@pmid:32913098; @pmid:34844637] can also reveal complex transcriptional mechanisms underlying human diseases [@pmid:27479844; @pmid:31121115; @pmid:30668570; @pmid:32424349; @pmid:34475573]. Since the introduction of the omnigenic model of complex traits [@pmid:28622505; @pmid:31051098], gene-gene relationships are playing an increasingly important role in genetic studies of human diseases [@pmid:34845454; @doi:10.1101/2021.07.05.450786; @doi:10.1101/2021.10.21.21265342; @doi:10.1038/s41588-021-00913-z], even in specific fields 

In [42]:
# Modified counterpart at the same chunk index (both versions split into 4 chunks here).
# NOTE(review): a disabled variant that joined mod_section_paragraphs[2:5] with spaces was
# left in this cell; it is not applied — confirm it is only needed when the modified text
# splits one paragraph into several chunks.
par1 = process_paragraph(mod_section_paragraphs[2])
print(par1)

In the field of transcriptomics, many analyses begin by assessing the correlation between genes. More advanced methods, based on correlation analysis, can provide insights into gene function (Smith et al., 2010), assist in identifying common and cell lineage-specific regulatory networks (Jones et al., 2015), and reveal crucial interactions within living organisms that may shed light on molecular mechanisms in different species (Brown et al., 2011; White et al., 2005). Analyzing extensive RNA-seq datasets (Green et al., 2021; Black et al., 2022) can unveil intricate transcriptional mechanisms linked to human diseases (Gray et al., 2016; Brown et al., 2019; White et al., 2020; Black et al., 2021; Yellow et al., 2023). Following the introduction of the omnigenic model for complex traits (Red et al., 2017; Blue et al., 2019), gene-gene relationships have gained significance in genetic studies of human diseases (Black et al., 2022; Smith et al., 2021; White et al., 2021; Green et al., 2021)

In [43]:
# Record the (section, original, modified) triple for this paragraph pair.
paragraph_matches.append((section_name, par0, par1))

In [44]:
# Show the pair that was just recorded as a visual check.
display(paragraph_matches[-1])

('introduction',
 'In transcriptomics, many analyses start with estimating the correlation between genes. More sophisticated approaches built on correlation analysis can suggest gene function [@pmid:21241896], aid in discovering common and cell lineage-specific regulatory networks [@pmid:25915600], and capture important interactions in a living organism that can uncover molecular mechanisms in other species [@pmid:21606319; @pmid:16968540]. The analysis of large RNA-seq datasets [@pmid:32913098; @pmid:34844637] can also reveal complex transcriptional mechanisms underlying human diseases [@pmid:27479844; @pmid:31121115; @pmid:30668570; @pmid:32424349; @pmid:34475573]. Since the introduction of the omnigenic model of complex traits [@pmid:28622505; @pmid:31051098], gene-gene relationships are playing an increasingly important role in genetic studies of human diseases [@pmid:34845454; @doi:10.1101/2021.07.05.450786; @doi:10.1101/2021.10.21.21265342; @doi:10.1038/s41588-021-00913-z], even 

####  Paragraph 02

In [45]:
# Original paragraph at chunk 3 (the last of the 4 chunks), run through process_paragraph.
par0 = process_paragraph(orig_section_paragraphs[3])
print(par0)

The Pearson and Spearman correlation coefficients are widely used because they reveal intuitive relationships and can be computed quickly. However, they are designed to capture linear or monotonic patterns (referred to as linear-only) and may miss complex yet critical relationships. Novel coefficients have been proposed as metrics that capture nonlinear patterns such as the Maximal Information Coefficient (MIC) [@pmid:22174245] and the Distance Correlation (DC) [@doi:10.1214/009053607000000505]. MIC, in particular, is one of the most commonly used statistics to capture more complex relationships, with successful applications across several domains [@pmid:33972855; @pmid:33001806; @pmid:27006077]. However, the computational complexity makes them impractical for even moderately sized datasets [@pmid:33972855; @pmid:27333001]. Recent implementations of MIC, for example, take several seconds to compute on a single variable pair across a few thousand objects or conditions [@pmid:33972855]. 

In [46]:
# Modified counterpart at the same chunk index, run through the same helper.
par1 = process_paragraph(mod_section_paragraphs[3])
print(par1)

The Pearson and Spearman correlation coefficients are commonly used to identify relationships quickly, but they are limited to linear or monotonic patterns. New coefficients like the Maximal Information Coefficient (MIC) and Distance Correlation (DC) have been introduced to capture more complex nonlinear relationships. However, these coefficients are computationally intensive, making them impractical for larger datasets. Our previous work introduced a clustering method that outperformed traditional coefficients in detecting linear and nonlinear relationships. In this paper, we present the Clustermatch Correlation Coefficient (CCC), a new and efficient metric that can handle both quantitative and qualitative variables. The CCC has a single parameter that controls the complexity of relationships identified and computation time. Our implementation of CCC is highly parallelizable, allowing for fast computation on large datasets. We applied CCC to gene expression data from the GTEx project 

In [47]:
# Record the (section, original, modified) triple for this paragraph pair.
paragraph_matches.append((section_name, par0, par1))

In [48]:
# Show the pair that was just recorded as a visual check.
display(paragraph_matches[-1])

('introduction',
 'The Pearson and Spearman correlation coefficients are widely used because they reveal intuitive relationships and can be computed quickly. However, they are designed to capture linear or monotonic patterns (referred to as linear-only) and may miss complex yet critical relationships. Novel coefficients have been proposed as metrics that capture nonlinear patterns such as the Maximal Information Coefficient (MIC) [@pmid:22174245] and the Distance Correlation (DC) [@doi:10.1214/009053607000000505]. MIC, in particular, is one of the most commonly used statistics to capture more complex relationships, with successful applications across several domains [@pmid:33972855; @pmid:33001806; @pmid:27006077]. However, the computational complexity makes them impractical for even moderately sized datasets [@pmid:33972855; @pmid:27333001]. Recent implementations of MIC, for example, take several seconds to compute on a single variable pair across a few thousand objects or conditions

## Results (intro)

In [49]:
# Label stored with every pair from the results files; also used to sanity-check the file name.
section_name = "results"

In [50]:
# The file is picked by position in pr_files. Several PR files contain "results",
# so the sub-section tag ("intro") is asserted as well.
pr_filename = pr_files[2].filename
assert section_name in pr_filename
assert "intro" in pr_filename
print(pr_filename)

content/04.05.results_intro.md


### Original

In [51]:
# Load the file as of the base revision (pr_prev) and decode its raw bytes as UTF-8.
orig_section_content = str(
    repo.get_contents(pr_filename, pr_prev).decoded_content, "utf-8"
)
# Preview the first 50 characters to confirm the expected section was loaded.
print(orig_section_content[:50])

### A robust and efficient not-only-linear depende


In [52]:
# split by paragraph (chunks are separated by a blank line)
orig_section_paragraphs = orig_section_content.split("\n\n")
# The chunk count is shown so it can be compared with the modified version.
display(len(orig_section_paragraphs))

6

### Modified

In [53]:
# Load the same file as of the PR revision (pr_curr) and decode its raw bytes as UTF-8.
mod_section_content = str(
    repo.get_contents(pr_filename, pr_curr).decoded_content, "utf-8"
)
# Preview the first 50 characters to confirm the expected section was loaded.
print(mod_section_content[:50])

### A robust and efficient not-only-linear depende


In [54]:
# split by paragraph (chunks are separated by a blank line)
mod_section_paragraphs = mod_section_content.split("\n\n")
# The chunk count is shown so it can be compared with the original version.
display(len(mod_section_paragraphs))

6

### Match

In [55]:
# Chunk 0 of the original; displayed to check it is the sub-section heading.
orig_section_paragraphs[0]

'### A robust and efficient not-only-linear dependence coefficient'

In [56]:
# Chunk 0 of the modified version; should be the same heading as in the original.
mod_section_paragraphs[0]

'### A robust and efficient not-only-linear dependence coefficient'

####  Paragraph 00

In [57]:
# Original paragraph at chunk 2, run through the project's process_paragraph helper.
# NOTE(review): chunk 1 is skipped in this section — presumably it is not a text
# paragraph (it is never displayed in this notebook); confirm.
par0 = process_paragraph(orig_section_paragraphs[2])
print(par0)

The CCC provides a similarity measure between any pair of variables, either with numerical or categorical values. The method assumes that if there is a relationship between two variables/features describing $n$ data points/objects, then the **cluster**ings of those objects using each variable should **match**. In the case of numerical values, CCC uses quantiles to efficiently separate data points into different clusters (e.g., the median separates numerical data into two clusters). Once all clusterings are generated according to each variable, we define the CCC as the maximum adjusted Rand index (ARI) [@doi:10.1007/BF01908075] between them, ranging between 0 and 1. Details of the CCC algorithm can be found in [Methods](#sec:ccc_algo).


In [58]:
# Modified counterpart at the same chunk index (chunk 1 is skipped here as well).
par1 = process_paragraph(mod_section_paragraphs[2])
print(par1)

The CCC calculates the similarity between pairs of variables, whether they have numerical or categorical values. It assumes that if two variables describe n data points, the clusters of those points should align. For numerical values, CCC uses quantiles to create clusters (e.g., the median splits data into two clusters). The CCC is defined as the maximum adjusted Rand index between the clusterings, with a range of 0 to 1. More information on the CCC algorithm is available in the Methods section.


In [59]:
# Record the (section, original, modified) triple for this paragraph pair.
paragraph_matches.append((section_name, par0, par1))

In [60]:
# Show the pair that was just recorded as a visual check.
display(paragraph_matches[-1])

('results',
 'The CCC provides a similarity measure between any pair of variables, either with numerical or categorical values. The method assumes that if there is a relationship between two variables/features describing $n$ data points/objects, then the **cluster**ings of those objects using each variable should **match**. In the case of numerical values, CCC uses quantiles to efficiently separate data points into different clusters (e.g., the median separates numerical data into two clusters). Once all clusterings are generated according to each variable, we define the CCC as the maximum adjusted Rand index (ARI) [@doi:10.1007/BF01908075] between them, ranging between 0 and 1. Details of the CCC algorithm can be found in [Methods](#sec:ccc_algo).',
 'The CCC calculates the similarity between pairs of variables, whether they have numerical or categorical values. It assumes that if two variables describe n data points, the clusters of those points should align. For numerical values, CC

####  Paragraph 01

In [61]:
# Original paragraph at chunk 3, run through the project's process_paragraph helper.
par0 = process_paragraph(orig_section_paragraphs[3])
print(par0)

We examined how the Pearson ($p$), Spearman ($s$) and CCC ($c$) correlation coefficients behaved on different simulated data patterns. In the first row of Figure @fig:datasets_rel, we examine the classic Anscombe's quartet [@doi:10.1080/00031305.1973.10478966], which comprises four synthetic datasets with different patterns but the same data statistics (mean, standard deviation and Pearson's correlation). This kind of simulated data, recently revisited with the "Datasaurus" [@url:http://www.thefunctionalart.com/2016/08/download-datasaurus-never-trust-summary.html; @doi:10.1145/3025453.3025912; @doi:10.1111/dsji.12233], is used as a reminder of the importance of going beyond simple statistics, where either undesirable patterns (such as outliers) or desirable ones (such as biologically meaningful nonlinear relationships) can be masked by summary statistics alone.


In [62]:
# Modified counterpart at the same chunk index, run through the same helper.
par1 = process_paragraph(mod_section_paragraphs[3])
print(par1)

We analyzed the behavior of the Pearson ($p$), Spearman ($s$), and CCC ($c$) correlation coefficients using various simulated data patterns. Figure @fig:datasets_rel shows the results for Anscombe's quartet, consisting of four synthetic datasets with different patterns but the same data statistics. This classic dataset highlights the limitations of relying solely on summary statistics, as it can mask both undesirable patterns, like outliers, and desirable ones, such as biologically meaningful nonlinear relationships. The importance of looking beyond simple statistics is further emphasized by the "Datasaurus" dataset, which has been revisited in recent studies.


In [63]:
# Record the (section, original, modified) triple for this paragraph pair.
paragraph_matches.append((section_name, par0, par1))

In [64]:
# Show the pair that was just recorded as a visual check.
display(paragraph_matches[-1])

('results',
 'We examined how the Pearson ($p$), Spearman ($s$) and CCC ($c$) correlation coefficients behaved on different simulated data patterns. In the first row of Figure @fig:datasets_rel, we examine the classic Anscombe\'s quartet [@doi:10.1080/00031305.1973.10478966], which comprises four synthetic datasets with different patterns but the same data statistics (mean, standard deviation and Pearson\'s correlation). This kind of simulated data, recently revisited with the "Datasaurus" [@url:http://www.thefunctionalart.com/2016/08/download-datasaurus-never-trust-summary.html; @doi:10.1145/3025453.3025912; @doi:10.1111/dsji.12233], is used as a reminder of the importance of going beyond simple statistics, where either undesirable patterns (such as outliers) or desirable ones (such as biologically meaningful nonlinear relationships) can be masked by summary statistics alone.',
 'We analyzed the behavior of the Pearson ($p$), Spearman ($s$), and CCC ($c$) correlation coefficients usin

####  Paragraph 02

In [65]:
# Original paragraph at chunk 4, run through the project's process_paragraph helper.
par0 = process_paragraph(orig_section_paragraphs[4])
print(par0)

Anscombe I contains a noisy but clear linear pattern, similar to Anscombe III where the linearity is perfect besides one outlier. In these two examples, CCC separates data points using two clusters (one red line for each variable $x$ and $y$), yielding 1.0 and thus indicating a strong relationship. Anscombe II seems to follow a partially quadratic relationship interpreted as linear by Pearson and Spearman. In contrast, for this potentially undersampled quadratic pattern, CCC yields a lower yet non-zero value of 0.34, reflecting a more complex relationship than a linear pattern. Anscombe IV shows a vertical line of data points where $x$ values are almost constant except for one outlier. This outlier does not influence CCC as it does for Pearson or Spearman. Thus $c=0.00$ (the minimum value) correctly indicates no association for this variable pair because, besides the outlier, for a single value of $x$ there are ten different values for $y$. This pair of variables does not fit the CCC a

In [66]:
# Modified counterpart at the same chunk index, run through the same helper.
par1 = process_paragraph(mod_section_paragraphs[4])
print(par1)

Anscombe I and III display clear linear patterns, with one outlier in Anscombe III. The Correlation Coefficient based on Machine Learning (CCC) accurately separates data points in these examples, yielding a strong relationship value of 1.0. Anscombe II exhibits a partially quadratic relationship, which is interpreted as linear by traditional correlation methods. However, CCC identifies a more complex relationship with a value of 0.34. In Anscombe IV, a vertical line of data points with one outlier is observed. CCC correctly indicates no association with a value of 0.00 due to the outlier and the variation in values for one variable. This demonstrates that CCC is sensitive to deviations from linear patterns. Pearson's correlation coefficient remains consistent across all examples, while Spearman's coefficient varies. Overall, Pearson and Spearman are effective in detecting linear patterns but are less robust when faced with nonlinear relationships or outliers.


In [67]:
# Record the (section, original, modified) triple for this paragraph pair.
paragraph_matches.append((section_name, par0, par1))

In [68]:
# Show the pair that was just recorded as a visual check.
display(paragraph_matches[-1])

('results',
 "Anscombe I contains a noisy but clear linear pattern, similar to Anscombe III where the linearity is perfect besides one outlier. In these two examples, CCC separates data points using two clusters (one red line for each variable $x$ and $y$), yielding 1.0 and thus indicating a strong relationship. Anscombe II seems to follow a partially quadratic relationship interpreted as linear by Pearson and Spearman. In contrast, for this potentially undersampled quadratic pattern, CCC yields a lower yet non-zero value of 0.34, reflecting a more complex relationship than a linear pattern. Anscombe IV shows a vertical line of data points where $x$ values are almost constant except for one outlier. This outlier does not influence CCC as it does for Pearson or Spearman. Thus $c=0.00$ (the minimum value) correctly indicates no association for this variable pair because, besides the outlier, for a single value of $x$ there are ten different values for $y$. This pair of variables does not

####  Paragraph 03

In [69]:
# Original paragraph at chunk 5 (the last of the 6 chunks), run through process_paragraph.
# NOTE(review): a disabled `.replace(" - ", "\n- ")` post-processing step was left on this
# line as a comment; it is not applied — confirm it can be deleted.
par0 = process_paragraph(orig_section_paragraphs[5])
print(par0)

We simulated additional types of relationships (Figure @fig:datasets_rel, second row), including some previously described from gene expression data [@doi:10.1126/science.1205438; @doi:10.3389/fgene.2019.01410; @doi:10.1091/mbc.9.12.3273]. For the random/independent pair of variables, all coefficients correctly agree with a value close to zero. The non-coexistence pattern, captured by all coefficients, represents a case where one gene ($x$) might be expressed while the other one ($y$) is inhibited, highlighting a potentially strong biological relationship (such as a microRNA negatively regulating another gene). For the other two examples (quadratic and two-lines), Pearson and Spearman do not capture the nonlinear pattern between variables $x$ and $y$. These patterns also show how CCC uses different degrees of complexity to capture the relationships. For the quadratic pattern, for example, CCC separates $x$ into more clusters (four in this case) to reach the maximum ARI. The two-lines e

In [70]:
# Modified counterpart at the same chunk index, run through the same helper.
par1 = process_paragraph(mod_section_paragraphs[5])
print(par1)

We examined various types of relationships, as shown in Figure @fig:datasets_rel (second row), including those observed in gene expression data. For a random/independent pair of variables, all coefficients approached zero. The non-coexistence pattern, identified by all coefficients, indicates a scenario where one gene ($x$) may be active while the other ($y$) is suppressed, suggesting a potentially significant biological association (e.g., microRNA regulation). In the case of the quadratic and two-lines examples, Pearson and Spearman coefficients failed to capture the nonlinear relationship between variables $x$ and $y. The CCC method demonstrated its ability to capture different levels of complexity in these relationships. For the quadratic pattern, CCC segmented $x$ into four clusters to optimize the Adjusted Rand Index (ARI). In the two-lines example, where two linear relationships with distinct slopes were present, neither Pearson nor Spearman coefficients detected the pattern ($p=

In [71]:
# Record the (section, original, modified) triple for this paragraph pair.
paragraph_matches.append((section_name, par0, par1))

In [72]:
# Show the pair that was just recorded as a visual check.
display(paragraph_matches[-1])

('results',
 'We simulated additional types of relationships (Figure @fig:datasets_rel, second row), including some previously described from gene expression data [@doi:10.1126/science.1205438; @doi:10.3389/fgene.2019.01410; @doi:10.1091/mbc.9.12.3273]. For the random/independent pair of variables, all coefficients correctly agree with a value close to zero. The non-coexistence pattern, captured by all coefficients, represents a case where one gene ($x$) might be expressed while the other one ($y$) is inhibited, highlighting a potentially strong biological relationship (such as a microRNA negatively regulating another gene). For the other two examples (quadratic and two-lines), Pearson and Spearman do not capture the nonlinear pattern between variables $x$ and $y$. These patterns also show how CCC uses different degrees of complexity to capture the relationships. For the quadratic pattern, for example, CCC separates $x$ into more clusters (four in this case) to reach the maximum ARI. T

## Results (comp)

In [73]:
# set explicitly so this section runs on its own instead of relying on the
# value left over from the previous "results" section
section_name = "results"

In [74]:
# file at index 3 of pr_files; guard that it is the section's "comp" file
pr_filename = pr_files[3].filename
assert all(token in pr_filename for token in (section_name, "comp"))
print(pr_filename)

content/04.10.results_comp.md


### Original

In [75]:
# original version of the section (git ref: pr_prev), decoded from bytes to str
orig_section_content = repo.get_contents(
    pr_filename, ref=pr_prev
).decoded_content.decode("utf-8")
# peek at the first 50 characters
print(orig_section_content[:50])

### The CCC reveals linear and nonlinear patterns 


In [76]:
# split on blank lines; the paragraph indexes used in the Match cells below
# refer to positions in this list
orig_section_paragraphs = orig_section_content.split("\n\n")
display(len(orig_section_paragraphs))

9

### Modified

In [77]:
# modified version of the section (git ref: pr_curr), decoded from bytes to str
mod_section_content = repo.get_contents(
    pr_filename, ref=pr_curr
).decoded_content.decode("utf-8")
# peek at the first 50 characters
print(mod_section_content[:50])

### The CCC reveals linear and nonlinear patterns 


In [78]:
# split on blank lines; the paragraph indexes used in the Match cells below
# refer to positions in this list
mod_section_paragraphs = mod_section_content.split("\n\n")
display(len(mod_section_paragraphs))

9

### Match

In [79]:
# chunk 0 of the original file (the section heading), shown for comparison
orig_section_paragraphs[0]

'### The CCC reveals linear and nonlinear patterns in human transcriptomic data'

In [80]:
# chunk 0 of the modified file (the section heading), shown for comparison
mod_section_paragraphs[0]

'### The CCC reveals linear and nonlinear patterns in human transcriptomic data'

####  Paragraph 00

In [81]:
# original side of this match: chunk 2 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[2])
print(par0)

We examined the distribution of each coefficient's absolute values in GTEx (Figure @fig:dist_coefs). CCC (mean=0.14, median=0.08, sd=0.15) has a much more skewed distribution than Pearson (mean=0.31, median=0.24, sd=0.24) and Spearman (mean=0.39, median=0.37, sd=0.26). The coefficients reach a cumulative set containing 70% of gene pairs at different values (Figure @fig:dist_coefs b), $c=0.18$, $p=0.44$ and $s=0.56$, suggesting that for this type of data, the coefficients are not directly comparable by magnitude, so we used ranks for further comparisons. In GTEx v8, CCC values were closer to Spearman and vice versa than either was to Pearson (Figure @fig:dist_coefs c). We also compared the Maximal Information Coefficient (MIC) in this data (see [Supplementary Note 1](#sec:mic)). We found that CCC behaved very similarly to MIC, although CCC was up to two orders of magnitude faster to run (see [Supplementary Note 2](#sec:time_test)). MIC, an advanced correlation coefficient able to captur

In [82]:
# modified side of this match: chunk 2 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par1 = process_paragraph(mod_section_paragraphs[2])
print(par1)

We analyzed the distribution of absolute values of each coefficient in GTEx, as shown in Figure 1. The CCC (mean=0.14, median=0.08, sd=0.15) exhibited a more skewed distribution compared to Pearson (mean=0.31, median=0.24, sd=0.24) and Spearman (mean=0.39, median=0.37, sd=0.26). The cumulative set containing 70% of gene pairs had different values for CCC, Pearson, and Spearman (Figure 1b), with values of $c=0.18$, $p=0.44$, and $s=0.56, indicating that direct magnitude comparisons are not suitable for this data type. Therefore, ranks were used for further analysis. In GTEx v8, CCC values were closer to Spearman than to Pearson (Figure 1c). Additionally, the Maximal Information Coefficient (MIC) was compared in this data (see Supplementary Note 1). CCC showed similar behavior to MIC, but with significantly faster processing times (see Supplementary Note 2). MIC, a correlation coefficient capable of capturing general patterns beyond linear relationships, has been successfully applied in 

In [83]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [84]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('results',
 "We examined the distribution of each coefficient's absolute values in GTEx (Figure @fig:dist_coefs). CCC (mean=0.14, median=0.08, sd=0.15) has a much more skewed distribution than Pearson (mean=0.31, median=0.24, sd=0.24) and Spearman (mean=0.39, median=0.37, sd=0.26). The coefficients reach a cumulative set containing 70% of gene pairs at different values (Figure @fig:dist_coefs b), $c=0.18$, $p=0.44$ and $s=0.56$, suggesting that for this type of data, the coefficients are not directly comparable by magnitude, so we used ranks for further comparisons. In GTEx v8, CCC values were closer to Spearman and vice versa than either was to Pearson (Figure @fig:dist_coefs c). We also compared the Maximal Information Coefficient (MIC) in this data (see [Supplementary Note 1](#sec:mic)). We found that CCC behaved very similarly to MIC, although CCC was up to two orders of magnitude faster to run (see [Supplementary Note 2](#sec:time_test)). MIC, an advanced correlation coefficient 

####  Paragraph 01

In [85]:
# original side of this match: chunk 4 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[4])
print(par0)

A closer inspection of gene pairs that were either prioritized or disregarded by these coefficients revealed that they captured different patterns. We analyzed the agreements and disagreements by obtaining, for each coefficient, the top 30% of gene pairs with the largest correlation values ("high" set) and the bottom 30% ("low" set), resulting in six potentially overlapping categories. For most cases (76.4%), an UpSet analysis [@doi:10.1109/TVCG.2014.2346248] (Figure @fig:upsetplot_coefs a) showed that the three coefficients agreed on whether there is a strong correlation (42.1%) or there is no relationship (34.3%). Since Pearson and Spearman are linear-only, and CCC can also capture these patterns, we expect that these concordant gene pairs represent clear linear patterns. CCC and Spearman agree more on either highly or poorly correlated pairs (4.0% in "high", and 7.0% in "low") than any of these with Pearson (all between 0.3%-3.5% for "high", and 2.8%-5.5% for "low"). In summary, CCC

In [86]:
# modified side of this match: chunk 4 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par1 = process_paragraph(mod_section_paragraphs[4])
print(par1)

An examination of gene pairs prioritized or disregarded by the correlation coefficients showed that they captured different patterns. We compared the top 30% and bottom 30% of gene pairs based on correlation values for each coefficient, resulting in six overlapping categories. Most cases (76.4%) agreed on strong correlation (42.1%) or no relationship (34.3%), as shown in an UpSet analysis (Figure 1a). Since Pearson and Spearman are linear-only, and CCC can also capture these patterns, concordant gene pairs likely represent clear linear patterns. CCC and Spearman agree more on highly or poorly correlated pairs (4.0% in "high", and 7.0% in "low") than with Pearson (0.3%-3.5% for "high", and 2.8%-5.5% for "low"). In summary, CCC agrees with either Pearson or Spearman in 90.5% of gene pairs by assigning high or low correlation values.


In [87]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [88]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('results',
 'A closer inspection of gene pairs that were either prioritized or disregarded by these coefficients revealed that they captured different patterns. We analyzed the agreements and disagreements by obtaining, for each coefficient, the top 30% of gene pairs with the largest correlation values ("high" set) and the bottom 30% ("low" set), resulting in six potentially overlapping categories. For most cases (76.4%), an UpSet analysis [@doi:10.1109/TVCG.2014.2346248] (Figure @fig:upsetplot_coefs a) showed that the three coefficients agreed on whether there is a strong correlation (42.1%) or there is no relationship (34.3%). Since Pearson and Spearman are linear-only, and CCC can also capture these patterns, we expect that these concordant gene pairs represent clear linear patterns. CCC and Spearman agree more on either highly or poorly correlated pairs (4.0% in "high", and 7.0% in "low") than any of these with Pearson (all between 0.3%-3.5% for "high", and 2.8%-5.5% for "low"). I

####  Paragraph 02

In [89]:
# original side of this match: chunk 6 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[6])
print(par0)

While there was broad agreement, more than 20,000 gene pairs with a high CCC value were not highly ranked by the other coefficients (right part of Figure @fig:upsetplot_coefs a). There were also gene pairs with a high Pearson value and either low CCC (1,075), low Spearman (87) or both low CCC and low Spearman values (531). However, our examination suggests that many of these cases appear to be driven by potential outliers (Figure @fig:upsetplot_coefs b, and analyzed later). We analyzed gene pairs among the top five of each intersection in the "Disagreements" group (Figure @fig:upsetplot_coefs a, right) where CCC disagrees with Pearson, Spearman or both.


In [90]:
# modified side of this match: chunk 6 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par1 = process_paragraph(mod_section_paragraphs[6])
print(par1)

While the majority of gene pairs showed agreement, over 20,000 gene pairs with high CCC values were not ranked highly by other coefficients (Figure 1a). Some gene pairs had high Pearson values but low CCC (1,075), low Spearman (87), or low values for both CCC and Spearman (531). Our analysis indicates that many of these discrepancies may be due to potential outliers (Figure 1b). We focused on gene pairs in the top five of each intersection within the "Disagreements" group (Figure 1a, right) where CCC disagreed with Pearson, Spearman, or both.


In [91]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [92]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('results',
 'While there was broad agreement, more than 20,000 gene pairs with a high CCC value were not highly ranked by the other coefficients (right part of Figure @fig:upsetplot_coefs a). There were also gene pairs with a high Pearson value and either low CCC (1,075), low Spearman (87) or both low CCC and low Spearman values (531). However, our examination suggests that many of these cases appear to be driven by potential outliers (Figure @fig:upsetplot_coefs b, and analyzed later). We analyzed gene pairs among the top five of each intersection in the "Disagreements" group (Figure @fig:upsetplot_coefs a, right) where CCC disagrees with Pearson, Spearman or both.',
 'While the majority of gene pairs showed agreement, over 20,000 gene pairs with high CCC values were not ranked highly by other coefficients (Figure 1a). Some gene pairs had high Pearson values but low CCC (1,075), low Spearman (87), or low values for both CCC and Spearman (531). Our analysis indicates that many of thes

####  Paragraph 03

In [93]:
# original side of this match: chunk 8 (the last of 9) of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[8])
print(par0)

The first three gene pairs at the top (*IFNG* - *SDS*, *JUN* - *APOC1*, and *ZDHHC12* - *CCL18*), with high CCC and low Pearson values, appear to follow a non-coexistence relationship: in samples where one of the genes is highly (slightly) expressed, the other is slightly (highly) activated, suggesting a potentially inhibiting effect. The following three gene pairs (*UTY* - *KDM6A*, *RASSF2* - *CYTIP*, and *AC068580.6* - *KLHL21*) follow patterns combining either two linear or one linear and one independent relationships. In particular, genes *UTY* and *KDM6A* (paralogs) show a nonlinear relationship where a subset of samples follows a robust linear pattern and another subset has a constant (independent) expression of one gene. This relationship is explained by the fact that *UTY* is in chromosome Y (Yq11) whereas *KDM6A* is in chromosome X (Xp11), and samples with a linear pattern are males, whereas those with no expression for *UTY* are females. This combination of linear and indepen

In [94]:
# modified side of this match: chunk 8 (the last of 9) of the blank-line split,
# passed through process_paragraph (from proj.utils)
par1 = process_paragraph(mod_section_paragraphs[8])
print(par1)

The top three gene pairs (*IFNG* - *SDS*, *JUN* - *APOC1*, and *ZDHHC12* - *CCL18*) exhibit high CCC and low Pearson values, suggesting a non-coexistence relationship where one gene is highly expressed while the other is activated to a lesser extent, indicating a potential inhibiting effect. The subsequent three gene pairs (*UTY* - *KDM6A*, *RASSF2* - *CYTIP*, and *AC068580.6* - *KLHL21*) display combinations of linear or independent relationships. For instance, *UTY* and *KDM6A* (paralogs) show a nonlinear relationship where some samples exhibit a robust linear pattern while others have constant expression levels for one gene. This discrepancy is due to the location of *UTY* on chromosome Y and *KDM6A* on chromosome X, resulting in males showing a linear pattern and females lacking expression of *UTY*. This combination of linear and independent patterns is captured by CCC ($c=0.29$) but not by Pearson ($p=0.24$) or Spearman ($s=0.10$). Additionally, this gene pair pattern is consisten

In [95]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [96]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('results',
 'The first three gene pairs at the top (*IFNG* - *SDS*, *JUN* - *APOC1*, and *ZDHHC12* - *CCL18*), with high CCC and low Pearson values, appear to follow a non-coexistence relationship: in samples where one of the genes is highly (slightly) expressed, the other is slightly (highly) activated, suggesting a potentially inhibiting effect. The following three gene pairs (*UTY* - *KDM6A*, *RASSF2* - *CYTIP*, and *AC068580.6* - *KLHL21*) follow patterns combining either two linear or one linear and one independent relationships. In particular, genes *UTY* and *KDM6A* (paralogs) show a nonlinear relationship where a subset of samples follows a robust linear pattern and another subset has a constant (independent) expression of one gene. This relationship is explained by the fact that *UTY* is in chromosome Y (Yq11) whereas *KDM6A* is in chromosome X (Xp11), and samples with a linear pattern are males, whereas those with no expression for *UTY* are females. This combination of line

## Results (giant)

In [97]:
# set explicitly so this section runs on its own instead of relying on the
# value left over from the previous "results" sections
section_name = "results"

In [98]:
# file at index 4 of pr_files; guard that it is the section's "giant" file
pr_filename = pr_files[4].filename
assert all(token in pr_filename for token in (section_name, "giant"))
print(pr_filename)

content/04.12.results_giant.md


### Original

In [99]:
# original version of the section (git ref: pr_prev), decoded from bytes to str
orig_section_content = repo.get_contents(
    pr_filename, ref=pr_prev
).decoded_content.decode("utf-8")
# peek at the first 50 characters
print(orig_section_content[:50])

### Replication of gene associations using tissue-


In [100]:
# split on blank lines; the paragraph indexes used in the Match cells below
# refer to positions in this list
orig_section_paragraphs = orig_section_content.split("\n\n")
display(len(orig_section_paragraphs))

4

### Modified

In [101]:
# modified version of the section (git ref: pr_curr), decoded from bytes to str
mod_section_content = repo.get_contents(
    pr_filename, ref=pr_curr
).decoded_content.decode("utf-8")
# peek at the first 50 characters
print(mod_section_content[:50])

### Replication of gene associations using tissue-


In [102]:
# split on blank lines; the paragraph indexes used in the Match cells below
# refer to positions in this list
mod_section_paragraphs = mod_section_content.split("\n\n")
display(len(mod_section_paragraphs))

4

### Match

In [103]:
# chunk 0 of the original file (the section heading), shown for comparison
orig_section_paragraphs[0]

'### Replication of gene associations using tissue-specific gene networks from GIANT'

In [104]:
# chunk 0 of the modified file (the section heading), shown for comparison
mod_section_paragraphs[0]

'### Replication of gene associations using tissue-specific gene networks from GIANT'

####  Paragraph 00

In [105]:
# original side of this match: chunk 1 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[1])
print(par0)

We sought to systematically analyze discrepant scores to assess whether associations were replicated in other datasets besides GTEx. This is challenging and prone to bias because linear-only correlation coefficients are usually used in gene co-expression analyses. We used 144 tissue-specific gene networks from the Genome-wide Analysis of gene Networks in Tissues (GIANT) [@pmcid:PMC4828725; @url:https://hb.flatironinstitute.org], where nodes represent genes and each edge a functional relationship weighted with a probability of interaction between two genes (see [Methods](#sec:giant)). Importantly, the version of GIANT used in this study did not include GTEx samples [@url:https://hb.flatironinstitute.org/data], making it an ideal case for replication. These networks were built from expression and different interaction measurements, including protein-interaction, transcription factor regulation, chemical/genetic perturbations and microRNA target profiles from the Molecular Signatures Data

In [106]:
# modified side of this match: chunk 1 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par1 = process_paragraph(mod_section_paragraphs[1])
print(par1)

We analyzed discrepant scores to determine if associations were consistent across datasets beyond GTEx. Using 144 tissue-specific gene networks from GIANT, where nodes represent genes and edges represent functional relationships weighted with probabilities of interaction, we examined gene pairs in whole blood (Figure 1). These networks were constructed from various interaction measurements, including protein-interaction, transcription factor regulation, chemical/genetic perturbations, and microRNA target profiles from MSigDB. Highly-ranked gene pairs in blood that showed real patterns were expected to replicate in related tissues or cell lineages using GIANT's multi-cell type functional interaction networks. For example, gene pairs *RASSF2* - *CYTIP* with a high CCC value and gene pairs *MYOZ1* - *TNNI2* with high Pearson values were analyzed. The networks revealed strong connections for *RASSF2* - *CYTIP* in blood and leukocytes, while *MYOZ1* - *TNNI2* showed weaker connections in sk

In [107]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [108]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('results',
 "We sought to systematically analyze discrepant scores to assess whether associations were replicated in other datasets besides GTEx. This is challenging and prone to bias because linear-only correlation coefficients are usually used in gene co-expression analyses. We used 144 tissue-specific gene networks from the Genome-wide Analysis of gene Networks in Tissues (GIANT) [@pmcid:PMC4828725; @url:https://hb.flatironinstitute.org], where nodes represent genes and each edge a functional relationship weighted with a probability of interaction between two genes (see [Methods](#sec:giant)). Importantly, the version of GIANT used in this study did not include GTEx samples [@url:https://hb.flatironinstitute.org/data], making it an ideal case for replication. These networks were built from expression and different interaction measurements, including protein-interaction, transcription factor regulation, chemical/genetic perturbations and microRNA target profiles from the Molecular S

####  Paragraph 01

In [109]:
# original side of this match: chunk 3 (the last of 4) of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[3])
print(par0)

We next performed a systematic evaluation using the top 100 discrepant gene pairs between CCC and the other two coefficients. For each gene pair prioritized in GTEx (whole blood), we autodetected a relevant cell type using GIANT to assess whether genes were predicted to be specifically expressed in a blood-relevant cell lineage. For this, we used the top five most commonly autodetected cell types for each coefficient and assessed connectivity in the resulting networks (see [Methods](#sec:giant)). The top 5 predicted cell types for gene pairs highly ranked by CCC and not by the rest were all blood-specific (Figure @fig:giant_gene_pairs c, top left), including macrophage, leukocyte, natural killer cell, blood and mononuclear phagocyte. The average probability of interaction between genes in these CCC-ranked networks was significantly higher than the other coefficients (Figure @fig:giant_gene_pairs c, top right), with all medians larger than 67% and first quartiles above 41% across predic

In [110]:
# modified side of this match: chunk 3 (the last of 4) of the blank-line split,
# passed through process_paragraph (from proj.utils)
par1 = process_paragraph(mod_section_paragraphs[3])
print(par1)

We systematically evaluated the top 100 discrepant gene pairs between CCC and two other coefficients. Using GTEx data for whole blood, we identified relevant cell types for each gene pair through GIANT analysis. The top five predicted cell types for gene pairs highly ranked by CCC were all blood-specific, including macrophage, leukocyte, natural killer cell, blood, and mononuclear phagocyte (Figure 1c, top left). The average probability of interaction between genes in CCC-ranked networks was significantly higher compared to other coefficients, with medians above 67% and first quartiles above 41% across predicted cell types (Figure 1c, top right). In contrast, most Pearson's gene pairs were predicted to be specific to non-blood tissues, with skeletal muscle being the most common prediction (Figure 1c, bottom left). Interaction probabilities in Pearson-ranked networks were generally lower than CCC, except for blood-specific gene pairs (Figure 1c, bottom right). The associations exclusive

In [111]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [112]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('results',
 "We next performed a systematic evaluation using the top 100 discrepant gene pairs between CCC and the other two coefficients. For each gene pair prioritized in GTEx (whole blood), we autodetected a relevant cell type using GIANT to assess whether genes were predicted to be specifically expressed in a blood-relevant cell lineage. For this, we used the top five most commonly autodetected cell types for each coefficient and assessed connectivity in the resulting networks (see [Methods](#sec:giant)). The top 5 predicted cell types for gene pairs highly ranked by CCC and not by the rest were all blood-specific (Figure @fig:giant_gene_pairs c, top left), including macrophage, leukocyte, natural killer cell, blood and mononuclear phagocyte. The average probability of interaction between genes in these CCC-ranked networks was significantly higher than the other coefficients (Figure @fig:giant_gene_pairs c, top right), with all medians larger than 67% and first quartiles above 41%

## Discussion

In [113]:
# label stored with every match of this section; it is also checked against
# the PR file name in the next cell
section_name = "discussion"

In [114]:
# file at index 5 of pr_files; the assert guards against picking a file that
# does not belong to the current section
pr_filename = pr_files[5].filename
assert section_name in pr_filename
print(pr_filename)

content/06.discussion.md


### Original

In [115]:
# original version of the section (git ref: pr_prev), decoded from bytes to str
orig_section_content = repo.get_contents(
    pr_filename, ref=pr_prev
).decoded_content.decode("utf-8")
# peek at the first 50 characters
print(orig_section_content[:50])

## Discussion

We introduce the Clustermatch Corre


In [116]:
# split on blank lines; the paragraph indexes used in the Match cells below
# refer to positions in this list
orig_section_paragraphs = orig_section_content.split("\n\n")
display(len(orig_section_paragraphs))

7

### Modified

In [117]:
# modified version of the section (git ref: pr_curr), decoded from bytes to str
mod_section_content = repo.get_contents(
    pr_filename, ref=pr_curr
).decoded_content.decode("utf-8")
# peek at the first 50 characters
print(mod_section_content[:50])

## Discussion

We present the Clustermatch Correla


In [118]:
# split on blank lines; the paragraph indexes used in the Match cells below
# refer to positions in this list (its length differs from the original's
# here, so indexes on the two sides do not line up one-to-one)
mod_section_paragraphs = mod_section_content.split("\n\n")
display(len(mod_section_paragraphs))

10

### Match

In [119]:
# chunk 0 of the original file (the section heading), shown for comparison
orig_section_paragraphs[0]

'## Discussion'

In [120]:
# chunk 0 of the modified file (the section heading), shown for comparison
mod_section_paragraphs[0]

'## Discussion'

####  Paragraph 00

In [121]:
# original side of this match: chunk 1 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[1])
print(par0)

We introduce the Clustermatch Correlation Coefficient (CCC), an efficient not-only-linear machine learning-based statistic. Applying CCC to GTEx v8 revealed that it was robust to outliers and detected linear relationships as well as complex and biologically meaningful patterns that standard coefficients missed. In particular, CCC alone detected gene pairs with complex nonlinear patterns from the sex chromosomes, highlighting the way that not-only-linear coefficients can play in capturing sex-specific differences. The ability to capture these nonlinear patterns, however, extends beyond sex differences: it provides a powerful approach to detect complex relationships where a subset of samples or conditions are explained by other factors (such as differences between health and disease). We found that top CCC-ranked gene pairs in whole blood from GTEx were replicated in independent tissue-specific networks trained from multiple data types and attributed to cell lineages from blood, even tho

In [122]:
# modified side of this match: a slice (chunks 1-3) rather than a single
# chunk, matched against one original paragraph.
# NOTE(review): presumably process_paragraph joins a list of chunks into one
# string -- confirm against proj.utils
par1 = process_paragraph(mod_section_paragraphs[1:4])
print(par1)

We present the Clustermatch Correlation Coefficient (CCC), a machine learning-based statistic that is not only efficient but also capable of capturing nonlinear relationships. When applied to GTEx v8 data, CCC demonstrated robustness to outliers and identified both linear and complex biologically relevant patterns that traditional coefficients overlooked. Specifically, CCC was able to identify gene pairs with intricate nonlinear patterns on the sex chromosomes, illustrating its ability to capture sex-specific differences. This capability extends beyond sex differences and provides a powerful tool for detecting complex relationships, such as those between health and disease, where certain samples or conditions are influenced by other factors. Our analysis of CCC's performance on GTEx data showed that top-ranked gene pairs in whole blood were consistent with tissue-specific networks built from diverse data types and cell lineage information, despite CCC not having access to this specific

In [123]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [124]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('discussion',
 'We introduce the Clustermatch Correlation Coefficient (CCC), an efficient not-only-linear machine learning-based statistic. Applying CCC to GTEx v8 revealed that it was robust to outliers and detected linear relationships as well as complex and biologically meaningful patterns that standard coefficients missed. In particular, CCC alone detected gene pairs with complex nonlinear patterns from the sex chromosomes, highlighting the way that not-only-linear coefficients can play in capturing sex-specific differences. The ability to capture these nonlinear patterns, however, extends beyond sex differences: it provides a powerful approach to detect complex relationships where a subset of samples or conditions are explained by other factors (such as differences between health and disease). We found that top CCC-ranked gene pairs in whole blood from GTEx were replicated in independent tissue-specific networks trained from multiple data types and attributed to cell lineages fro

####  Paragraph 01

In [125]:
# original side of this match: chunk 2 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[2])
print(par0)

Datasets such as Anscombe or "Datasaurus" highlight the value of visualization instead of relying on simple data summaries. While visual analysis is helpful, for many datasets examining each possible relationship is infeasible, and this is where more sophisticated and robust correlation coefficients are necessary. Advanced yet interpretable coefficients like CCC can focus human interpretation on patterns that are more likely to reflect real biology. The complexity of these patterns might reflect heterogeneity in samples that mask clear relationships between variables. For example, genes *UTY* - *KDM6A* (from sex chromosomes), detected by CCC, have a strong linear relationship but only in a subset of samples (males), which was not captured by linear-only coefficients. This example, in particular, highlights the importance of considering sex as a biological variable (SABV) [@doi:10.1038/509282a] to avoid overlooking important differences between men and women, for instance, in disease ma

In [126]:
# modified side of this match: chunk 4 (the index differs from the original
# side because the previous modified paragraph used chunks 1-3)
par1 = process_paragraph(mod_section_paragraphs[4])
print(par1)

Datasets like Anscombe or "Datasaurus" demonstrate the importance of visualization over basic data summaries. Visual analysis is beneficial, but for many datasets, exploring every possible relationship is impractical. This is where more advanced and robust correlation coefficients become essential. Coefficients like CCC can guide human interpretation towards patterns that likely reflect actual biological phenomena. The complexity of these patterns may indicate sample heterogeneity that obscures clear relationships between variables. For instance, genes *UTY* - *KDM6A* (from sex chromosomes) exhibit a strong linear relationship in a specific subset of samples (males), which traditional linear coefficients fail to capture. This example emphasizes the significance of considering sex as a biological variable (SABV) to prevent overlooking critical differences between genders, such as in disease presentations. In a broader sense, a correlation coefficient like CCC, which is not solely linear

In [127]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [128]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('discussion',
 'Datasets such as Anscombe or "Datasaurus" highlight the value of visualization instead of relying on simple data summaries. While visual analysis is helpful, for many datasets examining each possible relationship is infeasible, and this is where more sophisticated and robust correlation coefficients are necessary. Advanced yet interpretable coefficients like CCC can focus human interpretation on patterns that are more likely to reflect real biology. The complexity of these patterns might reflect heterogeneity in samples that mask clear relationships between variables. For example, genes *UTY* - *KDM6A* (from sex chromosomes), detected by CCC, have a strong linear relationship but only in a subset of samples (males), which was not captured by linear-only coefficients. This example, in particular, highlights the importance of considering sex as a biological variable (SABV) [@doi:10.1038/509282a] to avoid overlooking important differences between men and women, for instan

####  Paragraph 02

In [129]:
# original side of this match: chunk 3 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par0 = process_paragraph(orig_section_paragraphs[3])
print(par0)

It is well-known that biomedical research is biased towards a small fraction of human genes [@pmid:17620606; @pmid:17472739]. Some genes highlighted in CCC-ranked pairs (Figure @fig:upsetplot_coefs b), such as *SDS* (12q24) and *ZDHHC12* (9q34), were previously found to be the focus of fewer than expected publications [@pmid:30226837]. It is possible that the widespread use of linear coefficients may bias researchers away from genes with complex coexpression patterns. A beyond-linear gene co-expression analysis on large compendia might shed light on the function of understudied genes. For example, gene *KLHL21* (1p36) and *AC068580.6* (*ENSG00000235027*, in 11p15) have a high CCC value and are missed by the other coefficients. *KLHL21* was suggested as a potential therapeutic target for hepatocellular carcinoma [@pmid:27769251] and other cancers [@pmid:29574153; @pmid:35084622]. Its nonlinear correlation with *AC068580.6* might unveil other important players in cancer initiation or pro

In [130]:
# modified side of this match: chunk 5 of the blank-line split,
# passed through process_paragraph (from proj.utils)
par1 = process_paragraph(mod_section_paragraphs[5])
print(par1)

Biomedical research tends to focus on a small number of human genes, as indicated by previous studies [@pmid:17620606; @pmid:17472739]. Some genes identified in the CCC-ranked pairs (see Figure 1b) have received less attention in the literature than expected, such as *SDS* (12q24) and *ZDHHC12* (9q34) [@pmid:30226837]. This lack of attention may be due to the common use of linear coefficients, which could overlook genes with complex coexpression patterns. Conducting gene co-expression analyses beyond linear methods on large datasets could provide insights into the functions of underexplored genes. For instance, genes like *KLHL21* (1p36) and *AC068580.6* (*ENSG00000235027*, in 11p15) show high CCC values but are not captured by other coefficients. Previous studies have suggested *KLHL21* as a potential therapeutic target for hepatocellular carcinoma [@pmid:27769251] and other cancers [@pmid:29574153; @pmid:35084622]. Exploring its nonlinear correlation with *AC068580.6* could reveal ad

In [131]:
# record the (section, original, modified) triple for this paragraph
paragraph_matches.append((section_name, par0, par1))

In [132]:
# sanity check: show the most recent entry of paragraph_matches
display(paragraph_matches[-1])

('discussion',
 'It is well-known that biomedical research is biased towards a small fraction of human genes [@pmid:17620606; @pmid:17472739]. Some genes highlighted in CCC-ranked pairs (Figure @fig:upsetplot_coefs b), such as *SDS* (12q24) and *ZDHHC12* (9q34), were previously found to be the focus of fewer than expected publications [@pmid:30226837]. It is possible that the widespread use of linear coefficients may bias researchers away from genes with complex coexpression patterns. A beyond-linear gene co-expression analysis on large compendia might shed light on the function of understudied genes. For example, gene *KLHL21* (1p36) and *AC068580.6* (*ENSG00000235027*, in 11p15) have a high CCC value and are missed by the other coefficients. *KLHL21* was suggested as a potential therapeutic target for hepatocellular carcinoma [@pmid:27769251] and other cancers [@pmid:29574153; @pmid:35084622]. Its nonlinear correlation with *AC068580.6* might unveil other important players in cancer 

####  Paragraph 03

In [133]:
# original side of the match: chunk 4 of the original section split
par0 = process_paragraph(orig_section_paragraphs[4])
print(par0)

Not-only-linear correlation coefficients might also be helpful in the field of genetic studies. In this context, genome-wide association studies (GWAS) have been successful in understanding the molecular basis of common diseases by estimating the association between genotype and phenotype [@doi:10.1016/j.ajhg.2017.06.005]. However, the estimated effect sizes of genes identified with GWAS are generally modest, and they explain only a fraction of the phenotype variance, hampering the clinical translation of these findings [@doi:10.1038/s41576-019-0127-1]. Recent theories, like the omnigenic model for complex traits [@pmid:28622505; @pmid:31051098], argue that these observations are explained by highly-interconnected gene regulatory networks, with some core genes having a more direct effect on the phenotype than others. Using this omnigenic perspective, we and others [@doi:10.1101/2021.07.05.450786; @doi:10.1186/s13040-020-00216-9; @doi:10.1101/2021.10.21.21265342] have shown that integra

In [134]:
# modified side of the match: the slice 6:8 passes chunks 6 and 7 together as a list
# NOTE(review): process_paragraph presumably merges a list of chunks into one string — confirm in proj.utils
par1 = process_paragraph(mod_section_paragraphs[6:8])
print(par1)

Not-only-linear correlation coefficients could benefit genetic studies, particularly in genome-wide association studies (GWAS) that aim to understand the genetic basis of common diseases by examining the relationship between genotype and phenotype (Smith et al., 2017). However, genes identified through GWAS typically have modest effects and only explain a small portion of the phenotype variance, limiting their clinical implications (Jones et al., 2019). Recent theories, such as the omnigenic model, propose that interconnected gene regulatory networks, where core genes have a more direct impact on the phenotype, can explain these observations (Brown et al., 2017; White et al., 2021). From an omnigenic perspective, integrating gene co-expression networks into genetic studies has the potential to identify core genes overlooked by linear models like GWAS (Black et al., 2021; Green et al., 2020; Red et al., 2021). Our findings suggest that using more advanced and efficient correlation coeff

In [135]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [136]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('discussion',
 'Not-only-linear correlation coefficients might also be helpful in the field of genetic studies. In this context, genome-wide association studies (GWAS) have been successful in understanding the molecular basis of common diseases by estimating the association between genotype and phenotype [@doi:10.1016/j.ajhg.2017.06.005]. However, the estimated effect sizes of genes identified with GWAS are generally modest, and they explain only a fraction of the phenotype variance, hampering the clinical translation of these findings [@doi:10.1038/s41576-019-0127-1]. Recent theories, like the omnigenic model for complex traits [@pmid:28622505; @pmid:31051098], argue that these observations are explained by highly-interconnected gene regulatory networks, with some core genes having a more direct effect on the phenotype than others. Using this omnigenic perspective, we and others [@doi:10.1101/2021.07.05.450786; @doi:10.1186/s13040-020-00216-9; @doi:10.1101/2021.10.21.21265342] have s

####  Paragraph 04

In [137]:
# original side of the match: chunk 5 of the original section split
par0 = process_paragraph(orig_section_paragraphs[5])
print(par0)

Our analyses have some limitations. We worked on a sample with the top variable genes to keep computation time feasible. Although CCC is much faster than MIC, Pearson and Spearman are still the most computationally efficient since they only rely on simple data statistics. Our results, however, reveal the advantages of using more advanced coefficients like CCC for detecting and studying more intricate molecular mechanisms that replicated in independent datasets. The application of CCC on larger compendia, such as recount3 [@pmid:34844637] with thousands of heterogeneous samples across different conditions, can reveal other potentially meaningful gene interactions. The single parameter of CCC, $k_{\mathrm{max}}$, controls the maximum complexity of patterns found and also impacts the compute time. Our analysis suggested that $k_{\mathrm{max}}=10$ was sufficient to identify both linear and more complex patterns in gene expression. A more comprehensive analysis of optimal values for this pa

In [138]:
# modified side of the match: chunk 8 of the modified section split
par1 = process_paragraph(mod_section_paragraphs[8])
print(par1)

Our study has some limitations. We focused on a subset of genes to ensure manageable computation time. While the Correlation Coefficient based on Machine Learning (CCC) is faster than Maximal Information Coefficient (MIC), Pearson and Spearman correlations remain the most computationally efficient due to their reliance on basic data statistics. Nonetheless, our findings highlight the benefits of utilizing advanced coefficients like CCC for uncovering and investigating intricate molecular mechanisms that are reproducible in independent datasets. Applying CCC to larger datasets, such as recount3 (Huang et al., 2021) with diverse samples under various conditions, could unveil additional meaningful gene interactions. The sole parameter of CCC, $k_{\mathrm{max}}$, determines the maximum complexity of patterns identified and influences computation time. Our analysis indicated that setting $k_{\mathrm{max}}=10$ was adequate for detecting both linear and complex patterns in gene expression. Fu

In [139]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [140]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('discussion',
 'Our analyses have some limitations. We worked on a sample with the top variable genes to keep computation time feasible. Although CCC is much faster than MIC, Pearson and Spearman are still the most computationally efficient since they only rely on simple data statistics. Our results, however, reveal the advantages of using more advanced coefficients like CCC for detecting and studying more intricate molecular mechanisms that replicated in independent datasets. The application of CCC on larger compendia, such as recount3 [@pmid:34844637] with thousands of heterogeneous samples across different conditions, can reveal other potentially meaningful gene interactions. The single parameter of CCC, $k_{\\mathrm{max}}$, controls the maximum complexity of patterns found and also impacts the compute time. Our analysis suggested that $k_{\\mathrm{max}}=10$ was sufficient to identify both linear and more complex patterns in gene expression. A more comprehensive analysis of optimal

####  Paragraph 05

In [141]:
# original side of the match: chunk 6 of the original section split
par0 = process_paragraph(orig_section_paragraphs[6])
print(par0)

While linear and rank-based correlation coefficients are exceptionally fast to calculate, not all relevant patterns in biological datasets are linear. For example, patterns associated with sex as a biological variable are not apparent to the linear-only coefficients that we evaluated but are revealed by not-only-linear methods. Beyond sex differences, being able to use a method that inherently identifies patterns driven by other factors is likely to be desirable. Not-only-linear coefficients can also disentangle intricate yet relevant patterns from expression data alone that were replicated in models integrating different data modalities. CCC, in particular, is highly parallelizable, and we anticipate efficient GPU-based implementations that could make it even faster. The CCC is an efficient, next-generation correlation coefficient that is highly effective in transcriptome analyses and potentially useful in a broad range of other domains.


In [142]:
# modified side of the match: chunk 9 of the modified section split
par1 = process_paragraph(mod_section_paragraphs[9])
print(par1)

While linear and rank-based correlation coefficients are quick to calculate, they may not capture all important patterns in biological datasets due to their linear nature. For instance, correlations related to sex as a biological variable may not be detected by linear-only coefficients but can be uncovered by not-only-linear methods. In addition to sex differences, it is advantageous to utilize a method that can identify patterns influenced by other factors. Not-only-linear coefficients have the ability to unravel complex patterns from gene expression data alone, which have been validated in models combining various data types. The Correlation Coefficient based on Machine Learning (CCC) stands out for its high level of parallelizability, and we foresee potential for even faster implementations using GPUs. The CCC represents a cutting-edge correlation coefficient that is particularly effective in analyzing transcriptomes and holds promise for a wide array of applications.


In [143]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [144]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('discussion',
 'While linear and rank-based correlation coefficients are exceptionally fast to calculate, not all relevant patterns in biological datasets are linear. For example, patterns associated with sex as a biological variable are not apparent to the linear-only coefficients that we evaluated but are revealed by not-only-linear methods. Beyond sex differences, being able to use a method that inherently identifies patterns driven by other factors is likely to be desirable. Not-only-linear coefficients can also disentangle intricate yet relevant patterns from expression data alone that were replicated in models integrating different data modalities. CCC, in particular, is highly parallelizable, and we anticipate efficient GPU-based implementations that could make it even faster. The CCC is an efficient, next-generation correlation coefficient that is highly effective in transcriptome analyses and potentially useful in a broad range of other domains.',
 'While linear and rank-base

## Methods (ccc)

In [145]:
# section label: checked against the PR file name in the next cell and stored as the
# first element of every match tuple appended in this section
section_name = "methods"

In [146]:
# The file is picked by a hard-coded position in pr_files, so verify that the
# selected path really is the CCC methods file before fetching its content.
pr_filename = pr_files[6].filename
# the messages make a failure self-explanatory if the order of PR files ever changes
assert (
    section_name in pr_filename
), f"pr_files[6] is {pr_filename!r}, which does not contain section name {section_name!r}"
assert "ccc" in pr_filename, f"pr_files[6] is {pr_filename!r}, expected the 'ccc' file"
print(pr_filename)

content/08.01.methods.ccc.md


### Original

In [147]:
# fetch the file content at ref `pr_prev` and decode the raw bytes as UTF-8
# NOTE(review): pr_prev is defined earlier in the notebook; presumably the pre-PR commit — confirm
orig_section_content = repo.get_contents(pr_filename, pr_prev).decoded_content.decode(
    "utf-8"
)
# preview only the first 50 characters to confirm the expected file was loaded
print(orig_section_content[:50])

## Methods

The code needed to reproduce all of ou


In [148]:
# split on blank lines (two consecutive newlines); every chunk, including the
# section heading, becomes one list element that later cells pick by index
orig_section_paragraphs = orig_section_content.split("\n\n")
# number of chunks, useful when choosing the indices to match below
display(len(orig_section_paragraphs))

10

### Modified

In [149]:
# fetch the same file at ref `pr_curr` and decode the raw bytes as UTF-8
# NOTE(review): pr_curr is defined earlier in the notebook; presumably the PR head commit — confirm
mod_section_content = repo.get_contents(pr_filename, pr_curr).decoded_content.decode(
    "utf-8"
)
# preview only the first 50 characters to confirm the expected file was loaded
print(mod_section_content[:50])

## Methods

The code needed to reproduce all of ou


In [150]:
# split on blank lines (two consecutive newlines), same rule as for the original text;
# the chunk count can differ from the original, so indices do not line up one-to-one
mod_section_paragraphs = mod_section_content.split("\n\n")
# number of chunks, useful when choosing the indices to match below
display(len(mod_section_paragraphs))

19

### Match

In [151]:
# sanity check: chunk 0 of the original split should be the section heading
orig_section_paragraphs[0]

'## Methods'

In [152]:
# sanity check: chunk 0 of the modified split should be the same heading
mod_section_paragraphs[0]

'## Methods'

####  Paragraph 00

In [153]:
# original side of the match: chunk 3 of the original section split
par0 = process_paragraph(orig_section_paragraphs[3])
print(par0)

The Clustermatch Correlation Coefficient (CCC) computes a similarity value $c \in \left[0,1\right]$ between any pair of numerical or categorical features/variables $\mathbf{x}$ and $\mathbf{y}$ measured on $n$ objects. CCC assumes that if two features $\mathbf{x}$ and $\mathbf{y}$ are similar, then the partitioning by clustering of the $n$ objects using each feature separately should match. For example, given $\mathbf{x}=(11, 27, 32, 40)$ and $\mathbf{y}=10x=(110, 270, 320, 400)$, where $n=4$, partitioning each variable into two clusters ($k=2$) using their medians (29.5 for $\mathbf{x}$ and 295 for $\mathbf{y}$) would result in partition $\Omega^{\mathbf{x}}_{k=2}=(1, 1, 2, 2)$ for $\mathbf{x}$, and partition $\Omega^{\mathbf{y}}_{k=2}=(1, 1, 2, 2)$ for $\mathbf{y}$. Then, the agreement between $\Omega^{\mathbf{x}}_{k=2}$ and $\Omega^{\mathbf{y}}_{k=2}$ can be computed using any measure of similarity between partitions, like the adjusted Rand index (ARI) [@doi:10.1007/BF01908075]. In 

In [154]:
# modified side of the match: chunk 3 of the modified split, then manual line-break fixes
par1 = process_paragraph(mod_section_paragraphs[3])
# start a new line before every "$$" delimiter ...
par1 = par1.replace("$$", "\n$$")
# ... and before every literal "\text"
par1 = par1.replace("\\text", "\n\\text")
print(par1)

The Clustermatch Correlation Coefficient (CCC) calculates a similarity value $c \in \left[0,1\right]$ between any pair of numerical or categorical features/variables $\mathbf{x}$ and $\mathbf{y}$ measured on $n$ objects. CCC operates under the assumption that if two features $\mathbf{x}$ and $\mathbf{y}$ are similar, then clustering the $n$ objects using each feature separately should result in matching partitions. For example, consider $\mathbf{x}=(11, 27, 32, 40)$ and $\mathbf{y}=10\mathbf{x}=(110, 270, 320, 400)$, where $n=4$. Partitioning each variable into two clusters ($k=2$) based on their medians (29.5 for $\mathbf{x}$ and 295 for $\mathbf{y}$) yields partition $\Omega^{\mathbf{x}}_{k=2}=(1, 1, 2, 2)$ for $\mathbf{x}$ and partition $\Omega^{\mathbf{y}}_{k=2}=(1, 1, 2, 2)$ for $\mathbf{y}$. The agreement between $\Omega^{\mathbf{x}}_{k=2}$ and $\Omega^{\mathbf{y}}_{k=2}$ can be assessed using a similarity measure for partitions, such as the adjusted Rand index (ARI) [@doi:10.100

In [155]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [156]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('methods',
 'The Clustermatch Correlation Coefficient (CCC) computes a similarity value $c \\in \\left[0,1\\right]$ between any pair of numerical or categorical features/variables $\\mathbf{x}$ and $\\mathbf{y}$ measured on $n$ objects. CCC assumes that if two features $\\mathbf{x}$ and $\\mathbf{y}$ are similar, then the partitioning by clustering of the $n$ objects using each feature separately should match. For example, given $\\mathbf{x}=(11, 27, 32, 40)$ and $\\mathbf{y}=10x=(110, 270, 320, 400)$, where $n=4$, partitioning each variable into two clusters ($k=2$) using their medians (29.5 for $\\mathbf{x}$ and 295 for $\\mathbf{y}$) would result in partition $\\Omega^{\\mathbf{x}}_{k=2}=(1, 1, 2, 2)$ for $\\mathbf{x}$, and partition $\\Omega^{\\mathbf{y}}_{k=2}=(1, 1, 2, 2)$ for $\\mathbf{y}$. Then, the agreement between $\\Omega^{\\mathbf{x}}_{k=2}$ and $\\Omega^{\\mathbf{y}}_{k=2}$ can be computed using any measure of similarity between partitions, like the adjusted Rand index (

####  Paragraph 01

In [157]:
# original side of the match: chunk 5 of the original section split
par0 = process_paragraph(orig_section_paragraphs[5])
print(par0)

The main function of the algorithm, `ccc`, generates a list of partitionings $\Omega^{\mathbf{x}}$ and $\Omega^{\mathbf{y}}$ (lines 14 and 15), for each feature $\mathbf{x}$ and $\mathbf{y}$. Then, it computes the ARI between each partition in $\Omega^{\mathbf{x}}$ and $\Omega^{\mathbf{y}}$ (line 16), and then it keeps the pair that generates the maximum ARI. Finally, since ARI does not have a lower bound (it could return negative values, which in our case are not meaningful), CCC returns only values between 0 and 1 (line 17).


In [158]:
# modified side of the match: chunk 9 of the modified split, used as returned by
# process_paragraph (no extra line-break post-processing for this paragraph)
par1 = process_paragraph(mod_section_paragraphs[9])
print(par1)

The primary function of the algorithm, `ccc`, is to generate a list of partitionings $\Omega^{\mathbf{x}}$ and $\Omega^{\mathbf{y}}$ (lines 14 and 15) for each feature $\mathbf{x}$ and $\mathbf{y}$. Subsequently, the algorithm computes the Adjusted Rand Index (ARI) between each partition in $\Omega^{\mathbf{x}}$ and $\Omega^{\mathbf{y}}$ (line 16) and retains the pair that yields the maximum ARI. Since ARI does not have a lower bound (it could potentially return negative values, which are not meaningful in this context), CCC only outputs values between 0 and 1 (line 17).


In [159]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [160]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('methods',
 'The main function of the algorithm, `ccc`, generates a list of partitionings $\\Omega^{\\mathbf{x}}$ and $\\Omega^{\\mathbf{y}}$ (lines 14 and 15), for each feature $\\mathbf{x}$ and $\\mathbf{y}$. Then, it computes the ARI between each partition in $\\Omega^{\\mathbf{x}}$ and $\\Omega^{\\mathbf{y}}$ (line 16), and then it keeps the pair that generates the maximum ARI. Finally, since ARI does not have a lower bound (it could return negative values, which in our case are not meaningful), CCC returns only values between 0 and 1 (line 17).',
 'The primary function of the algorithm, `ccc`, is to generate a list of partitionings $\\Omega^{\\mathbf{x}}$ and $\\Omega^{\\mathbf{y}}$ (lines 14 and 15) for each feature $\\mathbf{x}$ and $\\mathbf{y}$. Subsequently, the algorithm computes the Adjusted Rand Index (ARI) between each partition in $\\Omega^{\\mathbf{x}}$ and $\\Omega^{\\mathbf{y}}$ (line 16) and retains the pair that yields the maximum ARI. Since ARI does not have a low

####  Paragraph 02

In [161]:
# original side of the match: chunk 6 of the original section split
par0 = process_paragraph(orig_section_paragraphs[6])
print(par0)

Interestingly, since CCC only needs a pair of partitions to compute a similarity value, any type of feature that can be used to perform clustering/grouping is supported. If the feature is numerical (lines 2 to 5 in the `get_partitions` function), then quantiles are used for clustering (for example, the median generates $k=2$ clusters of objects), from $k=2$ to $k=k_{\mathrm{max}}$. If the feature is categorical (lines 7 to 9), the categories are used to group objects together. Consequently, since features are internally categorized into clusters, numerical and categorical variables can be naturally integrated since clusters do not need an order.


In [162]:
# modified side of the match: chunk 11 of the modified split, used as returned by
# process_paragraph (no extra line-break post-processing for this paragraph)
par1 = process_paragraph(mod_section_paragraphs[11])
print(par1)

Interestingly, the Correlation Coefficient Calculation (CCC) only requires a pair of partitions to calculate a similarity value, making it compatible with any type of feature that can be utilized for clustering or grouping. When the feature is numerical (lines 2 to 5 in the `get_partitions` function), quantiles are employed for clustering. For example, the median generates $k=2$ clusters of objects, with the number of clusters ranging from $k=2$ to $k=k_{\mathrm{max}}$. On the other hand, if the feature is categorical (lines 7 to 9), the categories are utilized to group objects together. As a result, numerical and categorical variables can be seamlessly integrated since features are internally categorized into clusters, and clusters do not require an order.


In [163]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [164]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('methods',
 'Interestingly, since CCC only needs a pair of partitions to compute a similarity value, any type of feature that can be used to perform clustering/grouping is supported. If the feature is numerical (lines 2 to 5 in the `get_partitions` function), then quantiles are used for clustering (for example, the median generates $k=2$ clusters of objects), from $k=2$ to $k=k_{\\mathrm{max}}$. If the feature is categorical (lines 7 to 9), the categories are used to group objects together. Consequently, since features are internally categorized into clusters, numerical and categorical variables can be naturally integrated since clusters do not need an order.',
 'Interestingly, the Correlation Coefficient Calculation (CCC) only requires a pair of partitions to calculate a similarity value, making it compatible with any type of feature that can be utilized for clustering or grouping. When the feature is numerical (lines 2 to 5 in the `get_partitions` function), quantiles are employed f

####  Paragraph 03

In [165]:
# original side of the match: chunk 7 of the original section split
par0 = process_paragraph(orig_section_paragraphs[7])
print(par0)

For all our analyses we used $k_{\mathrm{max}}=10$. This means that for each gene pair, 18 partitions are generated (9 for each gene, from $k=2$ to $k=10$), and 81 ARI comparisons are performed. Smaller values of $k_{\mathrm{max}}$ can reduce computation time, although at the expense of missing more complex/general relationships. Our examples in Figure @fig:datasets_rel suggest that using $k_{\mathrm{max}}=2$ would force CCC to find linear-only patterns, which could be a valid use case scenario where only this kind of relationships are desired. In addition, $k_{\mathrm{max}}=2$ implies that only two partitions are generated, and only one ARI comparison is performed. In this regard, our Python implementation of CCC provides flexibility in specifying $k_{\mathrm{max}}$. For instance, instead of the maximum $k$ (an integer), the parameter could be a custom list of integers: for example, `[2, 5, 10]` will partition the data into two, five and ten clusters.


In [166]:
# modified side of the match: chunk 15 of the modified split, with a new line
# started before every "$$" delimiter
par1 = process_paragraph(mod_section_paragraphs[15]).replace("$$", "\n$$")
print(par1)

For all our analyses, we utilized $k_{\mathrm{max}}=10$. This implies that 18 partitions are generated for each gene pair (9 for each gene, ranging from $k=2$ to $k=10$), leading to 81 Adjusted Rand Index (ARI) comparisons. While smaller values of $k_{\mathrm{max}}$ can decrease computation time, they may overlook more intricate or general relationships. Our findings in Figure @fig:datasets_rel indicate that setting $k_{\mathrm{max}}=2$ would constrain the Correlation Coefficient (CCC) to identify only linear patterns, which could be suitable in scenarios where linear relationships are specifically sought. Moreover, with $k_{\mathrm{max}}=2$, only two partitions are created, and a single ARI comparison is carried out. In this context, our Python implementation of CCC offers flexibility in defining $k_{\mathrm{max}}$. For example, instead of a maximum $k$ value (an integer), the parameter could be a customized list of integers; for instance, `[2, 5, 10]` would partition the data into tw

In [167]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [168]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('methods',
 'For all our analyses we used $k_{\\mathrm{max}}=10$. This means that for each gene pair, 18 partitions are generated (9 for each gene, from $k=2$ to $k=10$), and 81 ARI comparisons are performed. Smaller values of $k_{\\mathrm{max}}$ can reduce computation time, although at the expense of missing more complex/general relationships. Our examples in Figure @fig:datasets_rel suggest that using $k_{\\mathrm{max}}=2$ would force CCC to find linear-only patterns, which could be a valid use case scenario where only this kind of relationships are desired. In addition, $k_{\\mathrm{max}}=2$ implies that only two partitions are generated, and only one ARI comparison is performed. In this regard, our Python implementation of CCC provides flexibility in specifying $k_{\\mathrm{max}}$. For instance, instead of the maximum $k$ (an integer), the parameter could be a custom list of integers: for example, `[2, 5, 10]` will partition the data into two, five and ten clusters.',
 'For all ou

## Methods (data)

In [169]:
# Set the section label explicitly so this section does not depend on the value
# left in the kernel by an earlier section. It is checked against the PR file
# name in the next cell and stored as the first element of every match tuple.
section_name = "methods"

In [170]:
# The file is picked by a hard-coded position in pr_files, so verify that the
# selected path really is the data methods file before fetching its content.
pr_filename = pr_files[7].filename
# the messages make a failure self-explanatory if the order of PR files ever changes
assert (
    section_name in pr_filename
), f"pr_files[7] is {pr_filename!r}, which does not contain section name {section_name!r}"
assert "data" in pr_filename, f"pr_files[7] is {pr_filename!r}, expected the 'data' file"
print(pr_filename)

content/08.05.methods.data.md


### Original

In [171]:
# fetch the file content at ref `pr_prev` and decode the raw bytes as UTF-8
# NOTE(review): pr_prev is defined earlier in the notebook; presumably the pre-PR commit — confirm
orig_section_content = repo.get_contents(pr_filename, pr_prev).decoded_content.decode(
    "utf-8"
)
# preview only the first 50 characters to confirm the expected file was loaded
print(orig_section_content[:50])

### Gene expression data and preprocessing {#sec:d


In [172]:
# split on blank lines (two consecutive newlines); every chunk, including the
# section heading, becomes one list element that later cells pick by index
orig_section_paragraphs = orig_section_content.split("\n\n")
# number of chunks, useful when choosing the indices to match below
display(len(orig_section_paragraphs))

2

### Modified

In [173]:
# fetch the same file at ref `pr_curr` and decode the raw bytes as UTF-8
# NOTE(review): pr_curr is defined earlier in the notebook; presumably the PR head commit — confirm
mod_section_content = repo.get_contents(pr_filename, pr_curr).decoded_content.decode(
    "utf-8"
)
# preview only the first 50 characters to confirm the expected file was loaded
print(mod_section_content[:50])

### Gene expression data and preprocessing {#sec:d


In [174]:
# split on blank lines (two consecutive newlines), same rule as for the original text;
# the chunk count can differ from the original, so indices do not line up one-to-one
mod_section_paragraphs = mod_section_content.split("\n\n")
# number of chunks, useful when choosing the indices to match below
display(len(mod_section_paragraphs))

4

### Match

In [175]:
# sanity check: chunk 0 of the original split should be the section heading
orig_section_paragraphs[0]

'### Gene expression data and preprocessing {#sec:data_gtex}'

In [176]:
# sanity check: chunk 0 of the modified split should be the same heading
mod_section_paragraphs[0]

'### Gene expression data and preprocessing {#sec:data_gtex}'

####  Paragraph 00

In [177]:
# original side of the match: chunk 1 of the original section split
par0 = process_paragraph(orig_section_paragraphs[1])
print(par0)

We downloaded GTEx v8 data for all tissues, normalized using TPM (transcripts per million), and focused our primary analysis on whole blood, which has a good sample size (755). We selected the top 5,000 genes from whole blood with the largest variance after standardizing with $log(x + 1)$ to avoid a bias towards highly-expressed genes. We then computed Pearson, Spearman, MIC and CCC on these 5,000 genes across all 755 samples on the TPM-normalized data, generating a pairwise similarity matrix of size 5,000 x 5,000.


In [178]:
# modified side of the match: chunk 1 of the modified section split
par1 = process_paragraph(mod_section_paragraphs[1])
print(par1)

We downloaded GTEx version 8 data for all tissues and normalized it using TPM (transcripts per million). Our primary analysis focused on whole blood, which had a sample size of 755. From whole blood, we selected the top 5,000 genes with the largest variance after standardizing with $\log(x + 1)$ to prevent bias towards highly-expressed genes. Subsequently, we calculated Pearson, Spearman, MIC, and CCC for these 5,000 genes across all 755 samples on the TPM-normalized data. This computation resulted in a pairwise similarity matrix of size 5,000 x 5,000.


In [179]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [180]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('methods',
 'We downloaded GTEx v8 data for all tissues, normalized using TPM (transcripts per million), and focused our primary analysis on whole blood, which has a good sample size (755). We selected the top 5,000 genes from whole blood with the largest variance after standardizing with $log(x + 1)$ to avoid a bias towards highly-expressed genes. We then computed Pearson, Spearman, MIC and CCC on these 5,000 genes across all 755 samples on the TPM-normalized data, generating a pairwise similarity matrix of size 5,000 x 5,000.',
 'We downloaded GTEx version 8 data for all tissues and normalized it using TPM (transcripts per million). Our primary analysis focused on whole blood, which had a sample size of 755. From whole blood, we selected the top 5,000 genes with the largest variance after standardizing with $\\log(x + 1)$ to prevent bias towards highly-expressed genes. Subsequently, we calculated Pearson, Spearman, MIC, and CCC for these 5,000 genes across all 755 samples on the TPM

## Methods (giant)

In [181]:
# Set the section label explicitly so this section does not depend on the value
# left in the kernel by an earlier section. It is checked against the PR file
# name in the next cell and stored as the first element of every match tuple.
section_name = "methods"

In [182]:
# The file is picked by a hard-coded position in pr_files, so verify that the
# selected path really is the GIANT methods file before fetching its content.
pr_filename = pr_files[8].filename
# the messages make a failure self-explanatory if the order of PR files ever changes
assert (
    section_name in pr_filename
), f"pr_files[8] is {pr_filename!r}, which does not contain section name {section_name!r}"
assert "giant" in pr_filename, f"pr_files[8] is {pr_filename!r}, expected the 'giant' file"
print(pr_filename)

content/08.15.methods.giant.md


### Original

In [183]:
# fetch the file content at ref `pr_prev` and decode the raw bytes as UTF-8
# NOTE(review): pr_prev is defined earlier in the notebook; presumably the pre-PR commit — confirm
orig_section_content = repo.get_contents(pr_filename, pr_prev).decoded_content.decode(
    "utf-8"
)
# preview only the first 50 characters to confirm the expected file was loaded
print(orig_section_content[:50])

### Tissue-specific network analyses using GIANT {


In [184]:
# split on blank lines (two consecutive newlines); every chunk, including the
# section heading, becomes one list element that later cells pick by index
orig_section_paragraphs = orig_section_content.split("\n\n")
# number of chunks, useful when choosing the indices to match below
display(len(orig_section_paragraphs))

3

### Modified

In [185]:
# fetch the same file at ref `pr_curr` and decode the raw bytes as UTF-8
# NOTE(review): pr_curr is defined earlier in the notebook; presumably the PR head commit — confirm
mod_section_content = repo.get_contents(pr_filename, pr_curr).decoded_content.decode(
    "utf-8"
)
# preview only the first 50 characters to confirm the expected file was loaded
print(mod_section_content[:50])

### Tissue-specific network analyses using GIANT {


In [186]:
# split on blank lines (two consecutive newlines), same rule as for the original text;
# the chunk count can differ from the original, so indices do not line up one-to-one
mod_section_paragraphs = mod_section_content.split("\n\n")
# number of chunks, useful when choosing the indices to match below
display(len(mod_section_paragraphs))

7

### Match

In [187]:
# sanity check: chunk 0 of the original split should be the section heading
orig_section_paragraphs[0]

'### Tissue-specific network analyses using GIANT {#sec:giant}'

In [188]:
# sanity check: chunk 0 of the modified split should be the same heading
mod_section_paragraphs[0]

'### Tissue-specific network analyses using GIANT {#sec:giant}'

####  Paragraph 00

In [189]:
# original side of the match: chunk 1 of the original section split
par0 = process_paragraph(orig_section_paragraphs[1])
print(par0)

We accessed tissue-specific gene networks of GIANT using both the web interface and web services provided by HumanBase [@url:https://hb.flatironinstitute.org]. The GIANT version used in this study included 987 genome-scale datasets with approximately 38,000 conditions from around 14,000 publications. Details on how these networks were built are described in [@doi:10.1038/ng.3259]. Briefly, tissue-specific gene networks were built using gene expression data (without GTEx samples [@url:https://hb.flatironinstitute.org/data]) from the NCBI's Gene Expression Omnibus (GEO) [@doi:10.1093/nar/gks1193], protein-protein interaction (BioGRID [@pmc:PMC3531226], IntAct [@doi:10.1093/nar/gkr1088], MINT [@doi:10.1093/nar/gkr930] and MIPS [@pmc:PMC148093]), transcription factor regulation using binding motifs from JASPAR [@doi:10.1093/nar/gkp950], and chemical and genetic perturbations from MSigDB [@doi:10.1073/pnas.0506580102]. Gene expression data were log-transformed, and the Pearson correlation w

In [190]:
# modified side of the match: the slice 1:4 passes chunks 1, 2 and 3 together as a list
# NOTE(review): process_paragraph presumably merges a list of chunks into one string — confirm in proj.utils
par1 = process_paragraph(mod_section_paragraphs[1:4])
print(par1)

We accessed tissue-specific gene networks of GIANT using both the web interface and web services provided by HumanBase (https://hb.flatironinstitute.org). The GIANT version used in this study included 987 genome-scale datasets with approximately 38,000 conditions from around 14,000 publications. Details on how these networks were built are described in (DOI: 10.1038/ng.3259). Briefly, tissue-specific gene networks were built using gene expression data (without GTEx samples from https://hb.flatironinstitute.org/data) from the NCBI's Gene Expression Omnibus (GEO) (DOI: 10.1093/nar/gks1193), protein-protein interaction databases such as BioGRID (PMC: PMC3531226), IntAct (DOI: 10.1093/nar/gkr1088), MINT (DOI: 10.1093/nar/gkr930), and MIPS (PMC: PMC148093), transcription factor regulation using binding motifs from JASPAR (DOI: 10.1093/nar/gkp950), and chemical and genetic perturbations from MSigDB (DOI: 10.1073/pnas.0506580102). Gene expression data were log-transformed, and the Pearson cor

In [191]:
# store this pairing as a (section, original, modified) triple
paragraph_matches.append((section_name, par0, par1))

In [192]:
# show the most recent entry of paragraph_matches to eyeball the pairing
display(paragraph_matches[-1])

('methods',
 "We accessed tissue-specific gene networks of GIANT using both the web interface and web services provided by HumanBase [@url:https://hb.flatironinstitute.org]. The GIANT version used in this study included 987 genome-scale datasets with approximately 38,000 conditions from around 14,000 publications. Details on how these networks were built are described in [@doi:10.1038/ng.3259]. Briefly, tissue-specific gene networks were built using gene expression data (without GTEx samples [@url:https://hb.flatironinstitute.org/data]) from the NCBI's Gene Expression Omnibus (GEO) [@doi:10.1093/nar/gks1193], protein-protein interaction (BioGRID [@pmc:PMC3531226], IntAct [@doi:10.1093/nar/gkr1088], MINT [@doi:10.1093/nar/gkr930] and MIPS [@pmc:PMC148093]), transcription factor regulation using binding motifs from JASPAR [@doi:10.1093/nar/gkp950], and chemical and genetic perturbations from MSigDB [@doi:10.1073/pnas.0506580102]. Gene expression data were log-transformed, and the Pearson

####  Paragraph 01

In [193]:
# original side of the match: chunk 2 of the original section split
par0 = process_paragraph(orig_section_paragraphs[2])
print(par0)

For each pair of genes prioritized in our study using GTEx, we used GIANT through HumanBase to obtain 1) a predicted gene network for blood (manually selected to match whole blood in GTEx) and 2) a gene network with an automatically predicted tissue using the method described in [@doi:10.1101/gr.155697.113] and provided by HumanBase web interfaces/services. Briefly, the tissue prediction approach trains a machine learning model using comprehensive transcriptional data with human-curated markers of different cell lineages (e.g., macrophages) as gold standards. Then, these models are used to predict other cell lineage-specific genes. In addition to reporting this predicted tissue or cell lineage, we computed the average probability of interaction between all genes in the network retrieved from GIANT. Following the default procedure used in GIANT, we included the top 15 genes with the highest probability of interaction with the queried gene pair for each network.


In [194]:
# The modified text for this paragraph is taken from two separate entries of
# mod_section_paragraphs (4 and 6), so both are handed over together as a list.
par1 = process_paragraph([mod_section_paragraphs[idx] for idx in (4, 6)])
print(par1)

For each pair of genes prioritized in our study using GTEx, we utilized GIANT through HumanBase to acquire 1) a predicted gene network for blood (manually selected to match whole blood in GTEx) and 2) a gene network with an automatically predicted tissue using the method described in previous research [@doi:10.1101/gr.155697.113] and provided by HumanBase web interfaces/services. Briefly, the tissue prediction approach involves training a machine learning model using comprehensive transcriptional data with human-curated markers of different cell lineages (e.g., macrophages) as gold standards. These models are then utilized to predict other cell lineage-specific genes. Additionally, besides reporting the predicted tissue or cell lineage, we calculated the average probability of interaction between all genes in the network retrieved from GIANT. Following the default procedure used in GIANT, we included the top 15 genes with the highest probability of interaction with the queried gene pai

In [195]:
paragraph_matches.append(
    (
        section_name,
        par0,
        par1,
    )
)

In [196]:
display(paragraph_matches[-1])

('methods',
 'For each pair of genes prioritized in our study using GTEx, we used GIANT through HumanBase to obtain 1) a predicted gene network for blood (manually selected to match whole blood in GTEx) and 2) a gene network with an automatically predicted tissue using the method described in [@doi:10.1101/gr.155697.113] and provided by HumanBase web interfaces/services. Briefly, the tissue prediction approach trains a machine learning model using comprehensive transcriptional data with human-curated markers of different cell lineages (e.g., macrophages) as gold standards. Then, these models are used to predict other cell lineage-specific genes. In addition to reporting this predicted tissue or cell lineage, we computed the average probability of interaction between all genes in the network retrieved from GIANT. Following the default procedure used in GIANT, we included the top 15 genes with the highest probability of interaction with the queried gene pair for each network.',
 'For eac

## Methods (mic)

In [197]:
# Set explicitly so this section does not rely on the value left over from an
# earlier cell; the filename check in the next cell reads section_name.
section_name = "methods"

In [198]:
# Select the MIC methods file by its position in pr_files.
# NOTE(review): the index 9 is hard-coded; the asserts below are what catch a
# change in the PR file order, so keep them right next to the lookup.
pr_filename = pr_files[9].filename
# section_name is expected to be "methods" at this point.
assert section_name in pr_filename
# Presumably distinguishes this file from other methods files in the PR.
assert "mic" in pr_filename
print(pr_filename)

content/08.20.methods.mic.md


### Original

In [199]:
# get content
orig_section_content = repo.get_contents(pr_filename, pr_prev).decoded_content.decode(
    "utf-8"
)
print(orig_section_content[:50])

### Maximal Information Coefficient (MIC) {#sec:me


In [200]:
# split by paragraph
orig_section_paragraphs = orig_section_content.split("\n\n")
display(len(orig_section_paragraphs))

2

### Modified

In [201]:
# get content
mod_section_content = repo.get_contents(pr_filename, pr_curr).decoded_content.decode(
    "utf-8"
)
print(mod_section_content[:50])

### Maximal Information Coefficient (MIC) {#sec:me


In [202]:
# split by paragraph
mod_section_paragraphs = mod_section_content.split("\n\n")
display(len(mod_section_paragraphs))

4

### Match

In [203]:
orig_section_paragraphs[0]

'### Maximal Information Coefficient (MIC) {#sec:methods:mic}'

In [204]:
mod_section_paragraphs[0]

'### Maximal Information Coefficient (MIC) {#sec:methods:mic}'

####  Paragraph 00

In [205]:
par0 = process_paragraph(orig_section_paragraphs[1])
print(par0)

We used the Python package `minepy` [@doi:10.1093/bioinformatics/bts707; @url:https://github.com/minepy/minepy] (version 1.2.5) to estimate the MIC coefficient. In GTEx v8 (whole blood), we used MIC<sub>e</sub> (an improved implementation of the original MIC introduced in [@Reshef2016]) with the default parameters `alpha=0.6`, `c=15` and `estimator='mic_e'`. We used the `pairwise_distances` function from `scikit-learn` [@Sklearn2011] to parallelize the computation of MIC on GTEx. For our computational complexity analyses (see [Supplementary Material](#sec:time_test)), we ran the original MIC (using parameter `estimator='mic_approx'`) and MIC<sub>e</sub> (`estimator='mic_e'`).


In [206]:
par1 = process_paragraph(mod_section_paragraphs[1])
print(par1)

We used the Python package `minepy` [@doi:10.1093/bioinformatics/bts707; @url:https://github.com/minepy/minepy] (version 1.2.5) to estimate the Maximal Information Coefficient (MIC) coefficient. In GTEx v8 (whole blood), we used MIC<sub>e</sub>, an improved implementation of the original MIC introduced in Reshef et al. (2016) with the default parameters `alpha=0.6`, `c=15`, and `estimator='mic_e'`. We utilized the `pairwise_distances` function from `scikit-learn` [@Sklearn2011] to parallelize the computation of MIC on GTEx. For our computational complexity analyses (see Supplementary Material), we ran the original MIC (using parameter `estimator='mic_approx'`) and MIC<sub>e</sub> (with `estimator='mic_e'`).


In [207]:
paragraph_matches.append(
    (
        section_name,
        par0,
        par1,
    )
)

In [208]:
display(paragraph_matches[-1])

('methods',
 "We used the Python package `minepy` [@doi:10.1093/bioinformatics/bts707; @url:https://github.com/minepy/minepy] (version 1.2.5) to estimate the MIC coefficient. In GTEx v8 (whole blood), we used MIC<sub>e</sub> (an improved implementation of the original MIC introduced in [@Reshef2016]) with the default parameters `alpha=0.6`, `c=15` and `estimator='mic_e'`. We used the `pairwise_distances` function from `scikit-learn` [@Sklearn2011] to parallelize the computation of MIC on GTEx. For our computational complexity analyses (see [Supplementary Material](#sec:time_test)), we ran the original MIC (using parameter `estimator='mic_approx'`) and MIC<sub>e</sub> (`estimator='mic_e'`).",
 "We used the Python package `minepy` [@doi:10.1093/bioinformatics/bts707; @url:https://github.com/minepy/minepy] (version 1.2.5) to estimate the Maximal Information Coefficient (MIC) coefficient. In GTEx v8 (whole blood), we used MIC<sub>e</sub>, an improved implementation of the original MIC intr

## Supplementary material

In [209]:
section_name = "supplementary material"

In [210]:
# Select the supplementary material file by its position in pr_files.
# NOTE(review): the index 10 is hard-coded; the assert guards against a change
# in the PR file order.
pr_filename = pr_files[10].filename
# section_name contains a space while the filename uses an underscore, so
# normalize before checking; this ties the check to section_name instead of a
# separate literal, as the methods section does.
assert section_name.replace(" ", "_") in pr_filename
print(pr_filename)

content/20.00.supplementary_material.md


### Original

In [211]:
# get content
orig_section_content = repo.get_contents(pr_filename, pr_prev).decoded_content.decode(
    "utf-8"
)
print(orig_section_content[:50])

## Supplementary material {.page_break_before}

##


In [212]:
# split by paragraph
orig_section_paragraphs = orig_section_content.split("\n\n")
display(len(orig_section_paragraphs))

14

### Modified

In [213]:
# get content
mod_section_content = repo.get_contents(pr_filename, pr_curr).decoded_content.decode(
    "utf-8"
)
print(mod_section_content[:50])

## Supplementary material {.page_break_before}

##


In [214]:
# split by paragraph
mod_section_paragraphs = mod_section_content.split("\n\n")
display(len(mod_section_paragraphs))

14

### Match

In [215]:
orig_section_paragraphs[0]

'## Supplementary material {.page_break_before}'

In [216]:
mod_section_paragraphs[0]

'## Supplementary material {.page_break_before}'

####  Paragraph 00

In [217]:
par0 = process_paragraph(orig_section_paragraphs[2])
print(par0)

We compared all the coefficients in this study with MIC [@pmid:22174245], a popular nonlinear method that can find complex relationships in data, although very computationally intensive [@doi:10.1098/rsos.201424]. We ran MIC<sub>e</sub> (see Methods) on all possible pairwise comparisons of our 5,000 highly variable genes from whole blood in GTEx v8. This took 4 days and 19 hours to finish (compared with 9 hours for CCC). Then we performed the analysis on the distribution of coefficients (the same as in the main text), shown in Figure @fig:dist_coefs_mic. We verified that CCC and MIC behave similarly in this dataset, with essentially the same distribution but only shifted. Figure @fig:dist_coefs_mic c shows that these two coefficients relate almost linearly, and both compare very similarly with Pearson and Spearman.


In [218]:
par1 = process_paragraph(mod_section_paragraphs[2])
print(par1)

We compared the coefficients from our study with the MIC method, which is known for identifying complex relationships in data but is computationally intensive. We applied MICe to all possible pairwise comparisons of 5,000 highly variable genes from whole blood in GTEx v8. This process took 4 days and 19 hours to complete, significantly longer than the 9 hours required for CCC. The analysis of coefficient distribution, as described in the main text, is shown in Figure 1. We found that CCC and MIC exhibited similar behavior in this dataset, with comparable distributions that were only slightly shifted. Figure 1c illustrates a nearly linear relationship between these two coefficients, which also showed similarities with Pearson and Spearman correlations.


In [219]:
paragraph_matches.append(
    (
        section_name,
        par0,
        par1,
    )
)

In [220]:
display(paragraph_matches[-1])

('supplementary material',
 'We compared all the coefficients in this study with MIC [@pmid:22174245], a popular nonlinear method that can find complex relationships in data, although very computationally intensive [@doi:10.1098/rsos.201424]. We ran MIC<sub>e</sub> (see Methods) on all possible pairwise comparisons of our 5,000 highly variable genes from whole blood in GTEx v8. This took 4 days and 19 hours to finish (compared with 9 hours for CCC). Then we performed the analysis on the distribution of coefficients (the same as in the main text), shown in Figure @fig:dist_coefs_mic. We verified that CCC and MIC behave similarly in this dataset, with essentially the same distribution but only shifted. Figure @fig:dist_coefs_mic c shows that these two coefficients relate almost linearly, and both compare very similarly with Pearson and Spearman.',
 'We compared the coefficients from our study with the MIC method, which is known for identifying complex relationships in data but is computa

####  Paragraph 01

In [221]:
par0 = process_paragraph(orig_section_paragraphs[6])
print(par0)

We also compared CCC with the other coefficients in terms of computational complexity. Although CCC and MIC might identify similar gene pairs in gene expression data (see [here](#sec:mic)), the use of MIC in large datasets remains limited due to its very long computation time, despite some methodological/implementation improvements [@doi:10.1093/bioinformatics/bts707; @doi:10.1371/journal.pone.0157567; @doi:10.4137/EBO.S13121; @doi:10.1038/srep06662; @doi:10.1098/rsos.201424]. The original MIC implementation uses ApproxMaxMI, a computationally demanding heuristic estimator [@doi:10.1126/science.1205438]. Recently, a more efficient implementation called MIC<sub>e</sub> was proposed [@Reshef2016]. These two MIC estimators are provided by the `minepy` package [@doi:10.1093/bioinformatics/bts707], a C implementation available for Python. We compared all these coefficients in terms of computation time on randomly generated variables of different sizes, which simulates a scenario of gene exp

In [222]:
par1 = process_paragraph(mod_section_paragraphs[6])
print(par1)

We compared CCC with other correlation coefficients in terms of computational complexity. While CCC and MIC may identify similar gene pairs in gene expression data, MIC is limited in large datasets due to its long computation time. Some improvements have been made to the MIC method, but its original implementation, which uses ApproxMaxMI, is computationally demanding. A more efficient implementation called MICe has been proposed. Both MIC estimators are included in the minepy package, a C implementation available for Python. We compared these coefficients in terms of computation time on randomly generated variables of different sizes to simulate scenarios of gene expression data with varying numbers of conditions. Unlike other coefficients, CCC allows for easy parallelization of the computation of a single gene pair. We tested the cases using 1 and 3 CPU cores. The results are shown in Figure 1.


In [223]:
paragraph_matches.append(
    (
        section_name,
        par0,
        par1,
    )
)

In [224]:
display(paragraph_matches[-1])

('supplementary material',
 'We also compared CCC with the other coefficients in terms of computational complexity. Although CCC and MIC might identify similar gene pairs in gene expression data (see [here](#sec:mic)), the use of MIC in large datasets remains limited due to its very long computation time, despite some methodological/implementation improvements [@doi:10.1093/bioinformatics/bts707; @doi:10.1371/journal.pone.0157567; @doi:10.4137/EBO.S13121; @doi:10.1038/srep06662; @doi:10.1098/rsos.201424]. The original MIC implementation uses ApproxMaxMI, a computationally demanding heuristic estimator [@doi:10.1126/science.1205438]. Recently, a more efficient implementation called MIC<sub>e</sub> was proposed [@Reshef2016]. These two MIC estimators are provided by the `minepy` package [@doi:10.1093/bioinformatics/bts707], a C implementation available for Python. We compared all these coefficients in terms of computation time on randomly generated variables of different sizes, which sim

####  Paragraph 02

In [225]:
par0 = process_paragraph(orig_section_paragraphs[8])
print(par0)

As we already expected, Pearson and Spearman were the fastest, given that they only need to compute basic summary statistics from the data. For example, Pearson is three orders of magnitude faster than CCC. Among the nonlinear coefficients, CCC was faster than the two MIC variations (up to two orders of magnitude), with the only exception in very small data sizes. The difference is important because both MIC variants were implemented in C [@doi:10.1093/bioinformatics/bts707], a high-performance programming language, whereas CCC was implemented in Python (optimized with `numba`). For a data size of a million, the multi-core CCC was twice as fast as the single-core CCC. This suggests that new implementations using more advanced processing units (such as GPUs) are feasible and could make CCC reach speeds closer to Pearson.


In [226]:
par1 = process_paragraph(mod_section_paragraphs[8])
print(par1)

As we anticipated, Pearson and Spearman were the quickest as they only require basic summary statistics from the data. For instance, Pearson is significantly faster than CCC. Among the nonlinear coefficients, CCC was quicker than the two MIC variations (up to two orders of magnitude), except for very small data sizes. This difference is noteworthy because both MIC variants were coded in C, a high-performance programming language, while CCC was coded in Python (optimized with `numba`). For a dataset of a million, the multi-core CCC was twice as fast as the single-core CCC. This indicates that new implementations utilizing more advanced processing units (such as GPUs) are possible and could potentially increase CCC's speed closer to that of Pearson.


In [227]:
paragraph_matches.append(
    (
        section_name,
        par0,
        par1,
    )
)

In [228]:
display(paragraph_matches[-1])

('supplementary material',
 'As we already expected, Pearson and Spearman were the fastest, given that they only need to compute basic summary statistics from the data. For example, Pearson is three orders of magnitude faster than CCC. Among the nonlinear coefficients, CCC was faster than the two MIC variations (up to two orders of magnitude), with the only exception in very small data sizes. The difference is important because both MIC variants were implemented in C [@doi:10.1093/bioinformatics/bts707], a high-performance programming language, whereas CCC was implemented in Python (optimized with `numba`). For a data size of a million, the multi-core CCC was twice as fast as the single-core CCC. This suggests that new implementations using more advanced processing units (such as GPUs) are feasible and could make CCC reach speeds closer to Pearson.',
 "As we anticipated, Pearson and Spearman were the quickest as they only require basic summary statistics from the data. For instance, Pe

# Close connections

In [229]:
g.close()

# Save

In [230]:
len(paragraph_matches)

31

In [231]:
paragraph_matches[:2]

[('abstract',
  'Correlation coefficients are widely used to identify patterns in data that may be of particular interest. In transcriptomics, genes with correlated expression often share functions or are part of disease-relevant biological processes. Here we introduce the Clustermatch Correlation Coefficient (CCC), an efficient, easy-to-use and not-only-linear coefficient based on machine learning models. CCC reveals biologically meaningful linear and nonlinear patterns missed by standard, linear-only correlation coefficients. CCC captures general patterns in data by comparing clustering solutions while being much faster than state-of-the-art coefficients such as the Maximal Information Coefficient. When applied to human gene expression data, CCC identifies robust linear relationships while detecting nonlinear patterns associated, for example, with sex differences that are not captured by linear-only coefficients. Gene pairs highly ranked by CCC were enriched for interactions in integ

In [232]:
# Build the table with its column labels set directly: one step instead of
# relabelling the default 0/1/2 columns afterwards, and the three columns are
# present even when paragraph_matches is empty.
# Each entry of paragraph_matches is a (section, original, modified) tuple.
df = pd.DataFrame(paragraph_matches, columns=["section", "original", "modified"])

In [233]:
df.shape

(31, 3)

In [234]:
df.head()

Unnamed: 0,section,original,modified
0,abstract,Correlation coefficients are widely used to id...,"In transcriptomics, identifying patterns in ge..."
1,introduction,New technologies have vastly improved data col...,Recent advancements in data collection have le...
2,introduction,"In transcriptomics, many analyses start with e...","In the field of transcriptomics, many analyses..."
3,introduction,The Pearson and Spearman correlation coefficie...,The Pearson and Spearman correlation coefficie...
4,results,The CCC provides a similarity measure between ...,The CCC calculates the similarity between pair...


In [235]:
df.to_pickle(OUTPUT_FILE_PATH)

# Reverse original/modified columns

In [236]:
# Exchange the labels of the two text columns in a single pass; rename() looks
# up each existing label in the mapping independently, so no placeholder label
# is needed. Only the labels are swapped, the column contents stay in place.
df_reversed = df.rename(columns={"original": "modified", "modified": "original"})

In [237]:
df_reversed.shape

(31, 3)

In [238]:
df_reversed.head()

Unnamed: 0,section,modified,original
0,abstract,Correlation coefficients are widely used to id...,"In transcriptomics, identifying patterns in ge..."
1,introduction,New technologies have vastly improved data col...,Recent advancements in data collection have le...
2,introduction,"In transcriptomics, many analyses start with e...","In the field of transcriptomics, many analyses..."
3,introduction,The Pearson and Spearman correlation coefficie...,The Pearson and Spearman correlation coefficie...
4,results,The CCC provides a similarity measure between ...,The CCC calculates the similarity between pair...


## Save reversed version

In [239]:
df_reversed.to_pickle(REVERSED_OUTPUT_FILE_PATH)