# Pearson correlation analysis
This IPython notebook (`.ipynb` file) can be used to conduct Pearson correlation  
analysis on a series of data files.

The code expects data files in which the variable of interest is in the  
second column, e.g., files in xy or xye format.

The correlation analysis is done using the `numpy.corrcoef` function.  
For more info on Pearson correlation please refer to:
- https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html

Imports.

In [None]:
import sys
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from diffpy.utils.parsers.loaddata import loadData

Collecting data files.

In [None]:
# Locate the input folder in the current working directory. On the first run
# the folder is created and execution stops so the user can add data files.
data_path = Path.cwd() / "data"
if not data_path.exists():
    data_path.mkdir()
    sys.exit(f"\n{80*'-'}\nA folder called '{data_path.name}' has been created."
             f"\nPlease put your data files there and rerun the cell."
             f"\n{80*'-'}")
# Sort the matches: glob yields entries in an arbitrary, platform-dependent
# order, and the position in this list becomes the index in the correlation
# matrix. Directories and hidden files that happen to match "*.*" are skipped
# because they are not data files.
data_files = sorted(
    f for f in data_path.glob("*.*")
    if f.is_file() and not f.name.startswith(".")
)
if not data_files:
    sys.exit(f"\n{80*'-'}\nNo files were found in the '{data_path.name}' "
             f"folder.\nPlease place your data files there and rerun the cell."
             f"\n{80*'-'}")

Function for plotting Pearson correlation matrix.

In [None]:
def plot(corr, name, plot_paths):
    """Show a matrix as a heat map, save it to every output folder, display it.

    Parameters
    ----------
    corr : array-like
        Two-dimensional data handed to ``imshow``.
    name : str
        Stem of the saved file names.
    plot_paths : iterable of pathlib.Path
        Output folders. The name of each folder doubles as the file extension
        of the figure saved inside it.

    Returns
    -------
    None
    """
    label_size, tick_size = 20, 14
    # Tick settings common to both axes.
    tick_style = dict(which="both", direction="inout", labelsize=tick_size)

    fig, ax = plt.subplots(figsize=(8, 6))
    image = ax.imshow(corr, aspect="equal", cmap="YlOrRd")

    for set_axis_label in (ax.set_xlabel, ax.set_ylabel):
        set_axis_label("index", fontsize=label_size)
    # The x-axis label and tick labels go on top, as is usual for a matrix.
    ax.xaxis.set_label_position("top")
    ax.tick_params(axis="x", bottom=True, top=True,
                   labelbottom=False, labeltop=True, **tick_style)
    ax.tick_params(axis="y", left=True, right=True,
                   labelleft=True, labelright=False, **tick_style)
    ax.minorticks_on()

    colorbar = plt.colorbar(image)
    colorbar.ax.tick_params(direction="inout", labelsize=tick_size)
    colorbar.set_label(r"$R_{\mathrm{Pearson}}$", fontsize=label_size)

    for folder in plot_paths:
        print(f"\t\t{folder.name}")
        target = folder / f"{name}.{folder.name}"
        plt.savefig(target, bbox_inches="tight", dpi=300)
    plt.show()

Plot folders.

In [None]:
# Output formats to produce. Each entry is used both as the name of the output
# folder and as the file extension of the saved figure. Uncomment entries to
# save additional formats.
plot_folders = ["png",
                # "pdf",
                # "svg",
                ]
plot_paths = [Path.cwd() / folder for folder in plot_folders]
for p in plot_paths:
    # exist_ok=True replaces a separate exists() check, which is race-prone;
    # it still raises if the path exists but is not a directory.
    p.mkdir(exist_ok=True)

Loading data, conducting Pearson correlation analysis, and plotting.

In [None]:
# Load the second column of every data file, compute the Pearson correlation
# matrix of the loaded data sets, and plot it.
print(f"{80*'-'}\nLoading data...")
array = []
for i, f in enumerate(data_files):
    print(f"\t{i}\t{f.name}")
    data = loadData(f)
    # Only the second column (the variable of interest) enters the analysis.
    y = data[:, 1]
    array.append(y)
# With a single data set np.corrcoef yields a scalar rather than a matrix,
# which cannot be drawn as an image.
if len(array) < 2:
    sys.exit(f"\n{80*'-'}\nAt least two data files are needed to conduct "
             f"correlation analysis.\n{80*'-'}")
# All data sets must have the same number of points to be stacked as the rows
# of one two-dimensional array.
lengths = {len(column) for column in array}
if len(lengths) > 1:
    sys.exit(f"\n{80*'-'}\nThe data files do not all contain the same number "
             f"of data points: {sorted(lengths)}.\nPlease make sure that all "
             f"data files have the same length and rerun the cell."
             f"\n{80*'-'}")
print("Done loading data. Conducting Pearson correlation analysis...")
# Each row of the stacked array is treated as one variable by np.corrcoef.
corr = np.corrcoef(np.array(array))
print("Plotting correlation matrix...")
plot(corr, "pearson_correlation_matrix", plot_paths)
print(f"Done. Please see the {plot_folders} folder(s).\n{80*'-'}")