# Perform a t-test between well correlations of the same or different genotypes

## Import libraries

In [1]:
from scipy.stats import ttest_ind
import pathlib
import pandas as pd

## Load in correlation data

In [2]:
# Directory containing the per-plate correlation results
corr_results_dir = pathlib.Path("../../0.data_analysis/plate_correlation_analyses/construct_correlation_data")

# Read the correlation table and derive two boolean flag columns in one chained expression:
#   same_genotype -> True when both sides of the correlation row have the same genotype
#   same_plate    -> True when both sides of the correlation row come from the same plate
corr_results_file = corr_results_dir / "well_agg_plate_genotype_correlations.parquet"
corr_results_df = pd.read_parquet(corr_results_file).assign(
    same_genotype=lambda df: df['Metadata_genotype__group0'] == df['Metadata_genotype__group1'],
    same_plate=lambda df: df['Metadata_Plate__group0'] == df['Metadata_Plate__group1'],
)

# Report the (rows, columns) shape, then preview the first rows
print(corr_results_df.shape)
corr_results_df.head()


(10296, 7)


Unnamed: 0,correlation,Metadata_Plate__group0,Metadata_Plate__group1,Metadata_genotype__group0,Metadata_genotype__group1,same_genotype,same_plate
0,0.041393,Plate_3,Plate_3_prime,WT,WT,True,False
1,0.161703,Plate_3,Plate_3_prime,WT,WT,True,False
2,0.30589,Plate_3,Plate_3_prime,WT,WT,True,False
3,-0.204942,Plate_3,Plate_3_prime,WT,WT,True,False
4,0.307921,Plate_3,Plate_3_prime,WT,WT,True,False


## Perform two sample t-test

In [3]:
# Split the correlation rows into same-genotype and different-genotype groups.
# `same_genotype` is already a boolean column, so it is used directly as a mask
# (and negated for the complement) instead of comparing against True/False.
is_same_genotype = corr_results_df['same_genotype']
same_genotype_df = corr_results_df[is_same_genotype]
different_genotype_df = corr_results_df[~is_same_genotype]

# Two-sample t-test on the `correlation` values of the two groups.
# NOTE: scipy's ttest_ind defaults to equal_var=True (Student's t-test, which
# assumes equal population variances); pass equal_var=False for Welch's t-test
# if the two groups' variances turn out to differ.
t_stat, p_value = ttest_ind(same_genotype_df['correlation'],
                            different_genotype_df['correlation'])

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

T-statistic: 14.127139465381415
P-value: 6.780397765300712e-45


A large t-statistic and a very low p-value indicate that we can reject the null hypothesis and conclude that, overall, correlations between well-level single-cell populations of the same genotype have a significantly different mean than correlations between wells of different genotypes.

## Show the means of the different distributions

In [4]:
# Average correlation within each comparison group (same vs. different genotype)
same_genotype_mean, different_genotype_mean = (
    group_df['correlation'].mean()
    for group_df in (same_genotype_df, different_genotype_df)
)

print(f"Mean (same_genotype): {same_genotype_mean}")
print(f"Mean (different_genotype): {different_genotype_mean}")


Mean (same_genotype): 0.2352622084592727
Mean (different_genotype): 0.15985902728796325
