In [54]:
import pandas as pd
import numpy as np
from itertools import product
from scipy import stats
import scipy

In [50]:
def getCounts(filename: str) -> pd.DataFrame:
    """
    Returns the normalized (Intensity, sector) frequencies of the y and b
    ions as a combined dataframe.

    The file is read as tab-separated text and must provide the columns
    "Intensity", "sector" and "Ion Type". For each ion type ("y" first, then
    "b") the rows are counted per (Intensity, sector) combination and divided
    by the total for that ion type, so each ion type's frequencies sum to 1.

    The result always contains, per ion type, one row for every combination
    of the intensities none/low/medium/high and the sectors 1/2/3, in that
    fixed order (24 rows in total). This keeps the rows of two results
    comparable position by position.

    - Combinations that never occur in the file get a frequency of 0.
    - Rows whose Intensity/sector are outside the values above are not counted.
    - If the file has no rows for an ion type, its frequencies are all 0.

    Result columns: "Intensity", "sector", "counts", "ionType".
    """

    keyColumns = ["Intensity", "sector"]
    intensities = ["none", "low", "medium", "high"]
    sectors = [1, 2, 3]

    df = pd.read_csv(filename, sep="\t")
    df = df[keyColumns + ["Ion Type"]]

    # Fixed grid of bins shared by every file and both ion types.
    template = pd.MultiIndex.from_product([intensities, sectors], names=keyColumns)

    frames = []
    for ionType in ("y", "b"):
        ionRows = df.loc[df["Ion Type"] == ionType, keyColumns]

        # groupby().size() is used instead of value_counts() + rename({0: ...}):
        # pandas >= 2.0 names the value_counts() result "count", so renaming
        # column 0 silently does nothing there.
        counts = (
            ionRows.groupby(keyColumns)
            .size()
            .reindex(template, fill_value=0)  # absent combinations -> 0, not NaN
            .astype(float)
        )

        total = counts.sum()
        if total > 0:  # avoid 0/0 -> NaN when the file has no ions of this type
            counts = counts / total

        frame = counts.reset_index(name="counts")
        frame["ionType"] = ionType
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)
    
def calculateChiSquare(expected: pd.DataFrame, observed: pd.DataFrame) -> float:
    """
    Calculates the chi squared statistic for our dataframes of
    counts that have been normalized.

    The 'counts' columns of both dataframes are compared row by row, by
    position (the index labels are ignored), and
    sum((observed - expected)**2 / expected) is returned.

    Bins where both the expected and the observed value are 0 contribute
    nothing to the sum (instead of turning the result into NaN). A bin with
    expected 0 but observed > 0 makes the statistic infinite.

    Raises ValueError if the two dataframes do not have the same number of rows.
    """

    # to_numpy() keeps the comparison positional; Series arithmetic would
    # align on index labels instead.
    expectedCounts = expected['counts'].to_numpy(dtype=float)
    observedCounts = observed['counts'].to_numpy(dtype=float)

    if expectedCounts.shape != observedCounts.shape:
        raise ValueError(
            "expected and observed must have the same number of rows "
            f"({expectedCounts.shape[0]} != {observedCounts.shape[0]})"
        )

    squaredDifference = (observedCounts - expectedCounts) ** 2
    bothZero = (expectedCounts == 0) & (observedCounts == 0)

    # The division is evaluated for every bin before np.where selects, so
    # silence the warnings for the 0/0 and x/0 bins handled explicitly.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = np.where(bothZero, 0.0, squaredDifference / expectedCounts)

    return float(terms.sum())

def calculateChiSquareFromFilenames(expected: str, observed: str) -> float:
    """
    Loads the normalized counts of both files with getCounts and returns
    the chi squared statistic computed by calculateChiSquare.
    """
    # Arguments are evaluated left to right: the expected file is read first.
    return calculateChiSquare(getCounts(expected), getCounts(observed))

def calculateChiSquareFromFilenamesScipy(expected: str, observed: str) -> tuple:
    """
    Runs scipy's chi squared test on the normalized counts of two files.

    Both files are loaded with getCounts; the 'counts' column of `observed`
    is passed as the observed frequencies and the 'counts' column of
    `expected` as the expected frequencies.

    Returns scipy's result object, which indexes like a tuple:
    index 0 is the chi squared statistic and index 1 is the p-value.
    """
    expectedCounts = getCounts(expected)['counts']
    observedCounts = getCounts(observed)['counts']
    # ddof only adjusts the degrees of freedom used for the p-value; the
    # statistic itself does not depend on it.
    # NOTE(review): the reason for choosing ddof=6 is not visible here — confirm.
    return scipy.stats.mstats.chisquare(observedCounts,expectedCounts,ddof=6)

In [65]:
ions_SC = getCounts("data/combined/all_sc.tsv")
# NOTE(review): the name suggests the 0.2 ng dataset, but this loads
# all_2ng.tsv (an all_0.2ng.tsv file is listed in `filenames` further down
# in this notebook) — confirm which file is intended.
ions_02 = getCounts("data/combined/all_2ng.tsv")
# Comparing a distribution with itself; the value is not the last expression
# of the cell, so it is computed but not displayed.
calculateChiSquare(ions_SC,ions_SC)
# Not named `sum`: that would shadow the builtin sum() for every cell run
# afterwards in the same kernel.
countsTotal = 0
countsTotal += ions_SC.iloc[2]['counts']
ions_SC

Unnamed: 0,Intensity,sector,counts,ionType
0,none,1,0.089711,y
1,none,2,0.087303,y
2,none,3,0.114861,y
3,low,1,0.27486,y
4,low,2,0.210009,y
5,low,3,0.025993,y
6,medium,1,0.039517,y
7,medium,2,0.082864,y
8,medium,3,0.007073,y
9,high,1,0.012462,y


In [62]:
# Files compared pairwise when the chi squared statistic matrix is built.
filenames = [
    'data/combined/all_sc.tsv',
    'data/combined/all_0.2ng.tsv',
    'data/combined/all_2ng.tsv',
]

def calcChiSquareStatisticMatrix(filenames: list[str]) -> np.ndarray:
    """
    Builds the pairwise chi squared statistic matrix for a list of files.

    Entry [i, j] is the chi squared statistic returned by
    calculateChiSquareFromFilenamesScipy with filenames[i] as the expected
    file and filenames[j] as the observed file, so the matrix is not
    necessarily symmetric.

    Returns a (len(filenames), len(filenames)) array of floats.
    """
    numFiles = len(filenames)
    statisticMatrix = np.zeros((numFiles,numFiles))
    for i, expectedFile in enumerate(filenames):
        for j, observedFile in enumerate(filenames):
            testResult = calculateChiSquareFromFilenamesScipy(expectedFile, observedFile)
            # Index 0 is the chi squared statistic (index 1 would be the p-value).
            statisticMatrix[i,j] = testResult[0]

    return statisticMatrix

In [63]:
# Chi squared statistics for every (expected, observed) pair of files in `filenames`.
smh2 = calcChiSquareStatisticMatrix(filenames)

In [64]:
# Display the statistic matrix stored in smh2.
smh2

array([[0.        , 0.06485065, 0.0364285 ],
       [0.09098296, 0.        , 0.04732409],
       [0.03932364, 0.04709307, 0.        ]])