<a href="https://colab.research.google.com/github/oakati/Estimating-pH-of-Solutions-Using-pH-Indicators-and-Absorbance-Spectroscopy/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [13]:
# @title Libraries
# Importing Libraries
# Linear Regression is imported from the sklearn library
from sklearn.linear_model import LinearRegression
# mean_squared_error and mean_absolute_error are imported from sklearn.metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
# mean is imported from statistics library
from statistics import mean
# interp1d is imported from scipy.interpolate library
from scipy.interpolate import interp1d
# drive is imported from google.colab library
from google.colab import drive
# pandas, numpy, matplotlib, seaborn, math libraries are imported
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from math import ceil, floor
# savgol_filter is imported from scipy.signal library
from scipy.signal import savgol_filter

# Functions

In [14]:
# @title Functions

def drive_mount():
    """
    Attach Google Drive to the runtime at '/content/drive' so that the
    input csv file stored there can be read.
    """
    mount_point = '/content/drive'
    drive.mount(mount_point)

def is_float(value):
    """
    Tell whether a value can be converted to a float.
    value: object - candidate value (e.g. a column label read from a csv header)
    return: bool - True if float(value) succeeds, False otherwise
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: text that is not a number
        # TypeError: objects float() cannot handle at all (None, lists, ...)
        # OverflowError: integers too large to be represented as a float
        return False
    return True
def read_csv(fid, inputFile):
    """
    Load the input csv file (two header rows) and relabel its columns.
    fid: str - folder that contains the csv file
    inputFile: str - name of the csv file
    return: tuple - (df, pHs) where df has (pH, "X") / (pH, "Y") column
            pairs with negative values clipped to 0, and pHs is the list
            of pH values found in the first header row
    """
    # Full path of the csv file
    csv_path = "/".join((fid, inputFile))
    # The first two rows of the file form a two-level column header
    df = pd.read_csv(csv_path, header=[0, 1])

    # Top-level header labels that parse as numbers are the pH values
    top_labels = [column[0] for column in df.columns]
    pHs = [float(label) for label in top_labels if is_float(label)]

    # Every pH owns two consecutive columns: "X" followed by "Y"
    new_columns = [(pH, axis_name) for pH in pHs for axis_name in ("X", "Y")]
    df.columns = pd.MultiIndex.from_tuples(new_columns)

    # Negative values are replaced with 0
    clipped = df.clip(lower=0)
    return clipped, pHs

def change_colnames(colnames):
    """
    Replace every occurrence of 'pH ' in the column names with 'X'
    colnames: Index - dataframe columns holding string labels
    return: Index - the renamed labels; the input object is left unchanged
    """
    # .str.replace builds a new object instead of editing colnames in place,
    # so the result must be returned for the caller to be able to use it.
    # regex=False: 'pH ' is meant as literal text, not as a pattern.
    return colnames.str.replace('pH ', 'X', regex=False)

# @title ip_limits
def ip_limits(df=None, start=375, stop=575, step=5):
    """
    This function is used to create an array of limits of the interpolated points.
    df: DataFrame - input dataframe (not used; kept so existing calls keep working)
    start: number - first value of the grid (default 375)
    stop: number - upper limit of the grid; it is part of the result when
          start + k * step lands exactly on it (default 575)
    step: number - spacing between consecutive grid values (default 5)
    return: numpy array - float array of the interpolation points
    """
    # np.arange excludes its upper bound, hence stop + 1 to keep stop itself
    return np.arange(start, stop + 1, step, dtype=float)

# @title interpolation
def interpolation(pHs, df, df_ip):
    """
    This function is used to perform interpolation on the input dataframe.
    pHs: List - list of pH values
    df: DataFrame - input dataframe with (pH, 'X') and (pH, 'Y') columns
    df_ip: DataFrame - dataframe to hold the interpolated values; its index
           supplies the points at which every curve is evaluated
    return: DataFrame - df_ip, filled in place with one column per pH

    interp1d is used with its default settings, so evaluation points that
    lie outside the measured X range of a curve raise a ValueError.
    """
    xp = df_ip.index
    for pH in pHs:
        x = df[pH, 'X']
        y = df[pH, 'Y']
        # Keep only the rows where both coordinates are present: a missing
        # X paired with a valid Y would otherwise be handed to interp1d.
        valid = x.notna() & y.notna()
        f = interp1d(x[valid], y[valid])
        df_ip[pH] = f(xp)
    return df_ip

# @title background_correction
def background_correction(df):
    """
    Shift every row of the dataframe so that its smallest value becomes 0.
    df: DataFrame - input dataframe
    return: DataFrame - new dataframe with each row's minimum subtracted from that row
    """
    row_minimum = df.min(axis=1)
    corrected = df.subtract(row_minimum, axis="index")
    return corrected

# @title drop_nonmonotone_wavelengths
def drop_nonmonotone_wavelengths(df, pHs):
    """
    Keep only the rows whose values in the columns pHs[1] ... pHs[5] are all non-zero.
    df: DataFrame - input dataframe with one column per pH value
    pHs: List - list of pH values (at least six entries are required)
    return: DataFrame - the rows of df that pass the check
    """
    keep = df[pHs[1]] != 0
    for position in range(2, 6):
        # A zero in any of the inspected columns removes the row
        keep = keep & (df[pHs[position]] != 0)
    return df.loc[keep]

# @title normalization_by_maxVal
def normalization_by_maxVal(df):
    """
    Scale every row of the dataframe by that row's largest value.
    df: DataFrame - input dataframe
    return: DataFrame - new dataframe in which each row has been divided by its maximum
    """
    row_maximum = df.max(axis=1)
    normalized = df.divide(row_maximum, axis="index")
    return normalized

def all_lambda_combinations(l1_, l2_, df_new):
    """
    Fill df_new with one result per pair of rows taken from l1_ and l2_.
    l1_: DataFrame - its rows are the denominators of the ratio
    l2_: DataFrame - its rows are the numerators of the ratio; its column
         labels are used as numeric reference values
    df_new: DataFrame - frame whose cell [i, j] receives the result for
            row i of l1_ and row j of l2_
    return: DataFrame - df_new, filled in place

    For every pair, log10(row of l2_ / row of l1_) is computed and shifted
    so that its first entry lines up with the first column label of l2_.
    That first entry is then removed and the remaining values are stored.
    """
    column_labels = np.array(l2_.columns)
    for i, denominators in l1_.iterrows():
        for j, numerators in l2_.iterrows():
            log_ratio = np.log10(numerators / denominators).values
            shifted = column_labels[0] + (log_ratio - log_ratio[0])
            # Entry 0 is the anchor of the shift, so only the rest is kept
            df_new.at[i, j] = np.delete(shifted, 0)
    return df_new

def drop_inf_columns(l1, l2, inf_cols):
    """
    Remove the same set of columns from two dataframes.
    l1: DataFrame - first dataframe
    l2: DataFrame - second dataframe
    inf_cols: List - labels of the columns to remove from both dataframes

    return: tuple - (l1 without inf_cols, l2 without inf_cols); the inputs
            themselves are not modified
    """
    trimmed_first = l1.drop(columns=inf_cols)
    trimmed_second = l2.drop(columns=inf_cols)
    return trimmed_first, trimmed_second

def drop_points_near_isosbestic(df, pHs):
    """
    Select two groups of rows based on where a row is exactly 0.
    df: DataFrame - input dataframe with one column per pH value
    pHs: List - list of pH values
    return: tuple - (rows whose value in the highest-pH column is 0,
            rows whose value in the lowest-pH column is 0)
    """
    highest_pH = max(pHs)
    lowest_pH = min(pHs)
    zero_at_highest = df.loc[df[highest_pH] == 0]
    zero_at_lowest = df.loc[df[lowest_pH] == 0]
    return zero_at_highest, zero_at_lowest
def red_sensor_colors(ax, m_list):
    """
    Color in red every tick label whose text, read as an integer, is in m_list.
    ax: axes object - its x tick labels are processed first, then its y tick labels
    m_list: List - integer values whose tick labels are highlighted
    """
    for fetch_labels in (ax.get_xticklabels, ax.get_yticklabels):
        for label in fetch_labels():
            if int(label.get_text()) in m_list:
                label.set_color('r')

# Main

In [15]:
# Google Drive folder where the input csv file is located
fid = "/content/drive/My Drive/boun/5_1boun/EE492/indicator_data/pool"

# Name of the input csv file inside that folder
inputFile = "sp_data.csv"
# Mount Google Drive so that the path above can be read
drive_mount()

# Read the csv file: df has (pH, "X") / (pH, "Y") columns, pHs lists the pH values from the header
df, pHs = read_csv(fid, inputFile)

# Interpolate every pH curve onto the common grid returned by ip_limits (one column per pH)
df_ip = pd.DataFrame(index=ip_limits(df),columns=pHs)
df_ip = interpolation(pHs, df, df_ip)

# Background correction: subtract each row's minimum from that row
df_bc = background_correction(df_ip)

# Normalize each row by its maximum value
df_nm = normalization_by_maxVal(df_bc)

# l1: rows that are 0 at the highest pH; l2: rows that are 0 at the lowest pH
l1, l2 = drop_points_near_isosbestic(df_nm, pHs)

# Empty dataframe with columns from l2's index and index from l1's index (both cast to int)
df_hm = pd.DataFrame(columns=l2.index.values.astype(int), index=l1.index.values.astype(int))

# Drop the lowest- and highest-pH columns from l1 and l2 and store the result in l1_ and l2_.
# These are the columns that hold the zeros selected above, which would give infinite log-ratios later.
l1_, l2_ = drop_inf_columns(l1, l2, [min(pHs), max(pHs)])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# For every (row of l1_, row of l2_) pair, store the shifted log10-ratio result in df_hm
df_hm = all_lambda_combinations(l1_, l2_, df_hm)

# Convert the stored cell values to float so that they can be averaged
df_hm = df_hm.astype(float)

# Take the mean as the final estimate (mean over the column means)
df_hm.mean().mean()

#Reference to the estimation method
# https://chemlab.truman.edu/files/2015/07/pka1.pdf

8.113650948714234