In [1]:
import os

In [10]:
import pandas as pd

In [5]:
# Build the path to the standards file with os.path.join so the separator
# matches whatever operating system the notebook runs on.
protein_file = os.path.join('CSV_folder', 'protein_assay.csv')

# Read the calibration standards into a DataFrame; leaving the bare name on the
# last line makes the notebook render the table as the cell output.
results_df = pd.read_csv(filepath_or_buffer=protein_file)
results_df

Unnamed: 0,Protein Concentration (mg/mL),A595
0,0.2,0.285
1,0.4,0.485
2,0.6,0.621
3,0.8,0.799
4,1.0,1.01
5,1.2,1.118


In [6]:
# Pull out the two columns needed for the calibration curve in one step:
# xdata is the known protein concentration of each standard and ydata is the
# measured absorbance (the A595 column).
xdata, ydata = (
    results_df['Protein Concentration (mg/mL)'],
    results_df['A595'],
)

# Show both Series so the values can be checked against the table.
print(xdata, ydata)

0    0.2
1    0.4
2    0.6
3    0.8
4    1.0
5    1.2
Name: Protein Concentration (mg/mL), dtype: float64 0    0.285
1    0.485
2    0.621
3    0.799
4    1.010
5    1.118
Name: A595, dtype: float64


# Import the stats module from SciPy to perform the linear regression

In [7]:
from scipy import stats

In [19]:
# Least-squares fit of absorbance (y) against protein concentration (x).
regression = stats.linregress(xdata, ydata)

# Copy the fit statistics from the result object into the names used by later cells.
slope = regression.slope
intercept = regression.intercept
r_value = regression.rvalue
p_value = regression.pvalue
std_err = regression.stderr  # standard error of the estimated slope

# Report the fit. r_value is the correlation coefficient, so it is squared
# to give R-squared.
print("Slope =", slope)
print("Intercept =", intercept)
print("R-squared =", r_value ** 2)
print("P value =", p_value)
print("Standard error =", std_err)

Slope = 0.8454285714285716
Intercept = 0.12786666666666657
R-squared = 0.994690398528738
P value = 1.0590717448341336e-05
Standard error = 0.030884027089284245


# Solving for protein concentrations in samples

In [11]:
# Build the path to the unknown-samples file with os.path.join so the separator
# matches whatever operating system the notebook runs on.
samples_file = os.path.join("CSV_folder", "protein_samples.csv")

# Read the sample absorbances into a DataFrame; leaving the bare name on the
# last line makes the notebook render the table as the cell output.
samples_df = pd.read_csv(filepath_or_buffer=samples_file)
samples_df

Unnamed: 0,Sample,A595
0,1,0.183
1,2,0.682
2,3,0.759
3,4,1.34
4,5,0.935
5,6,1.013


In [12]:
# Invert the calibration line (A595 = slope * concentration + intercept) to get
# each sample's concentration: subtract the intercept, then divide by the slope.
protein_conc = samples_df['A595'].sub(intercept).div(slope)
print(protein_conc)

0    0.065213
1    0.655447
2    0.746525
3    1.433750
4    0.954703
5    1.046964
Name: A595, dtype: float64


# Add the column to the dataframe

Adding a column to a pandas dataframe is simple: use the name of the dataframe followed by the name of the new column, written as a quoted string inside square brackets (for example, `samples_df["ProtConc"]`), and assign the new values to it.

In [17]:
# Store the back-calculated concentrations in a "ProtConc" column. Assigning a
# Series aligns it to samples_df by index label; if the column already exists
# (e.g. the cell is re-run) its values are overwritten.
samples_df["ProtConc"] = protein_conc
# A bare name on the last line makes the notebook display the updated DataFrame.
samples_df

Unnamed: 0,Sample,A595,ProtConc
0,1,0.183,0.065213
1,2,0.682,0.655447
2,3,0.759,0.746525
3,4,1.34,1.43375
4,5,0.935,0.954703
5,6,1.013,1.046964


# Eliminating values outside the calibration curve (optional)

There is one more issue we can address - the absorbance values for two of the unknown samples were outside the range of absorbance values covered by the standards. Therefore, these concentrations are not experimentally valid and should be reported as being “Out of Range”.

We can make this change in the dataframe by using loc to select rows with a condition. The syntax here tells python to work with the samples_df dataframe, use loc to select every row where a conditional test on the A595 value is true, and change the value for ProtConc in those rows to NaN if A595 < 0.285 (the value for the lowest standard) or A595 > 1.118 (the value for the highest standard).

np.nan is a NumPy constant that represents NaN (for not a number). This is typically used when there is no value in a cell, and because NaN is itself a float it preserves the datatype of the column as a float. This is preferred to using a string such as ‘Out of Range’, which would change the datatype of the whole column to object because it would then hold a mix of floats and strings.

In [14]:
import numpy as np

In [15]:
# Flag samples whose absorbance falls outside the span of the standards
# (0.285 is the lowest standard's A595, 1.118 the highest).
too_low = samples_df['A595'] < 0.285
too_high = samples_df['A595'] > 1.118

# Blank the concentration for every flagged row in a single assignment.
samples_df.loc[too_low | too_high, 'ProtConc'] = np.nan
samples_df

Unnamed: 0,Sample,A595,ProtConc
0,1,0.183,
1,2,0.682,0.655447
2,3,0.759,0.746525
3,4,1.34,
4,5,0.935,0.954703
5,6,1.013,1.046964


Optional: Rather than hard-coding the minimum and maximum values for the calibration curve, we could also find them programmatically by calling the .min() and .max() methods on the A595 column of the standards dataframe.

In [18]:
# Take the valid absorbance window straight from the standards instead of
# typing the limits in by hand.
standards_a595 = results_df['A595']
too_low = samples_df['A595'] < standards_a595.min()
too_high = samples_df['A595'] > standards_a595.max()

# Blank the concentration for every sample outside that window.
samples_df.loc[too_low | too_high, 'ProtConc'] = np.nan
samples_df

Unnamed: 0,Sample,A595,ProtConc
0,1,0.183,
1,2,0.682,0.655447
2,3,0.759,0.746525
3,4,1.34,
4,5,0.935,0.954703
5,6,1.013,1.046964
