# Data Analysis for Lab3 Dataset

## Introduction
In this section, we analyze the yield data for the month of March.
We visualize the actual and predicted values to observe any discrepancies.

## Plotting Yield Data

In [1]:
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np


## Basic values

In [None]:
# Define a dictionary to map input variable names to their descriptive labels
inputs_labels = {
    'c': 'concentration',
    'T': 'Temperature'
}

# Define the initial values for concentration, temperature, and reaction rate
c = 2.0  # mM (concentration)
T = 25.0  # °C (temperature)
rate = 5.77  # mole/s (reaction rate)

## Value intervals

In [12]:
# Create a list of tuples with each variable's low, center, and high values
dat = [
    ('c', 0.80 * c, c, 1.20 * c),  # Concentration: low is 80% of c, center is c, high is 120% of c
    ('T', 0.80 * T, T, 1.20 * T)    # Temperature: low is 80% of T, center is T, high is 120% of T
]

# Create a Pandas DataFrame from the data list, specifying the column names
inputs_df = pd.DataFrame(dat, columns=['index', 'low', 'center', 'high'])

# Set the 'index' column as the DataFrame index for easier access
inputs_df = inputs_df.set_index('index')

# Map the variable labels to the DataFrame index, providing a default empty string for unmapped values
inputs_df['label'] = inputs_df.index.map(lambda z: inputs_labels.get(z, ''))

# Print the resulting DataFrame to display its contents
print(inputs_df)

from computing import computing

real_experiment, results = computing(inputs_df)

c_array  = real_experiment['concentration']
T_array = real_experiment['Temperature']

        low  center  high          label
index                                   
c       1.6     2.0   2.4  concentration
T      20.0    25.0  30.0    Temperature


**EXPERIMENT**

In [13]:
y=[4.093000401645197,4.77827155867177,6.403139782334854,8.199982171532273,5.847007863003852,5.695005284140945,5.584338442337459,5.806999272226782,5.814771183739139]

results['y']= y
results

Unnamed: 0,c,T,y
0,-1,-1,4.093
1,1,-1,4.778272
2,-1,1,6.40314
3,1,1,8.199982
4,0,0,5.847008
5,0,0,5.695005
6,0,0,5.584338
7,0,0,5.806999
8,0,0,5.814771


In [15]:
# Data , 4 corners and 5 center points:

df = pd.DataFrame(results,columns=['c','T','y'])
#inputs_df = inputs_df.set_index(['index'])

#print dataframe
df

Unnamed: 0,c,T,y
0,-1,-1,4.093
1,1,-1,4.778272
2,-1,1,6.40314
3,1,1,8.199982
4,0,0,5.847008
5,0,0,5.695005
6,0,0,5.584338
7,0,0,5.806999
8,0,0,5.814771


In [16]:
# Compute the mean effect of the factor on the response,
# conditioned on each variable
labels = ['c','T']
print('ybar is', results['y'].mean())
main_effects = {}

print('main effects')
for key in labels:
        average_effects = results.groupby(key)['y'].mean()
        main_effects[key] = sum( [i*average_effects[i] for i in [-1,1]])
print(main_effects)

ybar is 5.802501773292475
main effects
{'c': np.float64(1.2410567731119961), 'T': np.float64(2.86592499677508)}


In [17]:
import itertools

twoway_labels = list(itertools.combinations(labels, 2))


twoway_effects = {}
for key in twoway_labels:

    effects = results.groupby([key[0],key[1]])['y'].mean()

    twoway_effects[key] = sum([ i*j*effects[i][j]/2 for i in [-1,1] for j in [-1,1] ])
twoway_effects

{('c', 'T'): np.float64(0.5557856160854229)}

In [7]:
y1 = results['y']
xlabs=['c','T']
x = results[xlabs]


res1 = smf.ols(formula='y ~ c + T + c:T', data=results).fit()

res1.summary()

  return hypotest_fun_in(*args, **kwds)


0,1,2,3
Dep. Variable:,y,R-squared:,0.992
Model:,OLS,Adj. R-squared:,0.988
Method:,Least Squares,F-statistic:,212.9
Date:,"Tue, 08 Oct 2024",Prob (F-statistic):,1.08e-05
Time:,09:19:15,Log-Likelihood:,8.5521
No. Observations:,9,AIC:,-9.104
Df Residuals:,5,BIC:,-8.315
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.8025,0.042,138.682,0.000,5.695,5.910
c,0.6205,0.063,9.887,0.000,0.459,0.782
T,1.4330,0.063,22.832,0.000,1.272,1.594
c:T,0.2779,0.063,4.428,0.007,0.117,0.439

0,1,2,3
Omnibus:,7.69,Durbin-Watson:,1.085
Prob(Omnibus):,0.021,Jarque-Bera (JB):,3.108
Skew:,-1.408,Prob(JB):,0.211
Kurtosis:,3.595,Cond. No.,1.5


In [18]:
#est1 = sm.OLS(y1,x).fit()
res1 = smf.ols(formula='y ~ c + T', data=results).fit()
print(res1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.962
Model:                            OLS   Adj. R-squared:                  0.949
Method:                 Least Squares   F-statistic:                     75.48
Date:                Tue, 08 Oct 2024   Prob (F-statistic):           5.59e-05
Time:                        09:27:07   Log-Likelihood:                 1.3812
No. Observations:                   9   AIC:                             3.238
Df Residuals:                       6   BIC:                             3.829
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.8025      0.085     68.482      0.0

  return hypotest_fun_in(*args, **kwds)


ybar = 5.8025 + 0.6205\*c + 1.4330\*T

In [9]:
def ybar(c,T):
    return 5.8025 + 0.6205*c + 1.4330*T

# 40.444 + 0.775t + 0.325T

y = ybar(1,1)
y

7.856

In [21]:
# Create a list of tuples with each variable's low, center, and high values
dat = [
    ('c', 1.0 * c, c, 1.40 * c),  # Concentration: low is 80% of c, center is c, high is 120% of c
    ('T', 1.0 * T, T, 1.40 * T)    # Temperature: low is 80% of T, center is T, high is 120% of T
]

# Create a Pandas DataFrame from the data list, specifying the column names
inputs_df = pd.DataFrame(dat, columns=['index', 'low', 'center', 'high'])

# Set the 'index' column as the DataFrame index for easier access
inputs_df = inputs_df.set_index('index')

# Map the variable labels to the DataFrame index, providing a default empty string for unmapped values
inputs_df['label'] = inputs_df.index.map(lambda z: inputs_labels.get(z, ''))

# Print the resulting DataFrame to display its contents
print(inputs_df)

from computing import computing

real_experiment, results = computing(inputs_df)

c_array  = real_experiment['concentration']
T_array = real_experiment['Temperature']

print(c_array)
print(T_array)

c = [2.0, 2.8, 2.0, 2.8, 2.0, 2.0, 2.0, 2.0, 2.0]
T = [25.0, 25.0, 35.0, 35.0, 25.0, 25.0, 25.0, 25.0, 25.0]


        low  center  high          label
index                                   
c       2.0     2.0   2.8  concentration
T      25.0    25.0  35.0    Temperature
0    2.0
1    2.8
2    2.0
3    2.8
4    2.0
5    2.0
6    2.0
7    2.0
8    2.0
Name: concentration, dtype: float64
0    25.0
1    25.0
2    35.0
3    35.0
4    25.0
5    25.0
6    25.0
7    25.0
8    25.0
Name: Temperature, dtype: float64


In [10]:
Origin = [0,0]
delta= [1.0, 1.4330/0.6205]
y=[41.0,42.9,47.1,49.7,53.8,59.9,65.0,70.4,77.6,80.3,76.2,75.1]

march=[]
for i in range(0,len(y)):
    march.append((Origin[0]+(i+1)*delta[0],Origin[1]+(i+1)*delta[1]))

March=pd.DataFrame(march,columns=['t', 'T'])
ypred=res1.predict(March)

March['y']=y
March['ypred']=ypred

March

PatsyError: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Number of rows mismatch between data argument and c (12 versus 1)
    y ~ c + T
        ^

In [49]:
# Plotting data for the 'March' DataFrame
# The plot shows actual data ('y') and predicted data ('ypred')
ax=March['y'].plot()
ax=March['ypred'].plot()
ax.set_xlabel('step')  # Set x-axis label as 'step'
ax.set_ylabel('Yield')  # Set y-axis label as 'Yield'


NameError: name 'March' is not defined

In [None]:
# Plotting data for the 'March' DataFrame
# The plot shows actual data ('y') and predicted data ('ypred')
ax=March['y'].plot()
ax=March['ypred'].plot()
ax.set_xlabel('step')  # Set x-axis label as 'step'
ax.set_ylabel('Yield')  # Set y-axis label as 'Yield'
