# Data Analysis for Lab3 Dataset

## Introduction
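In this notebook, we build a two-level factorial design with center points for concentration and temperature, estimate the main and interaction effects on the measured response, and analyze the yield data for the month of March.
We then visualize the actual and predicted values to observe any discrepancies.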

## Plotting Yield Data

In [15]:
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from lab_functions import run_experiment, add_noise_func

In [16]:
import pandas as pd

# Human-readable label for each factor symbol
inputs_labels = {'c': 'concentration', 'T': 'Temperature'}

# Center-point settings of the experiment
c = 1.9      # mM (concentration)
T = 30.1     # °C (temperature)
rate = 4.46  # mole/s (reaction rate)

# One (name, low, center, high) record per factor; low/high sit at 80% / 120% of the center
dat = [
    (name, 0.80 * center, center, 1.20 * center)
    for name, center in (('c', c), ('T', T))
]

# Tabulate the records, keyed by factor symbol
inputs_df = (
    pd.DataFrame(dat, columns=['index', 'low', 'center', 'high'])
    .set_index('index')
)

# Attach the descriptive label; symbols without a label get an empty string
inputs_df['label'] = [inputs_labels.get(name, '') for name in inputs_df.index]

# Show the factor table
print(inputs_df)

         low  center   high          label
index                                     
c       1.52     1.9   2.28  concentration
T      24.08    30.1  36.12    Temperature


In [17]:
# Code each factor level onto the [-1, 1] scale:
#   coded = (value - average) / span
# where average is the midpoint of low/high and span is half their distance.
# 'average' and 'span' are only needed as intermediates, so they are dropped
# again at the end of the chain.
inputs_df = inputs_df.assign(
    average=lambda d: (d['high'] + d['low']) / 2,
    span=lambda d: (d['high'] - d['low']) / 2,
    encoded_low=lambda d: (d['low'] - d['average']) / d['span'],
    encoded_center=lambda d: (d['center'] - d['average']) / d['span'],
    encoded_high=lambda d: (d['high'] - d['average']) / d['span'],
).drop(columns=['average', 'span'])

# Show the factor table with its coded levels
inputs_df


Unnamed: 0_level_0,low,center,high,label,encoded_low,encoded_center,encoded_high
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c,1.52,1.9,2.28,concentration,-1.0,0.0,1.0
T,24.08,30.1,36.12,Temperature,-1.0,0.0,1.0


In [22]:
import itertools

# Corner points: every (-1 / +1) combination of the two coded factors
encoded_inputs = list(itertools.product((-1, 1), repeat=2))

# Show the corner points on their own
print("Initial encoded inputs:", encoded_inputs)

# Add five replicated center points, coded as (0, 0)
encoded_inputs.extend([(0, 0)] * 5)

# Show the complete design: 4 corners followed by 5 center points
print("Updated encoded inputs:", encoded_inputs)

Initial encoded inputs: [(-1, -1), (-1, 1), (1, -1), (1, 1)]
Updated encoded inputs: [(-1, -1), (-1, 1), (1, -1), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]


In [25]:
# Tabulate the coded design. The first element of each tuple becomes 'T' and
# the second becomes 'c'; the columns are then presented in the order
# ('c', 'T'), so 'c' alternates fastest across the corner points.
results = pd.DataFrame(encoded_inputs, columns=['T', 'c'])[['c', 'T']]

# Show the coded design matrix
print(results)

   c  T
0 -1 -1
1  1 -1
2 -1  1
3  1  1
4  0  0
5  0  0
6  0  0
7  0  0
8  0  0


In [26]:
# Translate the coded design into real-world settings.
# real_experiment keeps the coded columns and gains one column per factor,
# named after the factor's descriptive label.
real_experiment = results.copy()
var_labels = []

for var in inputs_df.index:
    # Descriptive column name for this factor
    var_label = inputs_df.loc[var, 'label']
    var_labels.append(var_label)

    # Negative code -> low setting, positive code -> high setting,
    # anything else (the center points) -> center setting
    real_experiment[var_label] = np.where(
        results[var] < 0,
        inputs_df.loc[var, 'low'],
        np.where(
            results[var] > 0,
            inputs_df.loc[var, 'high'],
            inputs_df.loc[var, 'center'],
        ),
    )

# Show the real-valued settings for every run
print("The values of each real variable in the experiment:")
print(real_experiment[var_labels])


The values of each real variable in the experiment:
   concentration  Temperature
0           1.52        24.08
1           2.28        24.08
2           1.52        36.12
3           2.28        36.12
4           1.90        30.10
5           1.90        30.10
6           1.90        30.10
7           1.90        30.10
8           1.90        30.10


**EXPERIMENT**

In [27]:
# Response values, one per row of the design (4 corner runs, then 5 center runs)
y = [39.3, 40.9, 40.0, 41.5, 40.3, 40.5, 40.7, 40.2, 40.6]

# Store the response next to the coded factors
results = results.assign(y=y)
results

Unnamed: 0,c,T,y
0,-1,-1,39.3
1,1,-1,40.9
2,-1,1,40.0
3,1,1,41.5
4,0,0,40.3
5,0,0,40.5
6,0,0,40.7
7,0,0,40.2
8,0,0,40.6


In [None]:
# Data: 4 corner points and 5 center points.
# Take the coded factors and the response from `results`. The factor columns
# are named 'c' and 'T'; requesting a non-existent 't' column through the
# DataFrame constructor does not fail, it silently yields a column of NaN.
df = results[['c', 'T', 'y']].copy()

# Show the analysis table
df

In [None]:
# Main effect of each factor on the response: the mean response at the high
# (+1) level minus the mean response at the low (-1) level. Center-point runs
# (level 0) form their own group and do not enter the difference.
# The factor columns in `results` are 'c' and 'T'.
labels = ['c', 'T']
print('ybar is', results['y'].mean())
main_effects = {}

print('main effects')
for key in labels:
    # Mean response at each coded level of this factor
    average_effects = results.groupby(key)['y'].mean()
    main_effects[key] = average_effects[1] - average_effects[-1]
print(main_effects)

In [None]:
import itertools

# Every unordered pair of factors listed in `labels`
twoway_labels = list(itertools.combinations(labels, 2))

# Two-way interaction effect per factor pair: the sign-weighted sum of the
# mean responses at the four corner combinations, halved.
twoway_effects = {}
for key in twoway_labels:
    # Mean response for each combination of the two coded levels
    effects = results.groupby(list(key))['y'].mean()

    twoway_effects[key] = sum(
        first * second * effects.loc[(first, second)] / 2
        for first, second in itertools.product((-1, 1), repeat=2)
    )
twoway_effects

In [None]:
# Response vector and coded factor matrix. The factor columns in `results`
# are 'c' and 'T'; selecting 't' would raise a KeyError.
y1 = results['y']
xlabs = ['c', 'T']
x = results[xlabs]

# Ordinary least squares on the coded factors, including the c:T interaction
res1 = smf.ols(formula='y ~ c + T + c:T', data=results).fit()

# Show the regression summary
res1.summary()

In [None]:
# Ordinary least squares with main effects only (no interaction term).
# The formula must use the column names present in `results` ('c' and 'T').
# NOTE: this rebinds `res1`, replacing any model previously stored under that name.
res1 = smf.ols(formula='y ~ c + T', data=results).fit()
print(res1.summary())

In [None]:
# Overlay the actual ('y') and predicted ('ypred') series of the `March`
# DataFrame on one set of axes
ax = March['y'].plot()
March['ypred'].plot(ax=ax)

# Label the axes
ax.set_xlabel('step')
ax.set_ylabel('Yield')


In [None]:
# Plot the actual ('y') and predicted ('ypred') columns of the `March` DataFrame.
# NOTE(review): this cell produces the same plot as the preceding cell — confirm whether one of them can be removed.
# NOTE(review): `March` is not defined in the visible cells — presumably created elsewhere; verify before running.
ax=March['y'].plot()
ax=March['ypred'].plot()
ax.set_xlabel('step')  # x-axis label
ax.set_ylabel('Yield')  # y-axis label
