<a href="https://colab.research.google.com/github/mccoymb/AAE-590-DSMM/blob/main/590DSMM_HW6_5_big.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
import numpy as np
import cv2
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
time_values = np.array([25000, 28000, 31000, 35000, 37500, 40000, 45000, 50000, 75000, 108000,
                        180000, 220000, 250000, 270000, 300000, 360000, 430000, 540000, 900000, 1080000])
log_time = np.log(time_values)

In [3]:
data = np.load('/content/drive/My Drive/590TrainingData/train.npz')
print("Keys in npz file:", data.files)
input_raw_data = data['input_raw_data']  # Shape: (200000, 1, 64, 64)


Keys in npz file: ['clips', 'dims', 'input_raw_data']


In [6]:
# Function to compute grain size
def compute_grain_size(image):
    _, binary = cv2.threshold((image * 255).astype(np.uint8), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    areas = [cv2.contourArea(cnt) for cnt in contours if cv2.contourArea(cnt) > 5]  # Ignore small artifacts
    return np.mean(areas) if areas else 1e-6  # Avoid division by zero

# Final results
results = []

# Loop over different sequence counts
for num_sequences in [50, 200, 500]:
    subset_raw_data = input_raw_data[:num_sequences * 20]
    all_grain_sizes = np.zeros((num_sequences, 20))

    for seq in range(num_sequences):
        images = subset_raw_data[seq * 20:(seq + 1) * 20, 0, :, :]
        grain_sizes = np.array([compute_grain_size(img) for img in images])
        first = max(grain_sizes[0], 1)
        all_grain_sizes[seq] = grain_sizes / first

    # Flatten and apply IQR filtering
    X = np.tile(log_time, num_sequences)
    y = all_grain_sizes.flatten()
    Q1, Q3 = np.percentile(y, [25, 75])
    IQR = Q3 - Q1
    mask = (y >= Q1 - 1.5 * IQR) & (y <= Q3 + 1.5 * IQR)
    X_filtered = X[mask]
    y_filtered = y[mask]

    # Normalize inputs
    X_mean = X_filtered.mean()
    X_std = X_filtered.std()
    X_norm = (X_filtered - X_mean) / X_std
    X_input = X_norm.reshape(-1, 1)
    y_output = y_filtered.reshape(-1, 1)

    # ANN model
    model = Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    model.fit(X_input, y_output, epochs=500, batch_size=32, verbose=0)

    # Predict
    y_pred = model.predict(X_input).flatten()

    # Metrics
    r, _ = pearsonr(y_filtered, y_pred)
    r2 = r2_score(y_filtered, y_pred)
    mse = mean_squared_error(y_filtered, y_pred)

    results.append({
        "Sequences": num_sequences,
        "R²": round(r2, 4),
        "MSE": round(mse, 6),
        "Pearson R": round(r, 4)
    })

# Results table
results_df = pd.DataFrame(results)
csv_path = "/content/ann_results_loop.csv"
results_df.to_csv(csv_path, index=False)
print(results_df.to_string(index=False))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
 Sequences     R²      MSE  Pearson R
        50 0.3825 0.042362     0.6398
       200 0.2910 0.076758     0.5473
       500 0.2877 0.091639     0.5377
