<a href="https://colab.research.google.com/github/mccoymb/AAE-590-DSMM/blob/main/590DSMM_HW6_2_large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the training archive loaded in the
# next cell is read from a path underneath this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import numpy as np

# Open the training archive from the mounted Drive folder
data = np.load(
    '/content/drive/My Drive/590TrainingData/train.npz'
)

# List the arrays stored in the archive before picking one
print("Keys in npz file:", data.files)

# Extract the raw frame stack used by the rest of the notebook
input_raw_data = data['input_raw_data']  # Shape: (200000, 1, 64, 64)

Keys in npz file: ['clips', 'dims', 'input_raw_data']


In [21]:
import numpy as np
import cv2
import pandas as pd
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
# Time stamps of the frames within one sequence; the same 20 values apply
# to every sequence in the dataset.
time_values = np.array([
    25_000, 28_000, 31_000, 35_000, 37_500,
    40_000, 45_000, 50_000, 75_000, 108_000,
    180_000, 220_000, 250_000, 270_000, 300_000,
    360_000, 430_000, 540_000, 900_000, 1_080_000,
])

In [23]:
# Function to compute grain size
def compute_grain_size(image, min_area=5):
    """Return the mean contour area of one frame after Otsu binarisation.

    The frame is multiplied by 255, cast to ``uint8``, thresholded with
    Otsu's method, and the areas of the detected contours are averaged.

    Parameters
    ----------
    image : np.ndarray
        A single 2-D frame.
        NOTE(review): the ``* 255`` scaling presumes values in [0, 1];
        values outside that range would wrap on the ``uint8`` cast — confirm
        against the dataset.
    min_area : float, optional
        Contours whose area is not strictly greater than this value are
        discarded. Defaults to 5, the threshold previously hard-coded here.

    Returns
    -------
    float
        Mean area of the retained contours, or ``1e-6`` when no contour
        passes the area filter.
    """
    _, binary = cv2.threshold(
        (image * 255).astype(np.uint8), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    contours, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # Measure each contour once, then filter on the measured value.
    areas = [area for area in map(cv2.contourArea, contours) if area > min_area]
    return np.mean(areas) if areas else 1e-6

# Regressor for the fits below. ``log_time`` was used here without being
# assigned in any cell of this notebook, so a fresh kernel raised NameError.
# NOTE(review): natural log is assumed — confirm the base used originally.
log_time = np.log(time_values)

# Number of frames per sequence; tied to the time axis instead of a bare 20.
FRAMES_PER_SEQUENCE = len(time_values)

# Dataset sizes (in sequences) to evaluate; each one is a prefix of the data.
SEQUENCE_COUNTS = [100, 500, 1000, 10000]

# Prepare results list
results = []

# Fail early with a clear message instead of silently slicing past the end
# of the data (which would later surface as an IndexError on an empty array).
max_sequences = max(SEQUENCE_COUNTS)
available_sequences = len(input_raw_data) // FRAMES_PER_SEQUENCE
if max_sequences > available_sequences:
    raise ValueError(
        f"Requested {max_sequences} sequences but input_raw_data only holds "
        f"{available_sequences} complete sequences of {FRAMES_PER_SEQUENCE} frames"
    )

# Every subset is a prefix of the largest one, so the normalised grain sizes
# are computed once for the largest subset and sliced per experiment below.
normalized_grain_sizes = np.zeros((max_sequences, FRAMES_PER_SEQUENCE))
for seq in range(max_sequences):
    start = seq * FRAMES_PER_SEQUENCE
    imgs = input_raw_data[start:start + FRAMES_PER_SEQUENCE, 0, :, :]
    grain_sizes = np.array([compute_grain_size(img) for img in imgs])
    # Normalise by the first frame's grain size, floored at 1 so that the
    # 1e-6 "no contours" sentinel cannot blow up the ratio.
    first = max(grain_sizes[0], 1)
    normalized_grain_sizes[seq] = grain_sizes / first

# Run the regression for each dataset size
for num_sequences in SEQUENCE_COUNTS:
    all_grain_sizes = normalized_grain_sizes[:num_sequences]

    # One row per frame: the time axis repeats for every sequence.
    X = np.tile(log_time, num_sequences).reshape(-1, 1)
    y = all_grain_sizes.flatten()

    # IQR outlier removal
    Q1, Q3 = np.percentile(y, [25, 75])
    IQR = Q3 - Q1
    mask = (y >= Q1 - 1.5 * IQR) & (y <= Q3 + 1.5 * IQR)
    X_filtered = X[mask]
    y_filtered = y[mask]

    # Models
    models = {
        "Ridge": Ridge(alpha=1.0),
        "Lasso": Lasso(alpha=0.001)
    }

    for name, model in models.items():
        # In-sample fit and metrics
        model.fit(X_filtered, y_filtered)
        y_pred = model.predict(X_filtered)

        r2 = r2_score(y_filtered, y_pred)
        n = len(y_filtered)
        p = 1  # one predictor
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
        mse = mean_squared_error(y_filtered, y_pred)
        # Out-of-sample estimate: mean R² over 5 folds
        cv_r2 = np.mean(cross_val_score(model, X_filtered, y_filtered, cv=5, scoring='r2'))

        results.append({
            "Model": name,
            "Sequences": num_sequences,
            "R²": round(r2, 4),
            "Adj. R²": round(adj_r2, 4),
            "MSE": round(mse, 6),
            "5-Fold CV R²": round(cv_r2, 4)
        })

# Save to CSV
results_df = pd.DataFrame(results)
csv_path = "/content/ridge_lasso_bigresults.csv"
results_df.to_csv(csv_path, index=False)
print(results_df)

   Model  Sequences      R²  Adj. R²       MSE  5-Fold CV R²
0  Ridge        100  0.3639   0.3635  0.050353        0.3410
1  Lasso        100  0.3639   0.3635  0.050354        0.3410
2  Ridge        500  0.2817   0.2816  0.092415        0.2727
3  Lasso        500  0.2817   0.2816  0.092416        0.2727
4  Ridge       1000  0.2746   0.2745  0.113206        0.2534
5  Lasso       1000  0.2746   0.2745  0.113206        0.2534
6  Ridge      10000  0.2741   0.2741  0.118927        0.2740
7  Lasso      10000  0.2741   0.2741  0.118928        0.2740
