In [None]:
# Install the MOMENT package directly from its GitHub repository. `--no-deps`
# tells pip not to install the package's declared dependencies; the two
# libraries below are installed explicitly with minimum versions instead.
# NOTE(review): the install is not pinned to a commit or tag, so re-running this
# cell can pick up a different revision than the one recorded in the output.
# NOTE(review): no cell visible in this notebook imports the installed package
# (the model trained below is the locally defined `SimpleMOMENT`) - confirm
# whether this install is still needed.
!pip install git+https://github.com/moment-timeseries-foundation-model/moment.git --no-deps

!pip install "huggingface-hub>=0.24.0"
!pip install "transformers>=4.33.0"

Collecting git+https://github.com/moment-timeseries-foundation-model/moment.git
  Cloning https://github.com/moment-timeseries-foundation-model/moment.git to /tmp/pip-req-build-l99xqi1e
  Running command git clone --filter=blob:none --quiet https://github.com/moment-timeseries-foundation-model/moment.git /tmp/pip-req-build-l99xqi1e
  Resolved https://github.com/moment-timeseries-foundation-model/moment.git to commit 284d7ff16a817cbdb1337ffc7f7fa5146453a50c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Mount Google Drive at /content/drive; the dataset is read from that mount in
# the next cell. `google.colab` makes this cell Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the input CSV on the mounted Google Drive.
file_path = '/content/drive/My Drive/final_dataset.csv'
# Load the full dataset; despite the name, `df_train` is later split into
# train / validation / test partitions.
df_train = pd.read_csv(file_path)

In [None]:
# This notebook trains on the 72-hour horizon, so only the `will_fail_72h`
# label is kept alongside the row identifiers and the four sensor readings.
columns_to_keep = [
    'datetime', 'machineID',                    # row identifiers
    'volt', 'rotate', 'pressure', 'vibration',  # sensor features
    'will_fail_72h'                             # binary target
]

# Drop every other column; `df_train` is rebound to the reduced frame.
df_train = df_train[columns_to_keep]

In [None]:
# Preview the first rows of the reduced frame.
df_train.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,will_fail_72h
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,0
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973,0
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,0
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,0
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,0


In [None]:
import pandas as pd
import numpy as np

# Parse the timestamps, then order rows by machine and chronologically within
# each machine; the index is renumbered after sorting.
df_train['datetime'] = pd.to_datetime(df_train['datetime'])
df_train = df_train.sort_values(by=['machineID', 'datetime']).reset_index(drop=True)

# Chronological split: the 70% and 85% quantiles of the timestamp column are
# global cutoffs shared by all machines (train <= cut1 < val <= cut2 < test).
cut1, cut2 = df_train['datetime'].quantile([0.70, 0.85])
timestamps = df_train['datetime']
in_train = timestamps <= cut1
in_val = (timestamps > cut1) & (timestamps <= cut2)
in_test = timestamps > cut2
train = df_train.loc[in_train].copy()
val = df_train.loc[in_val].copy()
test = df_train.loc[in_test].copy()

# Label column(s) summarised by the class-imbalance report.
targets = ['will_fail_72h']

def describe_split_multi(name, d, targets):
    """Print the size, time span and positive rate of one data split.

    Args:
        name: Label for the split; right-aligned to six characters in the output.
        d: DataFrame with a 'datetime' column and one column per entry of `targets`.
        targets: Iterable of label column names to summarise.

    Returns:
        None. The summary is written to stdout: one header line, then one line
        per target with the sum of the column (as int) and its share of the
        row count (0.0 when the split has no rows).
    """
    row_count = len(d)
    timestamps = d['datetime']
    print(f"{name:>6}: n={row_count:,}  time {timestamps.min()} → {timestamps.max()}")
    for target in targets:
        positives = int(d[target].sum())
        if row_count:
            share = positives / row_count
        else:
            share = 0.0
        print(f"   {target}: positives={positives:,}  rate={share:.3%}")

# Report each split in chronological order; the names carry leading spaces so
# the three labels have the same width before right-alignment.
for split_name, split_frame in (('train', train), ('  val', val), (' test', test)):
    describe_split_multi(split_name, split_frame, targets)


 train: n=613,300  time 2015-01-01 06:00:00 → 2015-09-13 18:00:00
   will_fail_72h: positives=36,928  rate=6.021%
   val: n=131,400  time 2015-09-13 19:00:00 → 2015-11-07 12:00:00
   will_fail_72h: positives=6,594  rate=5.018%
  test: n=131,400  time 2015-11-07 13:00:00 → 2016-01-01 06:00:00
   will_fail_72h: positives=7,213  rate=5.489%


In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Every column that is neither an identifier nor the target is a model input.
cols_to_exclude = ['datetime', 'machineID', 'will_fail_72h']
feature_cols = [name for name in train.columns if name not in cols_to_exclude]

print(f"Scaling {len(feature_cols)} features...")

# Mean/variance are estimated on the training split only, so validation and
# test statistics never influence the scaler.
scaler = StandardScaler().fit(train[feature_cols])

# Work on copies so the unscaled splits stay available, then overwrite the
# feature columns of each copy with the standardised float32 values.
train_scaled, val_scaled, test_scaled = train.copy(), val.copy(), test.copy()
for unscaled, scaled in ((train, train_scaled), (val, val_scaled), (test, test_scaled)):
    scaled[feature_cols] = scaler.transform(unscaled[feature_cols]).astype(np.float32)

print("Scaling completed!")
print("Train scaled shape:", train_scaled.shape)
print("Val scaled shape:", val_scaled.shape)
print("Test scaled shape:", test_scaled.shape)


Scaling 4 features...
Scaling completed!
Train scaled shape: (613300, 7)
Val scaled shape: (131400, 7)
Test scaled shape: (131400, 7)


In [None]:
# Preview the first rows of the standardised training split.
train_scaled.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,will_fail_72h
0,2015-01-01 06:00:00,1,0.349298,-0.533639,1.114366,0.87276,0
1,2015-01-01 07:00:00,1,-0.511021,-0.832952,-0.486021,0.561305,0
2,2015-01-01 08:00:00,1,0.012104,1.533999,-2.32307,-1.157225,0
3,2015-01-01 09:00:00,1,-0.537877,-1.908092,0.766501,0.134827,0
4,2015-01-01 10:00:00,1,-0.850876,-0.213123,1.006148,-2.680961,0


In [None]:
# Leakage sanity check: rank every column by the absolute value of its
# correlation with the target (the target itself trivially ranks first).
target_corr = train_scaled.corr()['will_fail_72h']
correlations = target_corr.abs().sort_values(ascending=False)
print("Top correlated features:")
print(correlations.head(10))

Top correlated features:
will_fail_72h    1.000000
rotate           0.086225
vibration        0.072533
pressure         0.064690
volt             0.056719
machineID        0.022442
datetime         0.007422
Name: will_fail_72h, dtype: float64


In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import gc
from tqdm import tqdm

# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
print(f"Using device: {device}")

# Start from a clean slate: run the garbage collector and, on GPU, release
# cached allocator blocks left over from earlier cells.
gc.collect()
if cuda_available:
    torch.cuda.empty_cache()

# Data preparation for the MOMENT model
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over per-machine rows of a DataFrame.

    Each item is a window of `sequence_length` consecutive rows belonging to a
    single machine, paired with the `will_fail_72h` label of the window's last
    row. Feature columns are all columns except 'datetime', 'machineID' and
    'will_fail_72h'.

    Args:
        data: DataFrame containing 'machineID', 'will_fail_72h' and the feature
            columns. Windows are built over row positions, so rows are expected
            to be grouped by machine and time-ordered - the caller must ensure
            this.
        sequence_length: Number of rows per window.
        stride: Step, in rows, between the starts of successive windows of the
            same machine.
    """

    def __init__(self, data, sequence_length=24, stride=1):
        self.sequence_length = sequence_length
        self.stride = stride

        # Split the frame into float32 feature / label arrays plus the machine id
        # of every row.
        non_feature_columns = ('datetime', 'machineID', 'will_fail_72h')
        sensor_columns = [name for name in data.columns
                          if name not in non_feature_columns]

        self.features = data[sensor_columns].values.astype(np.float32)
        self.labels = data['will_fail_72h'].values.astype(np.float32)
        self.machine_ids = data['machineID'].values

        # For every admissible window, remember the position of its LAST row.
        self.valid_indices = []
        window_span = sequence_length - 1

        for current_machine in data['machineID'].unique():
            rows = np.where(self.machine_ids == current_machine)[0]

            # Machines with fewer rows than one window contribute nothing.
            if len(rows) < sequence_length:
                continue

            for offset in range(0, len(rows) - window_span, stride):
                last_row = rows[offset] + window_span

                # Keep the window only if the machine's rows occupy consecutive
                # positions in the arrays (positional check; timestamps are not
                # inspected).
                if rows[offset + window_span] == last_row:
                    self.valid_indices.append(last_row)

        print(f"Created {len(self.valid_indices)} sequences from {len(data)} records")

    def __len__(self):
        """Return the number of windows."""
        return len(self.valid_indices)

    def __getitem__(self, idx):
        """Return `(window, label)` tensors for window number `idx`.

        `window` has shape (sequence_length, n_features); `label` is the scalar
        `will_fail_72h` value of the window's final row.
        """
        last_row = self.valid_indices[idx]
        first_row = last_row - self.sequence_length + 1

        window = self.features[first_row:last_row + 1]
        target = self.labels[last_row]

        return torch.tensor(window), torch.tensor(target)

# Simplified MOMENT model
class SimpleMOMENT(nn.Module):
    """Small Transformer-encoder binary classifier for fixed-length windows.

    The input is projected to `hidden_dim`, a learned positional table is
    added, the sequence passes through a Transformer encoder, and the encoding
    of the last timestep is mapped to a single raw logit (no sigmoid applied).

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: Width of the encoder (`d_model`).
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate used in the encoder layers and the classifier head.
        max_len: Longest sequence the learned positional table supports.
            Defaults to 100, the size that was previously hard-coded, so
            existing checkpoints keep loading.
    """

    def __init__(self, input_dim, hidden_dim=128, num_heads=4, num_layers=2, dropout=0.1,
                 max_len=100):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Input projection
        self.input_projection = nn.Linear(input_dim, hidden_dim)

        # Learned positional encoding, one row per position up to max_len
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, hidden_dim) * 0.02)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim * 2,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head producing one logit per sequence
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1),
        )

    def forward(self, x):
        """Return one logit per sequence.

        Args:
            x: Tensor of shape (batch_size, sequence_length, input_dim).

        Returns:
            Tensor of shape (batch_size,) holding raw logits.

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        _, seq_len, _ = x.shape

        max_len = self.positional_encoding.shape[1]
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional encoding size {max_len}"
            )

        # Project input
        x = self.input_projection(x)

        # Add positional encoding
        x = x + self.positional_encoding[:, :seq_len, :]

        # Apply transformer
        x = self.transformer(x)

        # Use last timestep for classification
        x = x[:, -1, :]

        # Drop only the trailing singleton dimension. A bare squeeze() would also
        # remove the batch dimension when the batch holds a single sequence,
        # yielding a 0-d tensor that no longer matches the label shape.
        return self.classifier(x).squeeze(-1)


# Training functions
def train_epoch(model, dataloader, criterion, optimizer, device, threshold=0.5):
    """Run one optimisation pass over `dataloader`.

    Args:
        model: Module mapping a batch of sequences to one raw logit per sequence.
        dataloader: Iterable of `(sequences, labels)` batches.
        criterion: Loss taking `(logits, labels)`; the notebook passes
            `nn.BCEWithLogitsLoss`.
        optimizer: Optimiser stepping `model`'s parameters.
        device: Device the batches are moved to.
        threshold: Probability above which a sample is predicted positive.

    Returns:
        Tuple `(mean batch loss, predictions, labels)`; predictions and labels
        are 1-D numpy arrays covering every sample seen in the epoch.
    """
    model.train()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    progress_bar = tqdm(dataloader, desc='Training', leave=False)
    for sequences, labels in progress_bar:
        sequences = sequences.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Flatten to (batch,) so a model that returns a 0-d tensor for a
        # single-sample batch still matches the label shape.
        outputs = model(sequences).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # The model emits logits, so convert to probabilities before
        # thresholding (previously the raw logit was compared with 0.8, which
        # was neither a probability cut-off nor consistent with validation).
        probs = torch.sigmoid(outputs.detach())
        preds = (probs > threshold).float().cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

    return total_loss / len(dataloader), np.array(all_preds), np.array(all_labels)

def validate_epoch(model, dataloader, criterion, device, threshold=0.5):
    """Evaluate `model` on `dataloader` without updating its weights.

    Args:
        model: Module mapping a batch of sequences to one raw logit per sequence.
        dataloader: Iterable of `(sequences, labels)` batches.
        criterion: Loss taking `(logits, labels)`; the notebook passes
            `nn.BCEWithLogitsLoss`.
        device: Device the batches are moved to.
        threshold: Probability above which a sample is predicted positive.

    Returns:
        Tuple `(mean batch loss, predictions, labels, probabilities)`; the last
        three are 1-D numpy arrays. Probabilities are sigmoid(logit), i.e.
        values in [0, 1].
    """
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc='Validation', leave=False)
        for sequences, labels in progress_bar:
            sequences = sequences.to(device)
            labels = labels.to(device)

            # Flatten to (batch,) so a model that returns a 0-d tensor for a
            # single-sample batch still matches the label shape.
            outputs = model(sequences).reshape(-1)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # The model emits logits; apply the sigmoid so the returned scores
            # are real probabilities and the threshold is a probability cut-off
            # (previously raw logits were returned as "probs" and thresholded).
            probs = torch.sigmoid(outputs)
            preds = (probs > threshold).float()
            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return total_loss / len(dataloader), np.array(all_preds), np.array(all_labels), np.array(all_probs)


# Main Training Part
# Parameters (can be adjusted for memory efficiency)
SEED = 42             # Seeds numpy and torch so a re-run reproduces the results
SEQUENCE_LENGTH = 24  # Look back 24 hours
STRIDE = 1
BATCH_SIZE = 64
HIDDEN_DIM = 64
NUM_HEADS = 4
NUM_LAYERS = 2
LEARNING_RATE = 0.001
NUM_EPOCHS = 6
DROPOUT = 0.2
POS_WEIGHT_SCALE = 0.2  # Fraction of the raw neg/pos ratio used as pos_weight

# Seed before the model is built and before the shuffling DataLoader is
# iterated, so weight initialisation and batch order are repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)

print("Creating datasets...")
# Create datasets with stride to reduce memory usage
train_dataset = TimeSeriesDataset(train_scaled, sequence_length=SEQUENCE_LENGTH, stride=STRIDE)
val_dataset = TimeSeriesDataset(val_scaled, sequence_length=SEQUENCE_LENGTH, stride=STRIDE)

# Create data loaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# Calculate positive weight for imbalanced data (counts are taken over the rows
# of the training split)
pos_samples = train_scaled['will_fail_72h'].sum()
neg_samples = len(train_scaled) - pos_samples
if pos_samples == 0:
    # Dividing by zero would silently produce an infinite weight and NaN losses.
    raise ValueError("Training split contains no positive labels; cannot compute pos_weight")
# Possible to play around with POS_WEIGHT_SCALE. The tensor is created as
# float32 to match the model's logits (a numpy scalar would yield float64).
pos_weight = torch.tensor(
    [float(neg_samples) / float(pos_samples) * POS_WEIGHT_SCALE],
    dtype=torch.float32,
    device=device,
)

print(f"Positive weight for loss: {pos_weight.item():.2f}")

# Initialize model; the input width is read from the dataset's feature matrix
# so it always matches what the loader will deliver.
input_dim = train_dataset.features.shape[1]

model = SimpleMOMENT(
    input_dim=input_dim,
    hidden_dim=HIDDEN_DIM,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
).to(device)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Loss and optimizer. BCEWithLogitsLoss expects raw logits from the model.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training loop: after every epoch, checkpoint the weights whenever the
# validation loss reaches a new minimum.
best_val_loss = float('inf')
cuda_available = torch.cuda.is_available()
print("\nStarting training...")

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")

    # One optimisation pass, then one evaluation pass.
    train_loss, train_preds, train_labels = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    val_loss, val_preds, val_labels, val_probs = validate_epoch(
        model, val_loader, criterion, device
    )

    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    # Keep only the best-so-far weights on disk.
    is_new_best = val_loss < best_val_loss
    if is_new_best:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_moment_model.pth')
        print("Saved best model!")

    # Release cached GPU memory between epochs.
    if cuda_available:
        torch.cuda.empty_cache()


# Evaluation
print("\n" + "="*50)
print("VALIDATION EVALUATION")
print("="*50)

# Load the checkpoint with the lowest validation loss. map_location keeps the
# load working when the file was written on a different device.
model.load_state_dict(torch.load('best_moment_model.pth', map_location=device))
model.eval()

# Get final predictions
_, val_preds, val_labels, val_probs = validate_epoch(model, val_loader, criterion, device)

# Integer copies so the explicit labels=[0, 1] below matches the array dtype.
val_true = val_labels.astype(int)
val_pred = val_preds.astype(int)

# Classification report
print("\n=== CLASSIFICATION REPORT ===")
print(classification_report(val_true, val_pred, labels=[0, 1],
                          target_names=['No Failure', 'Failure'],
                          digits=3))

# Confusion matrix. labels=[0, 1] forces a 2x2 matrix even when one class is
# absent, so the four-way unpacking cannot fail.
print("\n=== CONFUSION MATRIX ===")
tn, fp, fn, tp = confusion_matrix(val_true, val_pred, labels=[0, 1]).ravel()
print(f"True Negatives:  {tn:,}")
print(f"False Positives: {fp:,}")
print(f"False Negatives: {fn:,}")
print(f"True Positives:  {tp:,}")

# Business metrics
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"\n=== KEY METRICS ===")
print(f"Recall (Sensitivity): {recall:.3f} - We catch {recall:.1%} of actual failures")
print(f"Precision: {precision:.3f} - When we predict failure, we're right {precision:.1%} of the time")
print(f"F1 Score: {f1:.3f}")

# ROC-AUC. roc_auc_score raises ValueError when only one class is present;
# catch just that instead of hiding every possible error.
try:
    roc_auc = roc_auc_score(val_true, val_probs)
    print(f"ROC-AUC: {roc_auc:.3f}")
except ValueError as exc:
    print(f"ROC-AUC: Could not compute ({exc})")

# Business cost analysis
# NOTE(review): fn/fp count individual windows, not distinct failure events, so
# the totals scale with the number of windows - confirm this is the intended
# cost model.
emergency_cost = 10000
maintenance_cost = 500
total_emergency = fn * emergency_cost
total_maintenance = fp * maintenance_cost

print(f"\n=== BUSINESS IMPACT ===")
print(f"Missed failures: {fn} → ${total_emergency:,}")
print(f"False alarms: {fp} → ${total_maintenance:,}")
print(f"Total cost: ${total_emergency + total_maintenance:,}")


# TEST EVALUATION
print("\n" + "="*50)
print("TEST EVALUATION")
print("="*50)

# Create test dataset and loader. This must happen BEFORE the threshold sweep:
# the sweep reads test_probs / test_labels, which previously were only defined
# further down and therefore raised NameError on a fresh kernel.
print("Creating test dataset...")
test_dataset = TimeSeriesDataset(test_scaled, sequence_length=SEQUENCE_LENGTH, stride=STRIDE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# Get test predictions
_, test_preds, test_labels, test_probs = validate_epoch(model, test_loader, criterion, device)

# Integer copies so the explicit labels=[0, 1] below matches the array dtype.
test_true = test_labels.astype(int)
test_pred = test_preds.astype(int)

# Cost assumptions, defined once for both the sweep and the final report.
emergency_cost = 10000
maintenance_cost = 500

print("\n" + "="*50)
print("THRESHOLD TESTING")
print("="*50)

# NOTE(review): this sweep is run on the test split; choosing an operating
# threshold from it would leak test information - prefer the validation split
# for selection.
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

for threshold in thresholds:
    # Re-threshold the scores returned by validate_epoch
    threshold_preds = (test_probs > threshold).astype(int)

    # Calculate metrics (labels=[0, 1] guarantees a 2x2 matrix)
    sweep_tn, sweep_fp, sweep_fn, sweep_tp = confusion_matrix(
        test_true, threshold_preds, labels=[0, 1]
    ).ravel()

    sweep_recall = sweep_tp / (sweep_tp + sweep_fn) if (sweep_tp + sweep_fn) > 0 else 0
    sweep_precision = sweep_tp / (sweep_tp + sweep_fp) if (sweep_tp + sweep_fp) > 0 else 0
    sweep_f1 = (2 * (sweep_precision * sweep_recall) / (sweep_precision + sweep_recall)
                if (sweep_precision + sweep_recall) > 0 else 0)

    # Business costs
    total_cost = sweep_fn * emergency_cost + sweep_fp * maintenance_cost

    print(f"\nThreshold: {threshold}")
    print(f"Recall: {sweep_recall:.3f} | Precision: {sweep_precision:.3f} | F1: {sweep_f1:.3f}")
    print(f"Missed: {sweep_fn} | False Alarms: {sweep_fp} | Total Cost: ${total_cost:,}")

# Test Classification report (uses the predictions produced by validate_epoch
# at its default threshold)
print("\n=== TEST CLASSIFICATION REPORT (will_fail_72h) ===")
print(classification_report(test_true, test_pred, labels=[0, 1],
                          target_names=['No Failure', 'Failure'],
                          digits=3))

# Test Confusion matrix
print("\n=== TEST CONFUSION MATRIX ===")
test_tn, test_fp, test_fn, test_tp = confusion_matrix(test_true, test_pred, labels=[0, 1]).ravel()
print(f"True Negatives:  {test_tn:,}")
print(f"False Positives: {test_fp:,}")
print(f"False Negatives: {test_fn:,}")
print(f"True Positives:  {test_tp:,}")

# Test Business metrics
test_recall = test_tp / (test_tp + test_fn) if (test_tp + test_fn) > 0 else 0
test_precision = test_tp / (test_tp + test_fp) if (test_tp + test_fp) > 0 else 0
test_f1 = 2 * (test_precision * test_recall) / (test_precision + test_recall) if (test_precision + test_recall) > 0 else 0

print(f"\n=== TEST KEY METRICS ===")
print(f"Recall (Sensitivity): {test_recall:.3f} - We catch {test_recall:.1%} of actual failures")
print(f"Precision: {test_precision:.3f} - When we predict failure, we're right {test_precision:.1%} of the time")
print(f"F1 Score: {test_f1:.3f}")

# Test ROC-AUC. roc_auc_score raises ValueError when only one class is present;
# catch just that instead of hiding every possible error.
try:
    test_roc_auc = roc_auc_score(test_true, test_probs)
    print(f"ROC-AUC: {test_roc_auc:.3f}")
except ValueError as exc:
    print(f"ROC-AUC: Could not compute ({exc})")

# Test Business cost analysis
test_total_emergency = test_fn * emergency_cost
test_total_maintenance = test_fp * maintenance_cost

print(f"\n=== TEST BUSINESS IMPACT ===")
print(f"Missed failures: {test_fn} → ${test_total_emergency:,}")
print(f"False alarms: {test_fp} → ${test_total_maintenance:,}")
print(f"Total cost: ${test_total_emergency + test_total_maintenance:,}")

# Drop the datasets and loaders, then reclaim host memory and, on GPU, the
# allocator's cached blocks.
del train_dataset, val_dataset, test_dataset
del train_loader, val_loader, test_loader
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n Training and evaluation complete!")

Using device: cuda
Creating datasets...
Created 611000 sequences from 613300 records
Created 129100 sequences from 131400 records
Positive weight for loss: 3.12
Model parameters: 75,777

Starting training...

Epoch 1/6




Train Loss: 0.3979 | Val Loss: 0.3470
Saved best model!

Epoch 2/6




Train Loss: 0.3913 | Val Loss: 0.3534

Epoch 3/6




Train Loss: 0.3890 | Val Loss: 0.3492

Epoch 4/6




Train Loss: 0.3879 | Val Loss: 0.3459
Saved best model!

Epoch 5/6




Train Loss: 0.3875 | Val Loss: 0.3617

Epoch 6/6




Train Loss: 0.3871 | Val Loss: 0.3464

VALIDATION EVALUATION





=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

  No Failure      0.959     0.977     0.968    122609
     Failure      0.333     0.218     0.263      6491

    accuracy                          0.939    129100
   macro avg      0.646     0.597     0.616    129100
weighted avg      0.928     0.939     0.933    129100


=== CONFUSION MATRIX ===
True Negatives:  119,780
False Positives: 2,829
False Negatives: 5,079
True Positives:  1,412

=== KEY METRICS ===
Recall (Sensitivity): 0.218 - We catch 21.8% of actual failures
Precision: 0.333 - When we predict failure, we're right 33.3% of the time
F1 Score: 0.263
ROC-AUC: 0.793

=== BUSINESS IMPACT ===
Missed failures: 5079 → $50,790,000
False alarms: 2829 → $1,414,500
Total cost: $52,204,500

TEST EVALUATION

THRESHOLD TESTING

Threshold: 0.3
Recall: 0.696 | Precision: 0.282 | F1: 0.401
Missed: 1448 | False Alarms: 8475 | Total Cost: $18,717,500

Threshold: 0.4
Recall: 0.638 | Precision: 0.290 | F1: 0.3




=== TEST CLASSIFICATION REPORT (will_fail_72h) ===
              precision    recall  f1-score   support

  No Failure      0.957     0.972     0.965    122014
     Failure      0.347     0.255     0.294      7086

    accuracy                          0.933    129100
   macro avg      0.652     0.614     0.629    129100
weighted avg      0.924     0.933     0.928    129100


=== TEST CONFUSION MATRIX ===
True Negatives:  118,610
False Positives: 3,404
False Negatives: 5,277
True Positives:  1,809

=== TEST KEY METRICS ===
Recall (Sensitivity): 0.255 - We catch 25.5% of actual failures
Precision: 0.347 - When we predict failure, we're right 34.7% of the time
F1 Score: 0.294
ROC-AUC: 0.789

=== TEST BUSINESS IMPACT ===
Missed failures: 5277 → $52,770,000
False alarms: 3404 → $1,702,000
Total cost: $54,472,000

 Training and evaluation complete!
