In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Load your dataset
# Update the path to your actual file location
df = pd.read_csv('E:/satoshi-squad-yantra-2026\ML_Models\ML_Pharamachain\data\LSTM_data\LSTM_demo.csv')
print(f"--- Data Loaded --- \nRows: {df.shape[0]}, Columns: {df.shape[1]}")

# 2. Temporal Sorting
# 'scanned_at' is used to order the drug's journey correctly
df['scanned_at'] = pd.to_datetime(df['scanned_at'])
df = df.sort_values(['id', 'scanned_at'])
print("Success: Sorted data by 'id' and 'scanned_at' for chronological sequences.")

# 3. Encode Categorical Data
# Converting User IDs and Statuses into numbers the model can process
le = LabelEncoder()
df['user_enc'] = le.fit_transform(df['scanner_user_id'].astype(str))
df['status_enc'] = le.fit_transform(df['verification_status'].astype(str))
print("Success: Encoded 'scanner_user_id' and 'verification_status' into numeric labels.")

# 4. Feature Scaling
# Normalizing latitude and longitude to help the LSTM learn geospatial patterns
num_cols = ['latitude', 'longitude']
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
print(f"Success: Normalized numerical columns: {num_cols}")

# 5. Define Feature Set
# We use the encoded categories + the normalized coordinates
features = ['user_enc', 'status_enc', 'latitude', 'longitude']
print(f"Final feature set for LSTM: {features}")

--- Data Loaded --- 
Rows: 60, Columns: 8
Success: Sorted data by 'id' and 'scanned_at' for chronological sequences.
Success: Encoded 'scanner_user_id' and 'verification_status' into numeric labels.
Success: Normalized numerical columns: ['latitude', 'longitude']
Final feature set for LSTM: ['user_enc', 'status_enc', 'latitude', 'longitude']


In [10]:
# --- ADD THIS BEFORE CREATING SEQUENCES ---
# Convert anomaly_flags from strings to numbers (0 and 1)
le_label = LabelEncoder()
df['anomaly_flags'] = le_label.fit_transform(df['anomaly_flags'].astype(str))
print(f"DEBUG: Encoded labels found: {le_label.classes_}")
# ------------------------------------------

def create_sequences(df, seq_length=1):
    X, y = [], []
    for item_id, group in df.groupby('id'):
        data = group[features].values
        # These are now guaranteed to be integers
        labels = group['anomaly_flags'].values 
        
        if len(data) >= seq_length:
            for i in range(len(data) - seq_length + 1):
                X.append(data[i:i + seq_length])
                y.append(labels[i + seq_length - 1])
    
    return np.array(X), np.array(y)

# Re-run the sequence creation
X_seq, y_seq = create_sequences(df, seq_length=1)

# Now these lines will work without the TypeError
X_tensor = torch.FloatTensor(X_seq)
y_tensor = torch.FloatTensor(y_seq.astype(float)).view(-1, 1) # Force float type for PyTorch

# Initialize the loader
from torch.utils.data import DataLoader, TensorDataset
loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=32, shuffle=True)
print("✅ Success: DataLoader is ready with numeric labels!")

DEBUG: Encoded labels found: ['["Rapid scanning detected","Geographic anomaly: 450km in 10 min"]' '[]']
✅ Success: DataLoader is ready with numeric labels!


In [11]:
import torch.nn as nn

class PharmaChainLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, layer_dim=2):
        super(PharmaChainLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # LSTM Layer: batch_first=True means input is (batch, seq, feature)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=0.2)

        # Fully connected layer to convert hidden state to a single output
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Initialize hidden and cell states with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)

        # Forward propagate LSTM
        out, (hn, cn) = self.lstm(x, (h0, c0))

        # We only care about the last time step's output
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out)

# Initialize the model using the number of features we defined earlier
model = PharmaChainLSTM(input_dim=len(features))
print(model)

PharmaChainLSTM(
  (lstm): LSTM(4, 64, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [12]:
# Binary Cross Entropy Loss
criterion = nn.BCELoss()

# Adam optimizer is usually the best "all-rounder" for LSTMs
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print("Optimizer and Loss Function ready.")

Optimizer and Loss Function ready.


In [16]:
epochs = 60

print(f"--- Starting Training on {len(X_tensor)} sequences ---")

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    
    for batch_X, batch_y in loader:
        # 1. Clear gradients
        optimizer.zero_grad()
        
        # 2. Forward pass
        outputs = model(batch_X)
        
        # 3. Calculate Loss
        loss = criterion(outputs, batch_y)
        
        # 4. Backward pass (Backpropagation)
        loss.backward()
        
        # 5. Update weights
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(loader)
    
    # Print progress every epoch
    print(f"Epoch [{epoch+1}/{epochs}] | Loss: {avg_loss:.4f}")

print("\n✅ Training Complete!")

--- Starting Training on 60 sequences ---
Epoch [1/60] | Loss: 0.0346
Epoch [2/60] | Loss: 0.0320
Epoch [3/60] | Loss: 0.0339
Epoch [4/60] | Loss: 0.0289
Epoch [5/60] | Loss: 0.0260
Epoch [6/60] | Loss: 0.0258
Epoch [7/60] | Loss: 0.0294
Epoch [8/60] | Loss: 0.0218
Epoch [9/60] | Loss: 0.0247
Epoch [10/60] | Loss: 0.0251
Epoch [11/60] | Loss: 0.0176
Epoch [12/60] | Loss: 0.0183
Epoch [13/60] | Loss: 0.0180
Epoch [14/60] | Loss: 0.0209
Epoch [15/60] | Loss: 0.0179
Epoch [16/60] | Loss: 0.0145
Epoch [17/60] | Loss: 0.0166
Epoch [18/60] | Loss: 0.0138
Epoch [19/60] | Loss: 0.0135
Epoch [20/60] | Loss: 0.0134
Epoch [21/60] | Loss: 0.0125
Epoch [22/60] | Loss: 0.0096
Epoch [23/60] | Loss: 0.0134
Epoch [24/60] | Loss: 0.0111
Epoch [25/60] | Loss: 0.0100
Epoch [26/60] | Loss: 0.0118
Epoch [27/60] | Loss: 0.0101
Epoch [28/60] | Loss: 0.0099
Epoch [29/60] | Loss: 0.0116
Epoch [30/60] | Loss: 0.0080
Epoch [31/60] | Loss: 0.0084
Epoch [32/60] | Loss: 0.0112
Epoch [33/60] | Loss: 0.0106
Epoch [34/

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch

# 1. Split into Training and Testing (80/20)
# We use 'stratify' to ensure the test set has fakes to catch!
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_seq, test_size=0.2, random_state=42, stratify=y_seq
)

# 2. Convert to Tensors
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test).view(-1, 1)

# 3. Put Model in Eval Mode (Disables Dropout)
model.eval()

# 4. Generate Predictions
with torch.no_grad():
    # Get raw probabilities (0.0 to 1.0)
    raw_probs = model(X_test_tensor)
    # Convert probabilities to binary (0 or 1) using a 0.5 threshold
    predictions = (raw_probs > 0.5).float()

# 5. Print Statistical Report
print("\n" + "="*30)
print("   LSTM TEST PERFORMANCE")
print("="*30)
print(classification_report(y_test, predictions.numpy(), target_names=['Normal', 'Anomaly']))

print("\n--- Confusion Matrix ---")
print(confusion_matrix(y_test, predictions.numpy()))


   LSTM TEST PERFORMANCE
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00         3
     Anomaly       1.00      1.00      1.00         9

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12


--- Confusion Matrix ---
[[3 0]
 [0 9]]


In [27]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta

def generate_compatible_test_data(original_df, num_items=50):
    # 1. Get the list of User IDs the model already knows
    known_users = original_df['scanner_user_id'].unique()
    known_batches = original_df['batch_id'].unique()
    
    data = []
    # Geographical centers for testing logic
    locations = {
        "Base": (40.7128, -74.0060),  # NYC Area
        "Anomaly_Point": (19.0760, 72.8777) # Mumbai (for "impossible travel")
    }
    
    for _ in range(num_items):
        item_id = str(uuid.uuid4()) # New unique ID for the drug unit
        batch_id = np.random.choice(known_batches)
        
        # Determine if this item's journey will be an Anomaly
        is_anomaly = np.random.choice([0, 1], p=[0.7, 0.3])
        base_time = datetime.now()
        
        # Create a sequence of 3 scans
        for i in range(3):
            # Pick a user ID the model definitely knows
            user_id = np.random.choice(known_users)
            
            if is_anomaly and i == 2:
                # SCENARIO: The drug was in NYC, but 1 hour later it's in Mumbai
                lat, lon = locations["Anomaly_Point"]
                status = "suspicious"
                flags = '["Geographic anomaly: Impossible Speed"]'
                timestamp = base_time + timedelta(hours=1)
            else:
                # SCENARIO: Normal movement around NYC
                lat, lon = locations["Base"]
                lat += np.random.uniform(-0.05, 0.05)
                lon += np.random.uniform(-0.05, 0.05)
                status = "authentic"
                flags = "[]"
                timestamp = base_time + timedelta(days=i)
            
            data.append([
                item_id, batch_id, user_id, status, lat, lon, is_anomaly, timestamp
            ])
            
    return pd.DataFrame(data, columns=['id', 'batch_id', 'scanner_user_id', 'verification_status', 'latitude', 'longitude', 'anomaly_flags', 'scanned_at'])

# Generate the data using your original df to pull user IDs
new_test_df = generate_compatible_test_data(df)

# Save it to your specific path
save_path = r'E:/satoshi-squad-yantra-2026\ML_Models\ML_Pharamachain\data\LSTM_data\new_LSTM.csv'
new_test_df.to_csv(save_path, index=False)

print(f"✅ Created compatible test dataset at: {save_path}")
print(f"Reused {new_test_df['scanner_user_id'].nunique()} known User IDs.")

✅ Created compatible test dataset at: E:/satoshi-squad-yantra-2026\ML_Models\ML_Pharamachain\data\LSTM_data\new_LSTM.csv
Reused 2 known User IDs.


In [30]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix

# 1. LOAD THE TEST DATA
test_path = r'E:/satoshi-squad-yantra-2026\ML_Models\ML_Pharamachain\data\LSTM_data\new_LSTM.csv'
test_df = pd.read_csv(test_path)

# 2. DEFINE SAFE ENCODING FUNCTION
def safe_label_encode(le, series):
    """Encodes labels and handles unseen values by mapping them to 0."""
    valid_labels = set(le.classes_)
    # Replace unseen labels with the first known label (or a default) to prevent crash
    safe_series = series.apply(lambda x: x if x in valid_labels else le.classes_[0])
    return le.transform(safe_series.astype(str))

print("--- Preprocessing Test Data ---")

# 3. APPLY PREPROCESSING
test_df['scanned_at'] = pd.to_datetime(test_df['scanned_at'])
test_df = test_df.sort_values(['id', 'scanned_at'])

# Use the safe encoder for categories
test_df['user_enc'] = safe_label_encode(le, test_df['scanner_user_id'])
test_df['status_enc'] = safe_label_encode(le, test_df['verification_status'])

# Scale coordinates using the scaler from training
test_df[['latitude', 'longitude']] = scaler.transform(test_df[['latitude', 'longitude']])

# 4. CREATE SEQUENCES
features = ['user_enc', 'status_enc', 'latitude', 'longitude']

def create_eval_sequences(df, seq_length=3):
    X, y = [], []
    for item_id, group in df.groupby('id'):
        data = group[features].values
        # Ensure labels are numeric
        labels = group['anomaly_flags'].values 
        
        if len(data) >= seq_length:
            for i in range(len(data) - seq_length + 1):
                X.append(data[i:i + seq_length])
                y.append(labels[i + seq_length - 1])
    return np.array(X), np.array(y)

X_test_seq, y_test_seq = create_eval_sequences(test_df, seq_length=3)

# 5. CONVERT TO TENSORS AND PREDICT
if len(X_test_seq) > 0:
    X_test_tensor = torch.FloatTensor(X_test_seq)
    y_test_tensor = torch.FloatTensor(y_test_seq.astype(float)).view(-1, 1)

    model.eval()
    with torch.no_grad():
        logits = model(X_test_tensor)
        predictions = (logits > 0.5).float()

    # 6. FINAL RESULTS
    print("\n" + "="*40)
    print("       STRESS TEST RESULTS")
    print("="*40)
    print(classification_report(y_test_tensor.numpy(), predictions.numpy(), 
                                target_names=['Authentic', 'Anomaly'],
                                zero_division=0))
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test_tensor.numpy(), predictions.numpy()))
else:
    print("Error: No sequences created. Check if each 'id' has at least 3 scans.")

--- Preprocessing Test Data ---

       STRESS TEST RESULTS
              precision    recall  f1-score   support

   Authentic       0.00      0.00      0.00        36
     Anomaly       0.28      1.00      0.44        14

    accuracy                           0.28        50
   macro avg       0.14      0.50      0.22        50
weighted avg       0.08      0.28      0.12        50


Confusion Matrix:
[[ 0 36]
 [ 0 14]]
