In [1]:
# =============================
# Step 1: Import Libraries
# =============================
import os
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# =============================
# Step 2: Dataset Path
# =============================
dataset_path = r"C:\Users\Nandini\osv-hybrid\notebooks\SVC2004_Sample"  # <-- update this path

# =============================
# Step 3: Function to Read One Signature File
# =============================
def read_signature_file(filepath):
    with open(filepath, 'r') as f:
        lines = f.readlines()
        n_points = int(lines[0].strip())
        data = [list(map(float, line.strip().split())) for line in lines[1:n_points+1]]
    df = pd.DataFrame(data, columns=['x', 'y', 'timestamp', 'button', 'azimuth', 'altitude', 'pressure'])
    return df

# =============================
# Step 4: Extract Dynamic Features
# =============================
def extract_dynamic_features(df):
    df = df.sort_values(by='timestamp').reset_index(drop=True)
    
    # 1Ô∏è‚É£ Total Duration
    total_time = df['timestamp'].iloc[-1] - df['timestamp'].iloc[0]
    
    # 2Ô∏è‚É£ Average Speed
    dx = np.diff(df['x'])
    dy = np.diff(df['y'])
    dt = np.diff(df['timestamp'])
    distances = np.sqrt(dx**2 + dy**2)
    speeds = distances / (dt + 1e-6)
    avg_speed = np.mean(speeds)
    
    # 3Ô∏è‚É£ Stroke Count (pen-up ‚Üí pen-down)
    button = df['button'].values
    stroke_count = np.sum((button[1:] - button[:-1]) == 1)
    
    # 4Ô∏è‚É£ Direction Changes
    angles = np.arctan2(dy, dx)
    direction_changes = np.sum(np.abs(np.diff(angles)) > np.pi / 4)
    
    return {
        'total_duration': total_time,
        'avg_speed': avg_speed,
        'stroke_count': stroke_count,
        'direction_changes': direction_changes
    }

# =============================
# Step 5: Extract Features from All Files
# =============================
all_features = []

for filename in os.listdir(dataset_path):
    if filename.endswith(".txt"):
        user = filename.split("_")[0]
        sig_num = int(filename.split("_")[1].split(".")[0])
        label = 1 if sig_num <= 20 else 0  # 1 = genuine, 0 = forgery
        df = read_signature_file(os.path.join(dataset_path, filename))
        feat = extract_dynamic_features(df)
        feat['user'] = user
        feat['label'] = label
        all_features.append(feat)

features_df = pd.DataFrame(all_features)
print("‚úÖ Extracted Features Sample:")
display(features_df.head())

# =============================
# Step 6: Split Dataset into Train & Test
# =============================
X = features_df[['total_duration', 'avg_speed', 'stroke_count', 'direction_changes']]
y = features_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# =============================
# Step 7: Train Random Forest Classifier
# =============================
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# =============================
# Step 8: Evaluate the Model
# =============================
y_pred = rf_model.predict(X_test)

print("‚úÖ Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# =============================
# Step 9: Save the Model
# =============================
model_path = "dynamic_rf_model.pkl"
joblib.dump(rf_model, model_path)
print(f"\nüéØ Model saved successfully at: {model_path}")

# =============================
# Step 10: Test the Model on Random Input
# =============================

def predict_random_signature(df):
    feat = extract_dynamic_features(df)
    feat_values = np.array([[feat['total_duration'], feat['avg_speed'], feat['stroke_count'], feat['direction_changes']]])
    pred = rf_model.predict(feat_values)[0]
    return "‚úÖ Genuine Signature" if pred == 1 else "‚ö†Ô∏è Forged Signature"

# Test with a random file
test_file = os.path.join(dataset_path, "USER1_25.txt")  # choose any file (genuine or forgery)
df_test = read_signature_file(test_file)
result = predict_random_signature(df_test)
print("\nüîç Prediction Result:", result)


‚úÖ Extracted Features Sample:


Unnamed: 0,total_duration,avg_speed,stroke_count,direction_changes,user,label
0,1803.0,15.931164,3,14,USER1,1
1,1662.0,22.187465,3,19,USER1,1
2,1672.0,21.919729,3,19,USER1,1
3,1832.0,17.731168,3,17,USER1,1
4,1732.0,21.720494,3,15,USER1,1


‚úÖ Model Evaluation:
Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.80      0.80        25
           1       0.80      0.80      0.80        25

    accuracy                           0.80        50
   macro avg       0.80      0.80      0.80        50
weighted avg       0.80      0.80      0.80        50


Confusion Matrix:
 [[20  5]
 [ 5 20]]

üéØ Model saved successfully at: dynamic_rf_model.pkl

üîç Prediction Result: ‚ö†Ô∏è Forged Signature




In [None]:
#main datset trainig 

In [2]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re

# ============================================================
# STEP 1: CLEAN FILES
# ============================================================
def clean_task1_dataset(raw_folder, clean_folder):
    os.makedirs(clean_folder, exist_ok=True)
    all_files = glob.glob(f"{raw_folder}/**/*.txt", recursive=True)
    valid = 0
    
    for f in all_files:
        try:
            with open(f, 'r', errors='ignore') as file:
                lines = file.readlines()
            if not lines:
                continue
            # Skip first line (number of points)
            lines = lines[1:]
            # Remove empty lines
            lines = [ln for ln in lines if ln.strip()]
            if len(lines) < 5:
                continue
            
            # Save cleaned file
            clean_path = f.replace(raw_folder, clean_folder)
            os.makedirs(os.path.dirname(clean_path), exist_ok=True)
            with open(clean_path, 'w') as wf:
                wf.writelines(lines)
            valid += 1
        except Exception:
            continue
    print(f"‚úÖ Cleaned and saved {valid} valid signature files.")


# ============================================================
# STEP 2: FEATURE EXTRACTION
# ============================================================
def extract_features_from_signature(df):
    try:
        # Keep only first 4 columns: X, Y, T, P
        df = df.iloc[:, :4].copy()
        df.columns = ["X", "Y", "T", "P"]
        df = df.dropna()
        if df.shape[0] < 5:
            return None

        # Ensure numeric
        df = df.apply(pd.to_numeric, errors='coerce').dropna()
        if df["T"].max() == df["T"].min():
            return None
        
        # Compute deltas
        df["dx"] = df["X"].diff().fillna(0)
        df["dy"] = df["Y"].diff().fillna(0)
        df["dt"] = df["T"].diff().replace(0, np.nan).fillna(1)
        
        distances = np.sqrt(df["dx"]**2 + df["dy"]**2)
        speeds = distances / df["dt"]
        
        avg_speed = speeds.mean()
        duration = df["T"].iloc[-1] - df["T"].iloc[0]
        stroke_count = int(((df["P"].shift(1) == 0) & (df["P"] == 1)).sum())
        
        directions = np.arctan2(df["dy"], df["dx"])
        direction_changes = np.sum(np.abs(np.diff(directions)) > np.pi / 4)
        
        return {
            "avg_speed": avg_speed,
            "duration": duration,
            "stroke_count": stroke_count,
            "direction_changes": direction_changes
        }
    except Exception:
        return None


# ============================================================
# STEP 3: BUILD DATASET WITH CORRECT LABELS
# ============================================================
def build_training_dataset(clean_folder):
    files = glob.glob(f"{clean_folder}/**/*.txt", recursive=True)
    features = []
    genuine_count = forgery_count = 0

    for f in files:
        try:
            # Extract user ID x and instance y from filename UxSy.txt
            fname = os.path.basename(f)
            match = re.match(r'U(\d+)S(\d+)\.txt', fname, re.IGNORECASE)
            if not match:
                continue
            user_id = int(match.group(1))
            instance_id = int(match.group(2))

            df = pd.read_csv(f, sep=r"\s+", header=None)
            feat = extract_features_from_signature(df)
            if feat:
                # Label: first 20 instances = genuine, rest = forged
                if 1 <= instance_id <= 20:
                    feat["label"] = 1  # Genuine
                    genuine_count += 1
                else:
                    feat["label"] = 0  # Forged
                    forgery_count += 1
                features.append(feat)
        except Exception:
            continue
    
    feature_df = pd.DataFrame(features)
    feature_df.to_csv("clean_signature_features.csv", index=False)
    print(f"‚úÖ Extracted {len(feature_df)} samples ‚Üí clean_signature_features.csv")
    print(f"   Genuine: {genuine_count} | Forged: {forgery_count}")
    return feature_df


# ============================================================
# STEP 4: TRAIN RANDOM FOREST
# ============================================================
def train_signature_model(df):
    if df.empty or len(df["label"].unique()) < 2:
        print("‚ö†Ô∏è Not enough valid data to train model.")
        return
    
    X = df.drop("label", axis=1)
    y = df["label"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    
    print("\n‚úÖ MODEL TRAINED SUCCESSFULLY ‚úÖ")
    print(f"Accuracy: {acc:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, preds))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, preds))


# ============================================================
# MAIN EXECUTION
# ============================================================
RAW_PATH = r"C:\Users\Nandini\osv-hybrid\notebooks\Task1"     # adjust path
CLEAN_PATH = r"C:\Users\Nandini\osv-hybrid\notebooks\Task1_clean"

# 1. Clean raw files
clean_task1_dataset(RAW_PATH, CLEAN_PATH)

# 2. Extract features and build dataset
df = build_training_dataset(CLEAN_PATH)

# 3. Train Random Forest model
train_signature_model(df)


‚úÖ Cleaned and saved 1600 valid signature files.
‚úÖ Extracted 1600 samples ‚Üí clean_signature_features.csv
   Genuine: 800 | Forged: 800

‚úÖ MODEL TRAINED SUCCESSFULLY ‚úÖ
Accuracy: 0.7719

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.80      0.78       166
           1       0.77      0.75      0.76       154

    accuracy                           0.77       320
   macro avg       0.77      0.77      0.77       320
weighted avg       0.77      0.77      0.77       320


Confusion Matrix:
 [[132  34]
 [ 39 115]]


In [1]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re

# ============================================================
# STEP 1: CLEAN FILES
# ============================================================
def clean_task1_dataset(raw_folder, clean_folder):
    os.makedirs(clean_folder, exist_ok=True)
    all_files = glob.glob(f"{raw_folder}/*.txt")
    valid = 0

    for f in all_files:
        try:
            with open(f, 'r', errors='ignore') as file:
                lines = file.readlines()

            if not lines:
                continue

            lines = lines[1:]  # skip point count
            lines = [ln for ln in lines if ln.strip()]
            if len(lines) < 6:
                continue

            clean_path = f.replace(raw_folder, clean_folder)
            os.makedirs(os.path.dirname(clean_path), exist_ok=True)

            with open(clean_path, 'w') as wf:
                wf.writelines(lines)

            valid += 1
        except:
            continue

    print(f"‚úÖ Cleaned and saved {valid} valid signature files.")



# ============================================================
# STEP 2: FEATURE EXTRACTION
# ============================================================
def extract_features_from_signature(df):
    try:
        df = df.iloc[:, :3].copy()
        df.columns = ["X", "Y", "T"]
        df = df.apply(pd.to_numeric, errors="coerce").dropna()

        if len(df) < 6:
            return None

        dx = np.diff(df["X"])
        dy = np.diff(df["Y"])
        dt = np.diff(df["T"])
        dt[dt == 0] = 1

        # Velocity
        v = np.sqrt(dx**2 + dy**2) / dt

        # Acceleration
        a = np.diff(v) / dt[1:]

        # Direction
        theta = np.arctan2(dy, dx)

        # Angular velocity
        w = np.diff(theta) / dt[1:]

        # Angular acceleration
        alpha = np.diff(w) / dt[2:]

        # Simulated pressure
        pressure = 1 / (v + 1e-6)

        # Torque proxy
        tau = pressure * v

        def stats(arr):
            return np.mean(arr), np.std(arr), np.max(arr)

        stroke_count = np.sum(dt > np.percentile(dt, 90)) + 1
        direction_changes = np.sum(np.abs(np.diff(theta)) > np.pi / 4)

        return {
            "v_mean": stats(v)[0],
            "v_std": stats(v)[1],
            "v_max": stats(v)[2],

            "a_mean": stats(a)[0],
            "a_std": stats(a)[1],
            "a_max": stats(a)[2],

            "w_mean": stats(w)[0],
            "w_std": stats(w)[1],

            "alpha_mean": stats(alpha)[0],
            "alpha_std": stats(alpha)[1],

            "tau_mean": stats(tau)[0],
            "tau_std": stats(tau)[1],
            "tau_max": stats(tau)[2],

            "stroke_count": stroke_count,
            "direction_changes": direction_changes
        }

    except Exception as e:
        return None



# ============================================================
# STEP 3: BUILD DATASET WITH CORRECT LABELS
# ============================================================
def build_training_dataset(clean_folder):
    files = glob.glob(f"{clean_folder}/*.txt")
    rows = []
    genuine_count = forgery_count = 0

    for f in files:
        try:
            fname = os.path.basename(f)
            match = re.match(r'U(\d+)S(\d+)\.txt', fname, re.IGNORECASE)
            if not match:
                continue

            instance_id = int(match.group(2))

            df = pd.read_csv(f, sep=r"\s+", header=None)
            features = extract_features_from_signature(df)

            if features:
                if 1 <= instance_id <= 20:
                    features["label"] = 1
                    genuine_count += 1
                else:
                    features["label"] = 0
                    forgery_count += 1

                rows.append(features)

        except:
            continue

    feature_df = pd.DataFrame(rows)
    feature_df.to_csv("physics_signature_features.csv", index=False)

    print(f"‚úÖ Extracted {len(feature_df)} samples")
    print(f"   Genuine: {genuine_count} | Forged: {forgery_count}")

    return feature_df


# ============================================================
# STEP 4: TRAIN RANDOM FOREST
# ============================================================
def train_signature_model(df):
    if df.empty or len(df["label"].unique()) < 2:
        print("‚ö† Not enough data to train.")
        return

    X = df.drop("label", axis=1)
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        random_state=42
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_test)

    print("\n‚úÖ PHYSICS-BASED RF TRAINED")
    print("Accuracy:", accuracy_score(y_test, preds))
    print("\nReport:\n", classification_report(y_test, preds))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, preds))


# ============================================================
# MAIN EXECUTION
# ============================================================
RAW_PATH = r"C:\Users\Nandini\osv-hybrid\notebooks\Task1"
CLEAN_PATH = r"C:\Users\Nandini\osv-hybrid\notebooks\Task2"

clean_task1_dataset(RAW_PATH, CLEAN_PATH)
df = build_training_dataset(CLEAN_PATH)
train_signature_model(df)


‚úÖ Cleaned and saved 1600 valid signature files.
‚úÖ Extracted 1600 samples
   Genuine: 800 | Forged: 800

‚úÖ PHYSICS-BASED RF TRAINED
Accuracy: 0.76875

Report:
               precision    recall  f1-score   support

           0       0.77      0.78      0.78       166
           1       0.76      0.75      0.76       154

    accuracy                           0.77       320
   macro avg       0.77      0.77      0.77       320
weighted avg       0.77      0.77      0.77       320


Confusion Matrix:
 [[130  36]
 [ 38 116]]


In [2]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import pickle  # ‚úÖ ADD THIS

# ============================================================
# STEP 1: CLEAN FILES
# ============================================================
def clean_task1_dataset(raw_folder, clean_folder):
    os.makedirs(clean_folder, exist_ok=True)
    all_files = glob.glob(f"{raw_folder}/*.txt")
    valid = 0

    for f in all_files:
        try:
            with open(f, 'r', errors='ignore') as file:
                lines = file.readlines()

            if not lines:
                continue

            lines = lines[1:]  # skip point count
            lines = [ln for ln in lines if ln.strip()]
            if len(lines) < 6:
                continue

            clean_path = f.replace(raw_folder, clean_folder)
            os.makedirs(os.path.dirname(clean_path), exist_ok=True)

            with open(clean_path, 'w') as wf:
                wf.writelines(lines)

            valid += 1
        except:
            continue

    print(f"‚úÖ Cleaned and saved {valid} valid signature files.")



# ============================================================
# STEP 2: FEATURE EXTRACTION
# ============================================================
def extract_features_from_signature(df):
    try:
        df = df.iloc[:, :3].copy()
        df.columns = ["X", "Y", "T"]
        df = df.apply(pd.to_numeric, errors="coerce").dropna()

        if len(df) < 6:
            return None

        dx = np.diff(df["X"])
        dy = np.diff(df["Y"])
        dt = np.diff(df["T"])
        dt[dt == 0] = 1

        # Velocity
        v = np.sqrt(dx**2 + dy**2) / dt

        # Acceleration
        a = np.diff(v) / dt[1:]

        # Direction
        theta = np.arctan2(dy, dx)

        # Angular velocity
        w = np.diff(theta) / dt[1:]

        # Angular acceleration
        alpha = np.diff(w) / dt[2:]

        # Simulated pressure
        pressure = 1 / (v + 1e-6)

        # Torque proxy
        tau = pressure * v

        def stats(arr):
            return np.mean(arr), np.std(arr), np.max(arr)

        stroke_count = np.sum(dt > np.percentile(dt, 90)) + 1
        direction_changes = np.sum(np.abs(np.diff(theta)) > np.pi / 4)

        return {
            "v_mean": stats(v)[0],
            "v_std": stats(v)[1],
            "v_max": stats(v)[2],

            "a_mean": stats(a)[0],
            "a_std": stats(a)[1],
            "a_max": stats(a)[2],

            "w_mean": stats(w)[0],
            "w_std": stats(w)[1],

            "alpha_mean": stats(alpha)[0],
            "alpha_std": stats(alpha)[1],

            "tau_mean": stats(tau)[0],
            "tau_std": stats(tau)[1],
            "tau_max": stats(tau)[2],

            "stroke_count": stroke_count,
            "direction_changes": direction_changes
        }

    except Exception as e:
        return None



# ============================================================
# STEP 3: BUILD DATASET WITH CORRECT LABELS
# ============================================================
def build_training_dataset(clean_folder):
    files = glob.glob(f"{clean_folder}/*.txt")
    rows = []
    genuine_count = forgery_count = 0

    for f in files:
        try:
            fname = os.path.basename(f)
            match = re.match(r'U(\d+)S(\d+)\.txt', fname, re.IGNORECASE)
            if not match:
                continue

            instance_id = int(match.group(2))

            df = pd.read_csv(f, sep=r"\s+", header=None)
            features = extract_features_from_signature(df)

            if features:
                if 1 <= instance_id <= 20:
                    features["label"] = 1
                    genuine_count += 1
                else:
                    features["label"] = 0
                    forgery_count += 1

                rows.append(features)

        except:
            continue

    feature_df = pd.DataFrame(rows)
    feature_df.to_csv("physics_signature_features.csv", index=False)

    print(f"‚úÖ Extracted {len(feature_df)} samples")
    print(f"   Genuine: {genuine_count} | Forged: {forgery_count}")

    return feature_df


# ============================================================
# STEP 4: TRAIN RANDOM FOREST + SAVE PKL ‚úÖ
# ============================================================
def train_signature_model(df):
    if df.empty or len(df["label"].unique()) < 2:
        print("‚ö† Not enough data to train.")
        return

    X = df.drop("label", axis=1)
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        random_state=42
    )

    model.fit(X_train, y_train)

    # ‚úÖ SAVE MODEL AS PKL
    with open("rf_model.pkl", "wb") as f:
        pickle.dump(model, f)

    print("‚úÖ Model saved as rf_model.pkl")

    preds = model.predict(X_test)

    print("\n‚úÖ PHYSICS-BASED RF TRAINED")
    print("Accuracy:", accuracy_score(y_test, preds))
    print("\nReport:\n", classification_report(y_test, preds))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, preds))


# ============================================================
# MAIN EXECUTION
# ============================================================
RAW_PATH = r"C:\Users\Nandini\osv-hybrid\notebooks\Task1"
CLEAN_PATH = r"C:\Users\Nandini\osv-hybrid\notebooks\Task2"

clean_task1_dataset(RAW_PATH, CLEAN_PATH)
df = build_training_dataset(CLEAN_PATH)
train_signature_model(df)


‚úÖ Cleaned and saved 1600 valid signature files.
‚úÖ Extracted 1600 samples
   Genuine: 800 | Forged: 800
‚úÖ Model saved as rf_model.pkl

‚úÖ PHYSICS-BASED RF TRAINED
Accuracy: 0.76875

Report:
               precision    recall  f1-score   support

           0       0.77      0.78      0.78       166
           1       0.76      0.75      0.76       154

    accuracy                           0.77       320
   macro avg       0.77      0.77      0.77       320
weighted avg       0.77      0.77      0.77       320


Confusion Matrix:
 [[130  36]
 [ 38 116]]
