# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Input, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
import warnings
# Install a blanket "ignore" filter so warnings are not printed from here on.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported above, which can mask API changes.
warnings.filterwarnings('ignore')

# Load and preprocess data
# =======================
# Colab-only helper: `files.upload()` prompts for a file in the notebook UI.
# The import fails when the script is run outside Google Colab.
from google.colab import files
uploaded = files.upload()  # return value is not referenced again in this file

# Assume the file is named 'production.csv' and sits in the working directory
# NOTE(review): main() assigns its own local `df`, so this module-level frame
# is not the one main() operates on.
df = pd.read_csv('production.csv')


# Data preparation
# ================
def prepare_data(df):
    """Turn the raw password table into padded char sequences and one-hot labels.

    Args:
        df: DataFrame with a ``password`` column (text) and a ``strength``
            column holding integer class ids (0-based, as ``to_categorical``
            expects).

    Returns:
        Tuple ``(X_train_pad, X_test_pad, y_train_cat, y_test_cat, max_len,
        vocab_size)``. The ``X_*`` arrays are post-padded integer sequences of
        width ``max_len``; the ``y_*`` arrays are one-hot encoded with the same
        number of columns; ``vocab_size`` is the tokenizer's token count plus
        one for the padding index 0.
    """
    # A missing password is read as float NaN, which the char-level tokenizer
    # cannot iterate; a missing label cannot be stratified or encoded.
    df = df.dropna(subset=['password', 'strength'])
    # Force text so passwords that pandas parsed as numbers are tokenized too.
    X = df['password'].astype(str)
    y = df['strength']

    # Split data (80/20, stratified so both parts keep the class proportions)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Tokenize characters.
    # lower=False: the Tokenizer lowercases by default, which would make 'A'
    # and 'a' the same token and erase the mixed-case signal.
    # oov_token: characters that only occur in the test split are mapped to a
    # dedicated index instead of being silently dropped from the sequence.
    tokenizer = Tokenizer(char_level=True, lower=False, oov_token='<OOV>')
    tokenizer.fit_on_texts(X_train)

    # Convert text to sequences
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    # Pad sequences to the longest training password
    max_len = max(len(x) for x in X_train_seq)
    X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

    # Prepare labels. Fix the width from the full label column so the train
    # and test matrices cannot end up with different numbers of columns.
    num_classes = int(y.max()) + 1
    y_train_cat = to_categorical(y_train, num_classes=num_classes)
    y_test_cat = to_categorical(y_test, num_classes=num_classes)

    # +1 because index 0 is reserved for padding
    vocab_size = len(tokenizer.word_index) + 1

    return X_train_pad, X_test_pad, y_train_cat, y_test_cat, max_len, vocab_size

# CNN Model Architectures
# =======================

def build_basic_cnn(vocab_size, max_len, num_classes=3):
    """Build and compile a small two-stage Conv1D classifier.

    Args:
        vocab_size: Embedding input dimension (number of token ids).
        max_len: Length of the padded input sequences.
        num_classes: Width of the softmax output layer (default 3).

    Returns:
        A compiled ``Sequential`` model (Adam, lr=0.001, categorical
        cross-entropy, accuracy metric).
    """
    model = Sequential([
        # Explicit Input layer replaces Embedding's deprecated `input_length`
        # argument and keeps the model built before the first fit() call.
        Input(shape=(max_len,)),
        Embedding(input_dim=vocab_size, output_dim=32),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

def build_deeper_cnn(vocab_size, max_len, num_classes=3):
    """Build and compile a three-stage Conv1D classifier with regularization.

    The first two convolution stages use L2 kernel regularization and batch
    normalization; the last stage is reduced with global max pooling.

    Args:
        vocab_size: Embedding input dimension (number of token ids).
        max_len: Length of the padded input sequences.
        num_classes: Width of the softmax output layer (default 3).

    Returns:
        A compiled ``Sequential`` model (Adam, lr=0.0005, categorical
        cross-entropy, accuracy metric).
    """
    model = Sequential([
        # Explicit Input layer replaces Embedding's deprecated `input_length`
        # argument and keeps the model built before the first fit() call.
        Input(shape=(max_len,)),
        Embedding(vocab_size, 64),
        Conv1D(128, 5, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        MaxPooling1D(2),
        Conv1D(256, 3, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        MaxPooling1D(2),
        Conv1D(512, 3, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer=Adam(learning_rate=0.0005),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

def build_multi_filter_cnn(vocab_size, max_len, num_classes=3):
    """Build and compile a CNN with parallel convolutions of widths 2, 3 and 4.

    Each branch convolves the shared embedding with 64 filters and is reduced
    by global max pooling; the pooled branches are concatenated and classified.

    Args:
        vocab_size: Embedding input dimension (number of token ids).
        max_len: Length of the padded input sequences.
        num_classes: Width of the softmax output layer (default 3).

    Returns:
        A compiled functional ``Model`` (Adam, lr=0.001, categorical
        cross-entropy, accuracy metric).
    """
    input_layer = Input(shape=(max_len,))
    embedding = Embedding(vocab_size, 32)(input_layer)

    # Parallel convolution branches, one per kernel width, all reading the
    # same embedding output.
    pooled_branches = [
        GlobalMaxPooling1D()(Conv1D(64, kernel_width, activation='relu')(embedding))
        for kernel_width in (2, 3, 4)
    ]

    # Concatenate and classify
    merged = Concatenate()(pooled_branches)
    dense = Dense(64, activation='relu')(merged)
    dropout = Dropout(0.5)(dense)
    output = Dense(num_classes, activation='softmax')(dropout)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Model Evaluation
# ================

def evaluate_model(model, X_test, y_test, model_name):
    """Score ``model`` on the test set, print a report and plot the confusion matrix.

    Args:
        model: Object exposing ``predict``; its output is reduced with argmax
            over axis 1.
        X_test: Inputs passed to ``model.predict``.
        y_test: One-hot labels; reduced with argmax over axis 1.
        model_name: Label used in the printed header and the plot title.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision/recall/F1 are support-weighted averages).
    """
    # Collapse probabilities and one-hot rows to class indices.
    predicted = np.argmax(model.predict(X_test), axis=1)
    actual = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        actual, predicted, average='weighted')

    # Headline metrics, one per line, four decimals each.
    print(f"\n{model_name} Evaluation:")
    headline = (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-Score", f1),
    )
    for title, value in headline:
        print(f"{title}: {value:.4f}")

    # Per-class breakdown.
    print("\nClassification Report:")
    print(classification_report(actual, predicted))

    # Confusion matrix heatmap with fixed class names on both axes.
    class_names = ['Weak', 'Medium', 'Strong']
    plt.figure(figsize=(8, 6))
    matrix = confusion_matrix(actual, predicted)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

def plot_training_history(history, model_name):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        history: Object whose ``history`` dict holds the ``accuracy``,
            ``val_accuracy``, ``loss`` and ``val_loss`` series.
        model_name: Prefix for each subplot title.
    """
    curves = history.history
    plt.figure(figsize=(12, 4))

    # One panel per metric: left = accuracy, right = loss.
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
    for position, (key, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[key], label=f'Train {label}')
        plt.plot(curves[f'val_{key}'], label=f'Validation {label}')
        plt.title(f'{model_name} {label}')
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Main Execution
# ==============

def main(data_path='production.csv'):
    """Train, evaluate and compare the three CNN variants on the password data.

    Loads the CSV, shows the class distribution, prepares the data, fits each
    model with early stopping, plots its training curves, evaluates it on the
    held-out test split, and finally prints and plots a metric comparison.

    Args:
        data_path: Path of the CSV file with ``password`` and ``strength``
            columns (default ``'production.csv'``).
    """
    # Load data directly with pandas; this file defines no separate loader.
    df = pd.read_csv(data_path)

    # Check class distribution
    print("Class distribution:\n", df['strength'].value_counts())
    plt.figure(figsize=(8, 5))
    sns.countplot(x='strength', data=df)
    plt.title('Class Distribution')
    plt.show()

    # Prepare data
    X_train, X_test, y_train, y_test, max_len, vocab_size = prepare_data(df)

    # Define models to test
    models = {
        'Basic CNN': build_basic_cnn(vocab_size, max_len),
        'Deeper CNN': build_deeper_cnn(vocab_size, max_len),
        'Multi-Filter CNN': build_multi_filter_cnn(vocab_size, max_len)
    }

    results = {}

    # Train and evaluate each model
    for name, model in models.items():
        print(f"\nTraining {name}...")
        history = model.fit(
            X_train, y_train,
            # Validate on a slice of the training data, not the test set:
            # early stopping with restore_best_weights selects the epoch from
            # the validation loss, so using the test set here would bias the
            # metrics reported below. The rows were already shuffled by the
            # train/test split, so the held-out tail is a random sample.
            validation_split=0.1,
            epochs=15,
            batch_size=64,
            callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
            verbose=1
        )

        # Plot training history
        plot_training_history(history, name)

        # Evaluate model on the untouched test split
        metrics = evaluate_model(model, X_test, y_test, name)
        results[name] = metrics

    # Compare all models
    print("\nModel Comparison:")
    comparison_df = pd.DataFrame(results).T
    print(comparison_df)

    # Visual comparison
    comparison_df.plot(kind='bar', figsize=(12, 6))
    plt.title('Model Performance Comparison')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    # Keep the zoomed 0.7-1.0 view when every score fits in it, but lower the
    # floor when a score falls below 0.7 so its bar is not clipped away.
    lowest_score = float(comparison_df.min().min())
    plt.ylim(min(0.7, max(0.0, lowest_score - 0.05)), 1.0)
    plt.tight_layout()
    plt.show()

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()