In [3]:
pip install --user tensorflow

Collecting tensorflow
  Using cached https://files.pythonhosted.org/packages/55/d1/a3631a36859ee324e1767fa7554fdf7af17965571d8537b20b311b76bcfe/tensorflow-2.11.0-cp37-cp37m-win_amd64.whl
Collecting tensorflow-intel==2.11.0; platform_system == "Windows" (from tensorflow)
  Using cached https://files.pythonhosted.org/packages/f7/8c/18288ac12dc0e1997c73f1b86dbd6f7fa3674ae5341769387e1f13b07c9e/tensorflow_intel-2.11.0-cp37-cp37m-win_amd64.whl
Collecting protobuf<3.20,>=3.9.2 (from tensorflow-intel==2.11.0; platform_system == "Windows"->tensorflow)
  Using cached https://files.pythonhosted.org/packages/70/ee/e3562fd4e692afc6ed396b60ce3a177bc4ce6506ac8ac2413886198880e3/protobuf-3.19.6-cp37-cp37m-win_amd64.whl
Collecting opt-einsum>=2.3.2 (from tensorflow-intel==2.11.0; platform_system == "Windows"->tensorflow)
  Using cached https://files.pythonhosted.org/packages/bc/19/404708a7e54ad2798907210462fd950c3442ea51acc8790f3da48d2bee8b/opt_einsum-3.3.0-py3-none-any.whl
Collecting google-pasta>=0.1.



In [4]:
import tensorflow as tf
print(tf.__version__)

2.11.0


In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Load the dataset
data_file = "jwt.secrets.list"

# Read the file using plain Python file handling
with open(data_file, 'r', encoding='utf-8') as file:
    lines = [line.strip() for line in file.readlines()]

# Create a DataFrame from the list of lines
df = pd.DataFrame(lines, columns=["password"])

# Display the first few rows to verify
print("First few rows of the dataset:")
print(df.head())

# Sample a smaller portion of the dataset
sampled_df = df.sample(frac=0.1, random_state=42)  # Adjust the fraction as needed

# Tokenization
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sampled_df['password'])
X = tokenizer.texts_to_sequences(sampled_df['password'])

# Pad sequences to ensure uniform length
max_len = max([len(x) for x in X])
X = pad_sequences(X, maxlen=max_len)

# Step 3: Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, sampled_df.index, test_size=0.2, random_state=42)

# Step 4: Model Building - Random Forest Classifier
# Using character-level features directly

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Classifier Accuracy:", accuracy_rf)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

# Step 5: Model Building - LSTM Neural Network
# Using character-level features with an LSTM network

# Initialize LSTM model
lstm_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=max_len),
    SpatialDropout1D(0.4),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate LSTM model performance
loss, accuracy_lstm = lstm_model.evaluate(X_test, y_test)
print("\nLSTM Neural Network Accuracy:", accuracy_lstm)

# Summary of the model and data usage
print("\nSummary:")
print("1. Data: jwt.secrets.list - A dataset containing passwords.")
print("2. Models Used: Random Forest Classifier and LSTM Neural Network.")
print("3. Objective: Build models to predict password complexity based on character-level features.")
print("4. Metrics: Accuracy score and classification report used to evaluate model performance.")


First few rows of the dataset:
                    password
0                           
1                !)&@%&*$E^$
2           !@#!@#!@#!@@#!@#
3                   !@#$%^&*
4  !@#)(D4NG_QU0c_CU0NG$%^{}


MemoryError: could not allocate 272334848 bytes