<a href="https://colab.research.google.com/github/matteoturnu/ML_Project/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize as nrm
import random
from importlib.util import find_spec
import json
import os
import shutil
import requests
import sys
import torch
import torch.nn as nn # Contains Required functions and layers
import torch.nn.functional as F # For neural network functions:
import torch.optim as optim # Contains Optimization function available in PyTorch.
# Root directory under which the "dictionaries" and "dataset" sub-folders are created.
project_folder = "/content/password_strength_classifier"



Creation of useful directories and download of password dictionaries

In [None]:
import itertools

def download_file(url, dest_folder_name):
  """Download `url` into `dest_folder_name` and return the path of the saved file.

  The file is named after the last path segment of the URL. `dest_folder_name`
  is interpreted relative to the filesystem root (as it always was), so
  "content/x" and "/content/x/" both resolve to "/content/x/<file>".

  Raises:
    requests.HTTPError: if the server answers with an error status, so that an
      error page is never stored as if it were the requested file.
  """
  local_filename = url.split('/')[-1]
  # Strip the slashes already present so the result has no doubled separators.
  path = os.path.join("/", dest_folder_name.strip("/"), local_filename)
  partial_path = path + ".part"

  # Stream the body in chunks instead of loading the whole file in memory.
  with requests.get(url, stream=True) as response:
    response.raise_for_status()
    with open(partial_path, 'wb') as f:
      for chunk in response.iter_content(chunk_size=1 << 20):
        f.write(chunk)

  # Publish the file only once it is complete, so an interrupted download
  # never leaves a truncated file under the final name.
  os.replace(partial_path, path)
  return path


def read_file(filepath):
  """Return the set of distinct lines of a UTF-8 text file, without their line terminator.

  Bytes that are not valid UTF-8 are replaced rather than raising an error.
  """
  unique_lines = set()
  with open(filepath, encoding='utf-8', errors='replace') as handle:
    for raw_line in handle:
      # Keep only what precedes the newline (the whole line if there is none).
      content, _, _ = raw_line.partition('\n')
      unique_lines.add(content)
  return unique_lines



# Folder layout. These names are defined unconditionally so that this cell and
# the following ones keep working when the notebook is re-run and the project
# folder already exists.
dict_dir = project_folder + "/dictionaries/"
dataset_dir = project_folder + "/dataset/"

for directory in (project_folder, dataset_dir, dict_dir):
  os.makedirs(directory, exist_ok=True)


def _fetch_dictionary(url, dest_folder):
  """Return the local path of the dictionary at `url`, downloading it only when it is missing."""
  local_path = os.path.join(dest_folder, url.split('/')[-1])
  if os.path.exists(local_path):
    return local_path
  return download_file(url, dest_folder)


f_rockyou = _fetch_dictionary(
    "https://github.com/brannondorsey/naive-hashcat/releases/download/data/rockyou.txt",
    dict_dir)

f_jtr = _fetch_dictionary(
    "https://raw.githubusercontent.com/danielmiessler/SecLists/master/Passwords/Software/john-the-ripper.txt",
    dict_dir)


# Sets of known passwords, used later for fast membership tests.
data_rockyou = read_file(f_rockyou)
data_jtr = read_file(f_jtr)


In [None]:
# download the dataset

if find_spec("kaggle") is None:
  ! pip install -q kaggle

if os.path.isdir("/root/.kaggle") is False:
  ! mkdir ~/.kaggle
  ! touch "/root/.kaggle/kaggle.json"

  token = {"username":"matteoturnu","key":"79ea644685a3e574038b40e4019b0927"}
  with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
  !chmod 600 /root/.kaggle/kaggle.json

  ! kaggle datasets download -d bhavikbb/password-strength-classifier-dataset -p $dataset_dir

Downloading password-strength-classifier-dataset.zip to /content/password_strength_classifier/dataset
  0% 0.00/5.01M [00:00<?, ?B/s]
100% 5.01M/5.01M [00:00<00:00, 118MB/s]


In [None]:
# Load the password dataset directly from the downloaded archive.

file_path = os.path.join(dataset_dir, "password-strength-classifier-dataset.zip")

pswd_df = pd.read_csv(
    file_path,
    on_bad_lines='skip',  # malformed rows are skipped instead of aborting the load
)
print(pswd_df)

            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669640 rows x 2 columns]


In [None]:

# Drop the rows that contain missing values (the frame is modified in place).
pswd_df.dropna(inplace=True)
print(pswd_df)
psw_array = np.array(pswd_df)
print(psw_array)

# Split every (password, strength) row into the X (passwords) and y (labels) arrays.
passwords = np.array([password for password, _ in psw_array])
labels = np.array([strength for _, strength in psw_array])

n_samples = len(passwords)
print("Len of UNIQUE passwords: ", len(np.unique(passwords)))
print("Number of samples: ", n_samples)

# Number of columns of the feature matrix built further down.
n_features = 8


            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669639 rows x 2 columns]
[['kzde5577' 1]
 ['kino3434' 1]
 ['visi7k1yr' 1]
 ...
 ['184520socram' 1]
 ['marken22a' 1]
 ['fxx4pw4g' 1]]
Len of UNIQUE passwords:  669639
Number of samples:  669639


In [None]:
def normalize(value, min, max):
  """Min-max scale `value` so that `min` maps to 0 and `max` maps to 1.

  Args:
    value: scalar or numpy array to rescale.
    min: lower bound of the original range.
    max: upper bound of the original range.

  Returns:
    (value - min) / (max - min), with the same shape as `value`. When the range
    is degenerate (max == min) the result is 0 instead of a division by zero.
  """
  span = max - min
  # The guard only applies to scalar bounds; array bounds are divided element-wise as before.
  if np.ndim(span) == 0 and span == 0:
    return (value - min) * 0.0
  return (value - min) / span

# LENGTH feature: number of characters of every password, min-max scaled.
lengths = np.array(list(map(len, passwords)))
max_len = lengths.max()
min_len = lengths.min()
print("Lengths shape: ", lengths.shape)
print("Lenghts:", lengths)
print(f"Max: {max_len} --> {passwords[lengths == max_len]} \nMin: {min_len} --> {passwords[lengths == min_len]}")

feat_length = normalize(lengths, min_len, max_len)
print(feat_length)



# ROCKYOU feature
# 0 when the password is listed in the rockyou dictionary, 1 otherwise.
feat_rockyou = np.array([0 if p in data_rockyou else 1 for p in passwords])
print("Rockyou feature: ", feat_rockyou)


# JTR feature
# Same encoding, against the john-the-ripper dictionary.
feat_jtr = np.array([0 if p in data_jtr else 1 for p in passwords])
print("JTR feature: ", feat_jtr)


Lengths shape:  (669639,)
Lenghts: [ 8  8  9 ... 12  9  8]
Max: 220 --> ['In0LnUoff8wfayJGqzelyDqg4AMl9gBhgl3T2iZeONzh5gPqTyP8IVLsQ960aZwlZcdSjE1XCi8taVT5dWSB3wNJwMqpzmlSIKh21A8TNxpSJ5nu2hULRgjHZF6fubMkwhjPNRryi0BOyas9zlp6JUsNN0RQ4KRma8satN1JwEOAxlhMgJ7OwgRBbwuqCCiwhdylowbq0xpBsXZbhexgZnq4yOUb'] 
Min: 1 --> ['M' '9' '1']
[0.03196347 0.03196347 0.03652968 ... 0.05022831 0.03652968 0.03196347]
Rockyou feature:  [1 1 1 ... 1 1 1]
JTR feature:  [1 1 1 ... 1 1 1]


Calculate character weights

In [None]:

# Create a dictionary containing occurrencies of all the characters in the dataset
def calculate_occurrencies(passwords):
  """Count how many times each character appears across all the passwords.

  Returns:
    A tuple (counts, total): `counts` maps every character to its number of
    appearances, `total` is the overall number of characters seen.
  """
  counts = {}
  for pwd in passwords:
    for ch in pwd:
      counts[ch] = counts.get(ch, 0) + 1
  # Every character seen contributed exactly one unit to some counter.
  total = sum(counts.values())
  return counts, total

# Converts occurrencies to weights
def calculate_weights(occurrencies_dict, tot_occurrencies):
  """Turn occurrence counts into weights: the rarer a character, the closer its weight is to 1.

  Each weight is 1 minus the relative frequency of the character.
  """
  return {
      character: 1 - (count / tot_occurrencies)
      for character, count in occurrencies_dict.items()
  }

def normalize_weights_dict(weights_dict):
  """Min-max scale the weights so that the smallest becomes 0 and the largest becomes 1."""
  upper = max(weights_dict.values())
  lower = min(weights_dict.values())
  return {
      character: normalize(weight, lower, upper)
      for character, weight in weights_dict.items()
  }


# Character statistics over the whole dataset: raw counts -> rarity weights -> weights rescaled with normalize().
occurrencies_dict, tot_occurrencies = calculate_occurrencies(passwords)
weights_dict = calculate_weights(occurrencies_dict, tot_occurrencies)
# normalized_weights_dict is read as a global by calculate_password_scores().
normalized_weights_dict = normalize_weights_dict(weights_dict)
print(normalized_weights_dict)



{'k': 0.6657636716788613, 'z': 0.7784853687869094, 'd': 0.680859037257116, 'e': 0.2982357306788878, '5': 0.5838229999191711, '7': 0.6315560007935909, 'i': 0.37886109961275827, 'n': 0.4990337300469549, 'o': 0.3892096181212973, '3': 0.47419728562645974, '4': 0.5838548415145818, 'v': 0.8009630857935044, 's': 0.5261848135184723, '1': 0.055509698215155584, 'y': 0.6797029424083516, 'r': 0.5116895395460378, 'm': 0.6072412686671358, 'g': 0.6905658251504523, '2': 0.312214191064269, 'l': 0.5871467726093695, 'a': 0.0, 'b': 0.7437045491842426, 'h': 0.7130508902175275, 'A': 0.8941389420262521, 'V': 0.9695202451312972, 'Y': 0.9480688072383304, 'q': 0.8649475470949451, 'D': 0.9276583445799487, 'E': 0.9401794396369086, 'M': 0.9007007600343888, 'Z': 0.9692263227121206, 'f': 0.8096288984522538, 'N': 0.9141256665303191, 't': 0.6096514325043904, 'u': 0.6063570520561109, '6': 0.6317788919614675, 'c': 0.6825099015114968, '8': 0.6026805724628994, 'w': 0.761756284428726, '9': 0.5184987422569818, '0': 0.387007

Password structure feature

In [None]:
# Column index of each character class inside a structure vector.
numbers = 0
lower_case = 1
upper_case = 2
special_char = 3

def calculate_password_structure(passwords):
  """Compute, for every password, the fraction of digits, lowercase, uppercase and other characters.

  Returns:
    An array with one row per password; the four columns follow the order of
    the class indices defined above and sum to 1.
  """
  def character_class(ch):
    # The order of the checks matters: it decides the class of ambiguous characters.
    if ch.isnumeric():
      return numbers
    if ch.islower():
      return lower_case
    if ch.isupper():
      return upper_case
    return special_char

  rows = []
  for pwd in passwords:
    tally = np.zeros(4, dtype=int)
    for ch in pwd:
      tally[character_class(ch)] += 1
    rows.append(tally / len(pwd))
  return np.array(rows)

feat_structure = calculate_password_structure(passwords)
# Preview at most 20 samples: the bound keeps the loop from raising IndexError on smaller datasets.
for i in range(min(20, len(passwords))):
  print(passwords[i], feat_structure[i])

kzde5577 [0.5 0.5 0.  0. ]
kino3434 [0.5 0.5 0.  0. ]
visi7k1yr [0.22222222 0.77777778 0.         0.        ]
megzy123 [0.375 0.625 0.    0.   ]
lamborghin1 [0.09090909 0.90909091 0.         0.        ]
AVYq1lDE4MgAZfNt [0.125  0.3125 0.5625 0.    ]
u6c8vhow [0.25 0.75 0.   0.  ]
v1118714 [0.875 0.125 0.    0.   ]
universe2908 [0.33333333 0.66666667 0.         0.        ]
as326159 [0.75 0.25 0.   0.  ]
asv5o9yu [0.25 0.75 0.   0.  ]
612035180tok [0.75 0.25 0.   0.  ]
jytifok873 [0.3 0.7 0.  0. ]
WUt9IZzE0OQ7PkNE [0.1875 0.1875 0.625  0.    ]
jerusalem393 [0.25 0.75 0.   0.  ]
g067057895 [0.9 0.1 0.  0. ]
52558000aaa [0.72727273 0.27272727 0.         0.        ]
idofo673 [0.375 0.625 0.    0.   ]
6975038lp [0.77777778 0.22222222 0.         0.        ]
sbl571017 [0.66666667 0.33333333 0.         0.        ]


Password score feature based on character appearance frequency

In [None]:
# Calculate passwords score with character weights
def calculate_password_scores(passwords, weights=None):
  """Score each password as the mean weight of its characters.

  Args:
    passwords: iterable of non-empty strings.
    weights: mapping character -> weight. Defaults to the notebook-level
      `normalized_weights_dict`.

  Returns:
    Array with one score per password. A character missing from `weights` is
    considered very rare and contributes the maximum weight, 1.

  Raises:
    NameError: if `weights` is omitted and `normalized_weights_dict` has not
      been computed yet (previously this was silently swallowed and every
      character scored 1).
    ZeroDivisionError: if a password is empty.
  """
  if weights is None:
    weights = normalized_weights_dict

  scores = []
  for password in passwords:
    # Only a missing key falls back to 1; any other error is a real problem and must surface.
    score = sum(weights.get(character, 1) for character in password)
    scores.append(score / len(password))

  return np.array(scores)


feat_scores = calculate_password_scores(passwords)
# Preview at most 20 samples: the bound keeps the loop from raising IndexError on smaller datasets.
for i in range(min(20, len(passwords))):
  print(passwords[i], feat_scores[i])

kzde5577 0.6067627262284123
kino3434 0.5061215467177443
visi7k1yr 0.5143435501310545
megzy123 0.48701903882470265
lamborghin1 0.47054663557917564
AVYq1lDE4MgAZfNt 0.7849414053410604
u6c8vhow 0.6485382870691287
v1118714 0.35513666167814983
universe2908 0.47833016142056034
as326159 0.38777582782024717
asv5o9yu 0.5130924067592362
612035180tok 0.42952944653592046
jytifok873 0.6007950601438756
WUt9IZzE0OQ7PkNE 0.8146300190314772
jerusalem393 0.47239027508689135
g067057895 0.5648297331919243
52558000aaa 0.3479442464797782
idofo673 0.5481625562432801
6975038lp 0.5715955551310014
sbl571017 0.4668886869533008


In [None]:
from sklearn.model_selection import train_test_split

# Create the feature vector and split dataset

def create_feat_dict():
  """Collect the feature arrays computed above into a name -> values dictionary.

  The insertion order of the keys is the column order later used for the feature vector.
  """
  structure_names = ("numbers", "lowercase", "uppercase", "special")

  feat_dict = {
      'length': feat_length,
      'rockyou': feat_rockyou,
      'john_the_ripper': feat_jtr,
  }
  # One entry per column of "feat_structure".
  feat_dict.update(
      (name, feat_structure[:, column]) for column, name in enumerate(structure_names))
  feat_dict['scores'] = feat_scores

  return feat_dict


def create_feature_vector(features_list, n_feat):
  """Place single-column features side by side in a (samples, n_feat) matrix.

  Args:
    features_list: iterable of array-likes, one per feature, each holding
      exactly one value per sample.
    n_feat: number of columns of the result. Columns that receive no feature
      are left at zero.

  Returns:
    Float array of shape (samples, n_feat); column i holds the i-th feature.

  Raises:
    ValueError: if more than `n_feat` features are given (they used to be
      dropped silently) or a feature does not hold one value per sample.
  """
  features = [np.asarray(feature) for feature in features_list]
  if len(features) > n_feat:
    raise ValueError(
        f"{len(features)} features were given but the feature vector only has {n_feat} columns")

  # The number of rows comes from the data itself; the notebook-level
  # n_samples is only needed when there is no feature to measure.
  n_rows = len(features[0]) if features else n_samples
  feature_vector = np.zeros(shape=(n_rows, n_feat))

  for i, feature in enumerate(features):
    # reshape() raises ValueError when the feature does not have n_rows values.
    feature_vector[:, i] = feature.reshape(n_rows)

  return feature_vector



# Gather the features by name; the key order is the column order of the feature vector.
feat_dict = create_feat_dict()

for feature in feat_dict:
  print(f"Feature {feature}: {feat_dict[feature]}")


# One row per password, one column per feature.
feature_vector = create_feature_vector(feat_dict.values(), n_features)
print("\n")
print(feature_vector.shape)
print(feature_vector)

# 80% of the dataset is used for training, 20% for testing
# (random_state is fixed so that the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(feature_vector, labels, test_size=0.20, random_state=42)
print(X_train.shape)




Feature length: [0.03196347 0.03196347 0.03652968 ... 0.05022831 0.03652968 0.03196347]
Feature rockyou: [1 1 1 ... 1 1 1]
Feature john_the_ripper: [1 1 1 ... 1 1 1]
Feature numbers: [0.5        0.5        0.22222222 ... 0.5        0.22222222 0.25      ]
Feature lowercase: [0.5        0.5        0.77777778 ... 0.5        0.77777778 0.75      ]
Feature uppercase: [0. 0. 0. ... 0. 0. 0.]
Feature special: [0. 0. 0. ... 0. 0. 0.]
Feature scores: [0.60676273 0.50612155 0.51434355 ... 0.43682709 0.35626581 0.72711691]


(669639, 8)
[[0.03196347 1.         1.         ... 0.         0.         0.60676273]
 [0.03196347 1.         1.         ... 0.         0.         0.50612155]
 [0.03652968 1.         1.         ... 0.         0.         0.51434355]
 ...
 [0.05022831 1.         1.         ... 0.         0.         0.43682709]
 [0.03652968 1.         1.         ... 0.         0.         0.35626581]
 [0.03196347 1.         1.         ... 0.         0.         0.72711691]]
(535711, 8)


Hyper-parameters search through 5-fold cross validation and training of LinearSVC and LogisticRegression classifiers

*LinearSVC*

In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Iteration budgets shared by every LinearSVC / LogisticRegression built in the cells of this section.
svc_max_iter = 10000
lr_max_iter = 1000

# Baseline LinearSVC: default hyper-parameters, no cross validation.
clf = LinearSVC(dual=False, max_iter=svc_max_iter) # default max_iter = 1000



clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

print(f"Accuracy: {accuracy * 100} %")


Accuracy: 95.178006092826 %


In [None]:
# Hyperparameters search

# The experimental import below is required to make HalvingRandomSearchCV available.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

param_distributions = {'C': [0.1, 1, 10, 100, 1000],
                       'penalty': ['l1', 'l2']}

# random_state makes the search reproducible: it fixes both the candidates that
# are sampled and the subsets of data used by the successive halving iterations.
svc_cv = HalvingRandomSearchCV(LinearSVC(dual=False, max_iter=svc_max_iter),
                               param_distributions, n_jobs=-1, random_state=42)
svc_cv.fit(X_train, y_train)
print(f"\nAccuracy (CV): {svc_cv.best_score_ * 100} %")
print("Best parameters: ", svc_cv.best_params_)


# Train a new SVC with best hyperparameters

svc = LinearSVC(**svc_cv.best_params_, dual=False, max_iter=svc_max_iter)

svc.fit(X_train, y_train)
accuracy = svc.score(X_test, y_test)

print(f"Accuracy with best parameters: {accuracy * 100} %")






Accuracy (CV): 96.98113207547169 %
Best parameters:  {'penalty': 'l1', 'C': 1000}


*LogisticRegression*

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline LogisticRegression with default hyper-parameters (rebinds `clf`).
clf = LogisticRegression(max_iter=lr_max_iter)  # default max_iter = 100

# Training without cross validation

clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

print(f"Accuracy: {accuracy * 100} %")



In [None]:
# Using HalvingRandomized on LogisticRegression

param_distributions = {'C': [0.1, 1, 10, 100, 1000]}

# random_state makes the sampled candidates and the halving subsets reproducible.
lr_cv = HalvingRandomSearchCV(LogisticRegression(max_iter=lr_max_iter),
                              param_distributions,
                              scoring='accuracy', n_jobs=-1, verbose=3,
                              random_state=42)
lr_cv.fit(X_train, y_train)
print(f"\nAccuracy (CV): {lr_cv.best_score_ * 100} %")
print("Best parameters: ", lr_cv.best_params_)


# Train a new LogisticRegression with best hyperparameters
lr = LogisticRegression(C=lr_cv.best_params_['C'], max_iter=lr_max_iter)

lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# Only the tuned model `lr` is evaluated here; the baseline `clf` is not refitted.
accuracy = lr.score(X_test, y_test)

print(f"Accuracy with best parameters: {accuracy * 100} %")



KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()

# Training without cross validation
clf.fit(X_train, y_train)

# Prediction is the expensive step for KNN, so the test set is classified only
# once and the accuracy (fraction of correct predictions, i.e. what
# clf.score(X_test, y_test) would return) is derived from those predictions.
predictions = clf.predict(X_test)
accuracy = float(np.mean(predictions == y_test))


print(f"Accuracy: {accuracy * 100} %")

Neural Network: Dataset Creation

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  """Map-style dataset pairing each row of `features` with the label at the same index."""

  def __init__(self, features, labels):
    """Keep references to the indexable `features` and `labels` containers."""
    self.features, self.labels = features, labels

  def __len__(self):
    """Number of samples, given by the number of labels."""
    n_labels = len(self.labels)
    return n_labels

  def __getitem__(self, idx):
    """Return the (features, label) pair stored at position `idx`."""
    sample = self.features[idx]
    target = self.labels[idx]
    return sample, target


# Wrap the train / test splits so that they can be fed to a DataLoader.
trainset = CustomDataset(X_train, y_train)
testset = CustomDataset(X_test, y_test)

Neural Network: Trainloader creation

In [None]:
def create_DataLoader(trainset, testset, batch_size=64):
  """Build the mini-batch loaders for the training and the test datasets.

  Args:
    trainset: dataset used for training; it is reshuffled at every epoch.
    testset: dataset used for evaluation; its order is preserved.
    batch_size: number of samples per mini-batch (64 by default, as before).

  Returns:
    The tuple (trainloader, testloader).
  """
  # trainloader is what holds the data loader object which takes care of shuffling the data and constructing the batches
  trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
  # No need to shuffle test data.
  testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)
  return trainloader, testloader

# Mini-batch loaders over the full-feature train / test datasets.
trainloader, testloader = create_DataLoader(trainset, testset)

Neural Network: initialization

In [None]:
class NeuralNetwork(nn.Module):
  """Small fully connected classifier: three linear layers, ReLU activations, log-softmax output.

  All the parameters are stored in double precision, so the inputs are expected
  to be float64 tensors.
  """

  def __init__(self, n_input, n_output):
    """Create the layers and initialize their weights.

    Args:
      n_input: number of input features.
      n_output: number of classes.
    """
    super().__init__()
    self.n_input = n_input
    self.n_output = n_output
    # Width of the hidden layer: halfway between input and output sizes.
    self.mid_n = int((n_input + n_output) / 2)

    # Define Layers:
    self.l1 = nn.Linear(self.n_input, self.mid_n) # layer 1
    self.l2 = nn.Linear(self.mid_n, self.n_output) # layer 2
    self.l3 = nn.Linear(self.n_output, self.n_output) # layer 3
    # Convert every parameter created so far to float64.
    self.double()

    # Define Activation functions:
    self.relu = nn.ReLU()
    self.softmax = nn.LogSoftmax(dim = 1)

    # Weights initialization
    nn.init.kaiming_normal_(self.l1.weight, mode='fan_in', nonlinearity='relu') # Using HE because more optimized for ReLU activated layers
    nn.init.zeros_(self.l1.bias)
    nn.init.kaiming_normal_(self.l2.weight, mode='fan_in', nonlinearity='relu')
    nn.init.zeros_(self.l2.bias)
    nn.init.normal_(self.l3.weight) # Using normal distribution for LogSoftMax activated layer
    nn.init.normal_(self.l3.bias)


  def forward(self, x):
    '''
    Layers: 3
    Activation Functions:
    RELU for first two layers
    Log Softmax for last layer

    Returns the log-probabilities of each class, one row per input sample.
    '''
    x = self.l1(x)
    x = self.relu(x)
    x = self.l2(x)
    x = self.relu(x)
    x = self.l3(x)
    x = self.softmax(x)
    return x


  def train_model(self, tr_loader, n_epochs, criterion, optimizer):
    """Train the network for `n_epochs` passes over `tr_loader`.

    Args:
      tr_loader: iterable of (features, labels) mini-batches.
      n_epochs: number of passes over the loader.
      criterion: loss applied to (log-probabilities, integer labels).
      optimizer: optimizer built on this network's parameters.

    Returns:
      Dictionary {epoch index: loss of the LAST mini-batch of that epoch}.
    """
    # losses --> {[*idx_first_epoch*]: loss_1, [*idx_second_epoch*]: loss_2, ...}
    losses = {}
    for e in range(n_epochs):
      for features, labels in tr_loader:
        optimizer.zero_grad() # set optimizer gradients to zero:
        output = self(features) # Initial output (method "forward()" automatically called)
        loss = criterion(output, labels.long()) # Loss Calculation
        loss.backward() # Pass loss function gradients to previous layers:
        optimizer.step() # Update Weights
        # Overwritten at every batch: only the last batch of the epoch survives.
        losses[e] = loss.item()

    return losses


  def test_model(self, ts_loader):
    """Return the accuracy (fraction of correct predictions) over `ts_loader`.

    The value is returned as a 0-dimensional tensor.
    """
    correct = 0
    total = 0
    # Evaluation needs no gradients: disabling autograd avoids building a graph for every batch.
    with torch.no_grad():
      for features, labels in ts_loader:
        outputs = self(features)
        # The predicted class is the one with the highest log-probability.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    return correct / total



# Define macros.
# Feature-vector columns, in order: password length, rockyou, john_the_ripper,
# numbers/len, lowercases/len, uppercases/len, special characters/len, password score
number_of_features = 8
number_of_outputs = 3 # Unsecure, intermediate, secure

# Initialize NN
NN = NeuralNetwork(number_of_features, number_of_outputs)
# NLLLoss expects log-probabilities, which is what the network's LogSoftmax output provides.
criterion = nn.NLLLoss()
optimizer = optim.Adam(NN.parameters(), lr=0.001)

Neural Network: training and testing

In [None]:
# Train the model
# n_epochs is also read as a global by weigh_features_nn() further down.
n_epochs = 5
# losses maps each epoch index to the loss of the last mini-batch of that epoch.
losses = NN.train_model(trainloader, n_epochs, criterion, optimizer)
print(losses)

# Test the model
accuracy = NN.test_model(testloader)
print(f"Accuracy of the model on the test samples: {accuracy * 100} %")

Evaluating the weight of each feature on the classifier's decision

In [None]:
def weigh_features(model, X_train, X_test, y_train, y_test):
  """Measure how well `model` performs when it only sees one feature at a time.

  For every column of the feature matrices, `model` is refitted on that single
  column and scored on the same column of the test set.

  Returns:
    List of scores, one per feature, in column order.
  """
  def single_feature_score(column):
    # Estimators expect 2-D inputs, hence the reshape to a one-column matrix.
    model.fit(X_train[:, column].reshape(-1, 1), y_train)
    return model.score(X_test[:, column].reshape(-1, 1), y_test)

  return [single_feature_score(column) for column in range(X_train.shape[1])]


def weigh_features_nn(X_train, X_test, y_train, y_test):
  """Neural-network counterpart of weigh_features().

  For every feature a fresh network with one input and three outputs is trained
  (for the notebook-level `n_epochs`) on that single column and evaluated on the
  same column of the test set.

  Returns:
    List of accuracies, one per feature, in column order.
  """
  n_input = 1
  n_output = 3

  def single_feature_accuracy(column):
    column_trainset = CustomDataset(X_train[:, column].reshape(-1, 1), y_train)
    column_testset = CustomDataset(X_test[:, column].reshape(-1, 1), y_test)
    tr_loader, ts_loader = create_DataLoader(column_trainset, column_testset)

    # A new, independently initialized network for every feature.
    net = NeuralNetwork(n_input, n_output)
    net.train_model(tr_loader, n_epochs, nn.NLLLoss(), optim.Adam(net.parameters(), lr=0.001))
    return net.test_model(ts_loader)

  return [single_feature_accuracy(column) for column in range(X_train.shape[1])]

In [None]:

def _print_feature_accuracies(model_name, accuracies):
  """Print the accuracy reached by `model_name` on each single feature, in feature_dict order."""
  print(f"\n{model_name} - Accuracies on single features: \n")
  for feat, accuracy in zip(feat_dict, accuracies):
    print(f"Feature '{feat}' weight: {accuracy}")


# NOTE: `svc` and `lr` are rebound here, so the tuned models created by the
# hyper-parameter search cells are no longer reachable through these names.
svc = LinearSVC(max_iter=1000, dual=False)
svc_feat_w = weigh_features(svc, X_train, X_test, y_train, y_test)
_print_feature_accuracies("LinearSVC", svc_feat_w)

lr = LogisticRegression(max_iter=1000)
lr_feat_w = weigh_features(lr, X_train, X_test, y_train, y_test)
_print_feature_accuracies("LogisticRegression", lr_feat_w)

knn = KNeighborsClassifier()
knn_feat_w = weigh_features(knn, X_train, X_test, y_train, y_test)
_print_feature_accuracies("KNN", knn_feat_w)

nn_feat_w = weigh_features_nn(X_train, X_test, y_train, y_test)
_print_feature_accuracies("ANN", nn_feat_w)