<a href="https://colab.research.google.com/github/matteoturnu/ML_Project/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import random
from importlib.util import find_spec
import json
import os
import shutil
import requests
import sys
import torch
import torch.nn as nn # Contains Required functions and layers
import torch.nn.functional as F # For neural network functions:
import torch.optim as optim # Contains Optimization function available in PyTorch.
project_folder = "/content/password_strength_classifier"

Creation of useful directories and download of password dictionaries

In [26]:
def download_file(url, dest_folder_name):
  local_filename = url.split('/')[-1]
  path = os.path.join("/{}/{}".format(dest_folder_name, local_filename))
  """with requests.get(url, stream=True) as r:
      with open(path, 'wb') as f:
          shutil.copyfileobj(r.raw, f)"""

  with open(path, 'wb') as f:
    f.write(requests.get(url, stream=True).content)

  # return local_filename
  return path

def read_file(filepath):
  with open(filepath, errors='replace', encoding='utf-8') as f:
    data = {line.split('\n')[0] for line in f.readlines()}
  return data


if os.path.exists(project_folder) is False:
  dict_dir = project_folder + "/dictionaries/"
  dataset_dir = project_folder + "/dataset/"

  os.mkdir(project_folder)
  os.mkdir(dataset_dir)

  os.mkdir(dict_dir)

  f_rockyou = download_file(
    "https://github.com/brannondorsey/naive-hashcat/releases/download/data/rockyou.txt",
    dict_dir)

  f_jtr = download_file(
      "https://raw.githubusercontent.com/danielmiessler/SecLists/master/Passwords/Software/john-the-ripper.txt",
      dict_dir)


data_rockyou = read_file(f_rockyou)
data_jtr = read_file(f_jtr)

print("\nRock You")
import itertools
print([val for i, val in enumerate(itertools.islice(data_rockyou, 5))])
print("\nJohn The Ripper")
print([val for i, val in enumerate(itertools.islice(data_jtr, 5))])



Rock You
['', '07881951589', 'fufyfufy', 'eligla', 'mizsammie']

John The Ripper
['popeye', 'overkill', 'Abcdefg', 'timothy', 'gregory']


In [27]:
# download the dataset

if find_spec("kaggle") is None:
  ! pip install -q kaggle

if os.path.isdir("/root/.kaggle") is False:
  ! mkdir ~/.kaggle
  ! touch "/root/.kaggle/kaggle.json"

  token = {"username":"matteoturnu","key":"79ea644685a3e574038b40e4019b0927"}
  with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
  !chmod 600 /root/.kaggle/kaggle.json

  ! kaggle datasets download -d bhavikbb/password-strength-classifier-dataset -p $dataset_dir

In [28]:
# read file

file_path = os.path.join(dataset_dir, "password-strength-classifier-dataset.zip")

pswd_df = pd.read_csv(file_path, on_bad_lines='skip')
print(pswd_df)

            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669640 rows x 2 columns]


In [29]:
def word_split(inputs):
    character=[]
    for i in inputs:
        character.append(i)
    return character

# unique values of strength feature
pswd_df['strength'].unique()

# number of missing values in dataset
pswd_df.isnull().sum()

# remove missing values
pswd_df.dropna(inplace=True)
pswd_df.isnull().sum()

print(pswd_df)

psw_array = np.array(pswd_df)
print(psw_array)

# ??? PROBLEM: it is shown that if shuffled then
# there are several duplicates of a password
#random.shuffle(psw_array)

labels = np.array([p[1] for p in psw_array])
passwords = np.array([p[0] for p in psw_array])

print("Len of passwords: ", len(passwords))
print("Len of UNIQUE passwords: ", len(np.unique(passwords)))


"""vectorizer = TfidfVectorizer(tokenizer=word_split)
X = vectorizer.fit_transform(passwords)

print(vectorizer.get_feature_names_out())"""


            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669639 rows x 2 columns]
[['kzde5577' 1]
 ['kino3434' 1]
 ['visi7k1yr' 1]
 ...
 ['184520socram' 1]
 ['marken22a' 1]
 ['fxx4pw4g' 1]]
Len of passwords:  669639
Len of UNIQUE passwords:  669639


'vectorizer = TfidfVectorizer(tokenizer=word_split)\nX = vectorizer.fit_transform(passwords)\n\nprint(vectorizer.get_feature_names_out())'

Manuel - Calculate characters weights

In [37]:
# Create a dictionary containing occurrencies of all the characters in the dataset
def calculate_occurrencies(passwords):
  tot_occurrencies = 0
  occurrencies_dict = dict()
  for password in passwords:
    for character in password:
      tot_occurrencies += 1
      if character in occurrencies_dict:
        occurrencies_dict[character] += 1
      else:
        occurrencies_dict[character] = 1
  return occurrencies_dict, tot_occurrencies

# Converts occurrencies to weights
def calculate_weights(occurrencies_dict, tot_occurrencies):
  for key in occurrencies_dict:
    weights_dict = dict()
    weights_dict[key] = 1 - (occurrencies_dict[key] / tot_occurrencies)
  return occurrencies_dict

occurrencies_dict, tot_occurrencies = calculate_occurrencies(passwords)
weights_dict = calculate_weights(occurrencies_dict, tot_occurrencies)
print(weights_dict)



{'k': 136460, 'z': 90439, 'd': 130297, 'e': 286511, '5': 169914, '7': 150426, 'i': 253594, 'n': 204531, 'o': 249369, '3': 214671, '4': 169901, 'v': 81262, 's': 193446, '1': 385609, 'y': 130769, 'r': 199364, 'm': 160353, 'g': 126334, '2': 280804, 'l': 168557, 'a': 408272, 'b': 104639, 'h': 117154, 'A': 43221, 'V': 12445, 'Y': 21203, 'q': 55139, 'D': 29536, 'E': 24424, 'M': 40542, 'Z': 12565, 'f': 77724, 'N': 35061, 't': 159369, 'u': 160714, '6': 150335, 'c': 129623, '8': 162215, 'w': 97269, '9': 196584, '0': 250268, 'j': 95251, 'W': 12621, 'U': 21104, 'I': 23029, 'O': 28219, 'Q': 32503, 'P': 13419, 'p': 111185, '@': 5448, '-': 3146, 'H': 13325, 'x': 69488, '.': 6178, 'T': 29780, '>': 94, 'G': 13779, 'J': 13414, '&': 664, '?': 501, '<': 118, '!': 2182, 'S': 15705, 'R': 15063, 'F': 12952, 'B': 13443, 'K': 13569, 'X': 12424, 'C': 13823, 'L': 14507, ';': 363, '_': 2927, '%': 409, '±': 133, '"': 9, '~': 93, '+': 692, '^': 338, '/': 834, '$': 1140, ')': 365, ' ': 1110, '(': 315, '#': 1208, 'Ú

In [None]:
# LENGTH feature
# lengths = np.array([len(p) for p in passwords]).reshape(-1, 1)
lengths = np.array([len(p) for p in passwords])
max_len = np.max(lengths)
min_len = np.min(lengths)

print("Lengths shape: ", lengths.shape)

print(lengths)
print(f"Max: {max_len} --> {passwords[lengths == max_len]} \nMin: {min_len} --> {passwords[lengths == min_len]}")

# normalize() from sklearn accept 2D arrays only --> reshape so that we have 1 row (1 "sample")
# and make sklearn compute the remaining number of columns for us
feat_length = normalize(lengths.reshape(1, -1))

#feat_length = feat_length.flatten()
print(feat_length)


# ROCKYOU feature

# numpy array of 1 and 0 (0 if found, otherwise is 1)
# int() used to convert boolean into number
feat_rockyou = np.array([int(p not in data_rockyou) for p in passwords])
print(feat_rockyou)

# passwords found in rockyou file
print(passwords[feat_rockyou == 0])


# JTR feature
feat_jtr = np.array([int(p not in data_jtr) for p in passwords])
print(feat_jtr)
print(passwords[feat_jtr == 0])




Manuel - Password structure feature

In [None]:
# Define macros
numbers = 0
lower_case = 1
upper_case = 2
special_char = 3

# Counts numbers, lowercases, uppercases and other characters in a password
def calculate_password_structure(passwords):
  passwords_structure = []
  for password in passwords:
    counts = np.array([0, 0, 0, 0])
    for character in password:
      if character.isnumeric():
        counts[numbers] += 1
      elif character.islower():
        counts[lower_case] += 1
      elif character.isupper():
        counts[upper_case] += 1
      else:
        counts[special_char] += 1

    passwords_structure.append(counts / len(password))
  return np.array(passwords_structure)

feat_structure = calculate_password_structure(passwords)
for i in range(20):
  print(passwords[i], feat_structure[i])

Manuel - Password score feature by characters appearance frequency

In [None]:
# Calculate passwords score with character weights
def calculate_password_scores(passwords):
  scores = []
  for password in passwords:
    score = 0
    for character in password:
      # If weight does not exists in the dictionary, it is a very rare character so 1.001 points are given
      try:
        score += weights_dict[character]
      except:
        score += 1
    scores.append(score)

  return np.array(scores)

feat_scores = calculate_password_scores(passwords)
for i in range(20):
  print(passwords[i], feat_scores[i])

In [None]:
'''
# Create the feature vector
# Currently we are not including "feat_score"

# divide "feat_structure" in its sub-features since hstack() needs the same
# dimension for every array

def create_feature_vector(features_list):
  # 4 features in the list


  for feature in features_list:
    if len(feature.shape) != 1: # it's 2-d feature_structure or feat_length
      if abs(feature.shape[1]) > 1: # it's the 2-d feature_structure --> decompose it to 4 different sub-features



  features_vector = np.hstack(feature)
  return feature_vector

print("\nFeature length: ", feat_length)
print("\nFeature rockyou: ", feat_rockyou)
print("\nFeature jtr: ", feat_jtr)
print("\nFeature structure --> numbers: ", feat_structure[:, 0])
print("\nFeature structure --> lowercase: ", feat_structure[:, 1])
print("\nFeature structure --> uppercase: ", feat_structure[:, 2])
print("\nFeature structure --> special: ", feat_structure[:, 3])

print("feature shape: ", len(feat_length.shape))


feature_vector = np.hstack((feat_length, feat_rockyou, feat_jtr, feat_structure[:, 0],
                            feat_structure[:, 1], feat_structure[:, 2], feat_structure[:, 3]))
print(feature_vector)
print(feature_vector.shape)
'''




Manuel - Neural Network

Manuel - Dataset Creation

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    return self.features[idx], self.labels[idx]

Manuel - Trainloader creation

In [None]:
def create_DataLoader(trainset, testset):
  # trainloader is what holds the data loader object which takes care of shuffling the data and constructing the batches
  trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
  # No need to shuffle test data.
  testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

In [None]:
class NeuralNetwork(nn.Module):


  def __init__(self, n_input, n_output):
    super().__init__()
    self.n_input = n_input
    self.n_output = n_output
    self.mid_n = int((n_input + n_output) / 2)

    # Define Layers:
    self.l1 = nn.Linear(self.n_input, self.mid_n) # layer 1
    self.l2 = nn.Linear(self.mid_n, self.n_output) # layer 2
    self.l3 = nn.Linear(self.n_output, self.n_output) # layer 3

    # Define Activation functions:
    self.relu = nn.ReLU()
    self.softmax = nn.LogSoftmax(dim = 1)

    # Weights initialization
    nn.init.kaiming_normal_(self.l1.weight, mode='fan_in', nonlinearity='relu') # Using HE because more optimized for ReLU activated layers
    nn.init.zeros_(self.l1.bias)
    nn.init.kaiming_normal_(self.l2.weight, mode='fan_in', nonlinearity='relu')
    nn.init.zeros_(self.l2.bias)
    nn.init.normal_(self.l3.weight) # Using normal distribution for LogSoftMax activated layer
    nn.init.normal_(self.l3.bias)


  def forward(self, x):
    '''
    Layers: 3
    Activation Functions:
    RELU for first two layers
    Log Softmax for last layer
    '''
    x = self.l1(x)
    x = self.relu(x)
    x = self.l2(x)
    x = self.relu(x)
    x = self.l3(x)
    x = self.softmax(x)
    return x


  def train_model(self, tr_loader, n_epochs, criterion, optimizer):
    # losses --> {[*idx_first_epoch*]: loss_1, [*idx_second_epoch*]: loss_2, ...}
    losses = {}
    for e in range(n_epochs):
      for features, labels in tr_loader:
        optimizer.zero_grad() # set optimizer gradients to zero:
        output = self(features) # Intial output
        loss = criterion(output, labels.long()) # Loss Caluclation
        loss.backward() # Pass loss function gradients to pervious layers:
        optimizer.step() # Update Weights
        losses[e] = loss.item()

    return losses


  def test_model(self, ts_loader):
    correct = 0
    total = 0
    for features, labels in ts_loader:
      outputs = self(features)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum()

    return correct / total



# Define macros.
number_of_features = 8 # Rockyou, johntheripper , password len, numbers/len, lowercases/len, uppercases/len, special characters/len, password score
number_of_outputs = 3 # Unsecure, intermediate, secure

# Initialize NN
NN = NeuralNetwork(number_of_features, number_of_outputs)
criterion = nn.NLLLoss()
optimizer = optim.Adam(NN.parameters(), lr=0.001)