<a href="https://colab.research.google.com/github/matteoturnu/ML_Project/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import random
from importlib.util import find_spec
import json
import os
import shutil
import requests
import sys
import torch
import torch.nn as nn # Contains Required functions and layers
import torch.nn.functional as F # For neural network functions:
import torch.optim as optim # Contains Optimization function available in PyTorch.
project_folder = "/content/password_strength_classifier"

Creation of useful directories and download of password dictionaries

In [2]:
def download_file(url, dest_folder_name):
  local_filename = url.split('/')[-1]
  path = os.path.join("/{}/{}".format(dest_folder_name, local_filename))
  """with requests.get(url, stream=True) as r:
      with open(path, 'wb') as f:
          shutil.copyfileobj(r.raw, f)"""

  with open(path, 'wb') as f:
    f.write(requests.get(url, stream=True).content)

  # return local_filename
  return path

def read_file(filepath):
  with open(filepath, errors='replace', encoding='utf-8') as f:
    data = {line.split('\n')[0] for line in f.readlines()}
  return data


if os.path.exists(project_folder) is False:
  dict_dir = project_folder + "/dictionaries/"
  dataset_dir = project_folder + "/dataset/"

  os.mkdir(project_folder)
  os.mkdir(dataset_dir)

  os.mkdir(dict_dir)

  f_rockyou = download_file(
    "https://github.com/brannondorsey/naive-hashcat/releases/download/data/rockyou.txt",
    dict_dir)

  f_jtr = download_file(
      "https://raw.githubusercontent.com/danielmiessler/SecLists/master/Passwords/Software/john-the-ripper.txt",
      dict_dir)


data_rockyou = read_file(f_rockyou)
data_jtr = read_file(f_jtr)

print("\nRock You")
import itertools
print([val for i, val in enumerate(itertools.islice(data_rockyou, 5))])
print("\nJohn The Ripper")
print([val for i, val in enumerate(itertools.islice(data_jtr, 5))])



Rock You
['', 'tinelly123', 'belsek100', 'CAES415', '051362916']

John The Ripper
['soccer', 'xavier', 'christopher', 'farout', 'doogie']


In [3]:
# download the dataset

if find_spec("kaggle") is None:
  ! pip install -q kaggle

if os.path.isdir("/root/.kaggle") is False:
  ! mkdir ~/.kaggle
  ! touch "/root/.kaggle/kaggle.json"

  token = {"username":"matteoturnu","key":"79ea644685a3e574038b40e4019b0927"}
  with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
  !chmod 600 /root/.kaggle/kaggle.json

  ! kaggle datasets download -d bhavikbb/password-strength-classifier-dataset -p $dataset_dir

Downloading password-strength-classifier-dataset.zip to /content/password_strength_classifier/dataset
  0% 0.00/5.01M [00:00<?, ?B/s]
100% 5.01M/5.01M [00:00<00:00, 111MB/s]


In [4]:
# read file

file_path = os.path.join(dataset_dir, "password-strength-classifier-dataset.zip")

pswd_df = pd.read_csv(file_path, on_bad_lines='skip')
print(pswd_df)

            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669640 rows x 2 columns]


In [5]:
def word_split(inputs):
    character=[]
    for i in inputs:
        character.append(i)
    return character

# unique values of strength feature
pswd_df['strength'].unique()

# number of missing values in dataset
pswd_df.isnull().sum()

# remove missing values
pswd_df.dropna(inplace=True)
pswd_df.isnull().sum()

print(pswd_df)

psw_array = np.array(pswd_df)
print(psw_array)

# ??? PROBLEM: it is shown that if shuffled then
# there are several duplicates of a password
#random.shuffle(psw_array)

labels = np.array([p[1] for p in psw_array])
passwords = np.array([p[0] for p in psw_array])

print("Len of passwords: ", len(passwords))
print("Len of UNIQUE passwords: ", len(np.unique(passwords)))


"""vectorizer = TfidfVectorizer(tokenizer=word_split)
X = vectorizer.fit_transform(passwords)

print(vectorizer.get_feature_names_out())"""


            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669639 rows x 2 columns]
[['kzde5577' 1]
 ['kino3434' 1]
 ['visi7k1yr' 1]
 ...
 ['184520socram' 1]
 ['marken22a' 1]
 ['fxx4pw4g' 1]]
Len of passwords:  669639
Len of UNIQUE passwords:  669639


'vectorizer = TfidfVectorizer(tokenizer=word_split)\nX = vectorizer.fit_transform(passwords)\n\nprint(vectorizer.get_feature_names_out())'

Manuel - Calculate characters weights

In [6]:
# Create a dictionary containing occurrencies of all the characters in the dataset
def calculate_occurrencies(passwords):
  weights_dict = dict()
  for password in passwords:
    for character in password:
      if character in weights_dict:
        weights_dict[character] += 1
      else:
        weights_dict[character] = 1
  return weights_dict

# Converts dictionary values containing occurrencies to weights
def calculate_weights(occurrencies_dict):
  for key in occurrencies_dict:
    occurrencies_dict[key] = 1/occurrencies_dict[key]
  return occurrencies_dict

weights_dict = calculate_weights(calculate_occurrencies(passwords))
print(weights_dict)



{'k': 7.3281547706287556e-06, 'z': 1.1057176660511506e-05, 'd': 7.674773786042656e-06, 'e': 3.490267389384701e-06, '5': 5.885330225878974e-06, '7': 6.647786951723771e-06, 'i': 3.943310961615811e-06, 'n': 4.889234394786121e-06, 'o': 4.010121546784083e-06, '3': 4.65829105934197e-06, '4': 5.885780542786681e-06, 'v': 1.2305874824641283e-05, 's': 5.169401279943757e-06, '1': 2.593300467572074e-06, 'y': 7.647072318362915e-06, 'r': 5.0159507233000945e-06, 'm': 6.236241292648095e-06, 'g': 7.915525511738725e-06, '2': 3.5612028318684918e-06, 'l': 5.932711189686575e-06, 'a': 2.449347493827644e-06, 'b': 9.556666252544462e-06, 'h': 8.535773426430169e-06, 'A': 2.313690104347424e-05, 'V': 8.035355564483728e-05, 'Y': 4.716313729189266e-05, 'q': 1.813598360507082e-05, 'D': 3.385698808234019e-05, 'E': 4.094333442515558e-05, 'M': 2.4665778698633517e-05, 'Z': 7.958615200955034e-05, 'f': 1.2866038803973032e-05, 'N': 2.8521719289238757e-05, 't': 6.27474602965445e-06, 'u': 6.222233283970282e-06, '6': 6.651810

In [40]:
# LENGTH feature
# lengths = np.array([len(p) for p in passwords]).reshape(-1, 1)
lengths = np.array([len(p) for p in passwords])
max_len = np.max(lengths)
min_len = np.min(lengths)

print("Lengths shape: ", lengths.shape)

print(lengths)
print(f"Max: {max_len} --> {passwords[lengths == max_len]} \nMin: {min_len} --> {passwords[lengths == min_len]}")

# normalize() from sklearn accept 2D arrays only --> reshape so that we have 1 row (1 "sample")
# and make sklearn compute the remaining number of columns for us
feat_length = normalize(lengths.reshape(1, -1))

#feat_length = feat_length.flatten()
print(feat_length)


# ROCKYOU feature

# numpy array of 1 and 0 (0 if found, otherwise is 1)
# int() used to convert boolean into number
feat_rockyou = np.array([int(p not in data_rockyou) for p in passwords])
print(feat_rockyou)

# passwords found in rockyou file
print(passwords[feat_rockyou == 0])


# JTR feature
feat_jtr = np.array([int(p not in data_jtr) for p in passwords])
print(feat_jtr)
print(passwords[feat_jtr == 0])




Lengths shape:  (669639,)
[ 8  8  9 ... 12  9  8]
Max: 220 --> ['In0LnUoff8wfayJGqzelyDqg4AMl9gBhgl3T2iZeONzh5gPqTyP8IVLsQ960aZwlZcdSjE1XCi8taVT5dWSB3wNJwMqpzmlSIKh21A8TNxpSJ5nu2hULRgjHZF6fubMkwhjPNRryi0BOyas9zlp6JUsNN0RQ4KRma8satN1JwEOAxlhMgJ7OwgRBbwuqCCiwhdylowbq0xpBsXZbhexgZnq4yOUb'] 
Min: 1 --> ['M' '9' '1']
[[0.00094165 0.00094165 0.00105936 ... 0.00141248 0.00105936 0.00094165]]
[1 1 1 ... 1 1 1]
['megzy123' 'intel1' 'schalke04' ... 'jenny1989' 'skyline123' 'hattrick9']
[1 1 1 ... 1 1 1]
['martin1' 'harley1' 'star69' 'dagger1' 'c00per' 'family1' 'michael1'
 'ashley1' 'matti1' 'rocket1' 'florida1' 'scott1' 'front242' 'teddy1'
 'viper1' 'amanda1' 'phoenix1' 'daniel1' 'rasta1' 'david1' 'rocky1'
 'hello123' 'randy1' 'justin1' 'seven7' 'saturn5' 'vampire' 'lucky1'
 'master1' 'babylon5' 'xxx123' 'mickey1' 'montana3' '1234qwer' 'happy123'
 'cindy1' 'terry1' 'chester1' 'steph1' 'roger1' 'carol1' 'Golden' '654321'
 'trustno1' 'pussy1' 'parola' 'simba1' 'peter1' 'william1' 'billy1'
 'rambo

Manuel - Password structure feature

In [18]:
# Define macros
numbers = 0
lower_case = 1
upper_case = 2
special_char = 3

# Counts numbers, lowercases, uppercases and other characters in a password
def calculate_password_structure(passwords):
  passwords_structure = []
  for password in passwords:
    counts = np.array([0, 0, 0, 0])
    for character in password:
      if character.isnumeric():
        counts[numbers] += 1
      elif character.islower():
        counts[lower_case] += 1
      elif character.isupper():
        counts[upper_case] += 1
      else:
        counts[special_char] += 1

    passwords_structure.append(counts / len(password))
  return np.array(passwords_structure)

feat_structure = calculate_password_structure(passwords)
for i in range(20):
  print(passwords[i], feat_structure[i])

kzde5577 [0.5 0.5 0.  0. ]
kino3434 [0.5 0.5 0.  0. ]
visi7k1yr [0.22222222 0.77777778 0.         0.        ]
megzy123 [0.375 0.625 0.    0.   ]
lamborghin1 [0.09090909 0.90909091 0.         0.        ]
AVYq1lDE4MgAZfNt [0.125  0.3125 0.5625 0.    ]
u6c8vhow [0.25 0.75 0.   0.  ]
v1118714 [0.875 0.125 0.    0.   ]
universe2908 [0.33333333 0.66666667 0.         0.        ]
as326159 [0.75 0.25 0.   0.  ]
asv5o9yu [0.25 0.75 0.   0.  ]
612035180tok [0.75 0.25 0.   0.  ]
jytifok873 [0.3 0.7 0.  0. ]
WUt9IZzE0OQ7PkNE [0.1875 0.1875 0.625  0.    ]
jerusalem393 [0.25 0.75 0.   0.  ]
g067057895 [0.9 0.1 0.  0. ]
52558000aaa [0.72727273 0.27272727 0.         0.        ]
idofo673 [0.375 0.625 0.    0.   ]
6975038lp [0.77777778 0.22222222 0.         0.        ]
sbl571017 [0.66666667 0.33333333 0.         0.        ]


Manuel - Password score feature by characters appearance frequency

In [19]:
# Calculate passwords score with character weights
def calculate_password_scores(passwords):
  scores = []
  for password in passwords:
    score = 0
    for character in password:
      # If weight does not exists in the dictionary, it is a very rare character so 1.001 points are given
      try:
        score += weights_dict[character]
      except:
        score += 1.001
    scores.append(score)

  return np.array(scores)

feat_scores = calculate_password_scores(passwords)
for i in range(20):
  print(passwords[i], feat_scores[i])

kzde5577 5.4616606961773106e-05
kino3434 4.125896487807207e-05
visi7k1yr 5.459416325940427e-05
megzy123 4.7159077531428475e-05
lamborghin1 6.107818326093385e-05
AVYq1lDE4MgAZfNt 0.00044096855367907917
u6c8vhow 6.188592008576835e-05
v1118714 4.137730220503662e-05
universe2908 6.333500166465563e-05
as326159 3.605556829231577e-05
asv5o9yu 4.8776264951759156e-05
612035180tok 5.7712349554057255e-05
jytifok873 7.003875790043788e-05
WUt9IZzE0OQ7PkNE 0.0005411506299646499
jerusalem393 6.290846358193651e-05
g067057895 5.8876546000050805e-05
52558000aaa 4.671704378202566e-05
idofo673 5.046225561179805e-05
6975038lp 5.401720794530435e-05
sbl571017 4.902200037845903e-05


In [45]:
# Create the feature vector
# Currently we are not including "feat_score"

# divide "feat_structure" in its sub-features since hstack() needs the same
# dimension for every array

def create_feature_vector(features_list):
  # 4 features in the list


  for feature in features_list:
    if len(feature.shape) != 1: # it's 2-d feature_structure or feat_length
      if abs(feature.shape[1]) > 1: # it's the 2-d feature_structure --> decompose it to 4 different sub-features



  features_vector = np.hstack(feature)
  return feature_vector

print("\nFeature length: ", feat_length)
print("\nFeature rockyou: ", feat_rockyou)
print("\nFeature jtr: ", feat_jtr)
print("\nFeature structure --> numbers: ", feat_structure[:, 0])
print("\nFeature structure --> lowercase: ", feat_structure[:, 1])
print("\nFeature structure --> uppercase: ", feat_structure[:, 2])
print("\nFeature structure --> special: ", feat_structure[:, 3])

print("feature shape: ", len(feat_length.shape))


feature_vector = np.hstack((feat_length, feat_rockyou, feat_jtr, feat_structure[:, 0],
                            feat_structure[:, 1], feat_structure[:, 2], feat_structure[:, 3]))
print(feature_vector)
print(feature_vector.shape)





Feature length:  [[0.00094165 0.00094165 0.00105936 ... 0.00141248 0.00105936 0.00094165]]

Feature rockyou:  [1 1 1 ... 1 1 1]

Feature jtr:  [1 1 1 ... 1 1 1]

Feature structure --> numbers:  [0.5        0.5        0.22222222 ... 0.5        0.22222222 0.25      ]

Feature structure --> lowercase:  [0.5        0.5        0.77777778 ... 0.5        0.77777778 0.75      ]

Feature structure --> uppercase:  [0. 0. 0. ... 0. 0. 0.]

Feature structure --> special:  [0. 0. 0. ... 0. 0. 0.]
feature shape:  2


ValueError: ignored

Manuel - Neural Network

In [None]:
class NeuralNetwork(nn.Module):


  def __init__(self, n_input, n_output):
    super().__init__()
    self.n_input = n_input
    self.n_output = n_output
    self.mid_n = int((n_input + n_output) / 2)

    # Define Layers:
    self.l1 = nn.Linear(self.n_input, self.mid_n) # layer 1
    self.l2 = nn.Linear(self.mid_n, self.n_output) # layer 2
    self.l3 = nn.Linear(self.n_output, self.n_output) # layer 3

    # Define Activation functions:
    self.relu = nn.ReLU()
    self.softmax = nn.LogSoftmax(dim = 1)

    # Weights initialization
    nn.init.kaiming_normal_(self.l1.weight, mode='fan_in', nonlinearity='relu') # Using HE because more optimized for ReLU activated layers
    nn.init.zeros_(self.l1.bias)
    nn.init.kaiming_normal_(self.l2.weight, mode='fan_in', nonlinearity='relu')
    nn.init.zeros_(self.l2.bias)
    nn.init.normal_(self.l3.weight) # Using normal distribution for LogSoftMax activated layer
    nn.init.normal_(self.l3.bias)


  def forward(self, x):
    '''
    Layers: 3
    Activation Functions:
    RELU for first two layers
    Log Softmax for last layer
    '''
    x = self.l1(x)
    x = self.relu(x)
    x = self.l2(x)
    x = self.relu(x)
    x = self.l3(x)
    x = self.softmax(x)
    return x


  def train_model(self, tr_loader, n_epochs, criterion, optimizer):
    # losses --> {[*idx_first_epoch*]: loss_1, [*idx_second_epoch*]: loss_2, ...}
    losses = {}
    for e in range(n_epochs):
      for features, labels in tr_loader:
        optimizer.zero_grad() # set optimizer gradients to zero:
        output = self(features) # Intial output
        loss = criterion(output, labels.long()) # Loss Caluclation
        loss.backward() # Pass loss function gradients to pervious layers:
        optimizer.step() # Update Weights
        losses[e] = loss.item()

    return losses


  def test_model(self, ts_loader):
    correct = 0
    total = 0
    for features, labels in ts_loader:
      outputs = self(features)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum()

    return correct / total



# Define macros.
number_of_features = 8 # Rockyou, johntheripper , password len, numbers/len, lowercases/len, uppercases/len, special characters/len, password score
number_of_outputs = 3 # Unsecure, intermediate, secure

NN = NeuralNetwork(number_of_features, number_of_outputs)
criterion = nn.NLLLoss()
optimizer = optim.Adam(NN.parameters(), lr=0.001)