<a href="https://colab.research.google.com/github/matteoturnu/ML_Project/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize as nrm
import random
from importlib.util import find_spec
import json
import os
import shutil
import requests
import sys
import torch
import torch.nn as nn # Contains Required functions and layers
import torch.nn.functional as F # For neural network functions:
import torch.optim as optim # Contains Optimization function available in PyTorch.
project_folder = "/content/password_strength_classifier"

Creation of useful directories and download of password dictionaries

In [65]:
def download_file(url, dest_folder_name):
  """Download `url` into `dest_folder_name` and return the path of the saved file.

  The local file name is the last segment of the URL.

  Args:
    url: HTTP(S) address of the file to fetch.
    dest_folder_name: existing directory that will receive the file.

  Returns:
    The path of the downloaded file.

  Raises:
    requests.HTTPError: if the server answers with an error status. Checked
      before the file is opened, so an error page is never saved as data.
  """
  local_filename = url.split('/')[-1]
  # os.path.join honours the folder exactly as given; the previous
  # "/{}/{}".format(...) forced every destination to the filesystem root.
  path = os.path.join(dest_folder_name, local_filename)

  with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(path, 'wb') as f:
      # iter_content (unlike response.raw) undoes gzip/deflate transfer
      # encoding, and chunking keeps large files out of memory.
      for chunk in response.iter_content(chunk_size=1 << 20):
        f.write(chunk)

  return path

def read_file(filepath):
  """Return the set of distinct lines of a UTF-8 text file, without newlines.

  Bytes that cannot be decoded are replaced instead of raising. Empty lines
  contribute the empty string to the result.
  """
  unique_lines = set()
  with open(filepath, encoding='utf-8', errors='replace') as handle:
    for raw_line in handle:
      unique_lines.add(raw_line.split('\n')[0])
  return unique_lines


import itertools

# Paths are defined unconditionally: when they were only set inside the
# "folder does not exist yet" branch, re-running the cell raised NameError.
dict_dir = project_folder + "/dictionaries/"
dataset_dir = project_folder + "/dataset/"

# exist_ok=True makes the cell safe to run any number of times.
for directory in (project_folder, dataset_dir, dict_dir):
  os.makedirs(directory, exist_ok=True)

rockyou_url = "https://github.com/brannondorsey/naive-hashcat/releases/download/data/rockyou.txt"
jtr_url = "https://raw.githubusercontent.com/danielmiessler/SecLists/master/Passwords/Software/john-the-ripper.txt"

# Download each dictionary only when it is not already on disk.
# NOTE: a file left behind by an interrupted download is treated as present;
# delete it manually to force a new download.
f_rockyou = os.path.join(dict_dir, rockyou_url.split('/')[-1])
if not os.path.isfile(f_rockyou):
  f_rockyou = download_file(rockyou_url, dict_dir)

f_jtr = os.path.join(dict_dir, jtr_url.split('/')[-1])
if not os.path.isfile(f_jtr):
  f_jtr = download_file(jtr_url, dict_dir)

data_rockyou = read_file(f_rockyou)
data_jtr = read_file(f_jtr)

# Preview five entries of each dictionary (sets are unordered, so which
# entries are shown is arbitrary).
print("\nRock You")
print(list(itertools.islice(data_rockyou, 5)))
print("\nJohn The Ripper")
print(list(itertools.islice(data_jtr, 5)))



Rock You
['', 'bitch(!!', 'sarahbutler', 'venice288', '35511435a']

John The Ripper
['israel', 'dixie', 'allison', 'home', 'jessie']


In [66]:
# download the dataset

if find_spec("kaggle") is None:
  ! pip install -q kaggle

if os.path.isdir("/root/.kaggle") is False:
  ! mkdir ~/.kaggle
  ! touch "/root/.kaggle/kaggle.json"

  token = {"username":"matteoturnu","key":"79ea644685a3e574038b40e4019b0927"}
  with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
  !chmod 600 /root/.kaggle/kaggle.json

  ! kaggle datasets download -d bhavikbb/password-strength-classifier-dataset -p $dataset_dir

In [67]:
# read file

# Location of the archive that the Kaggle CLI stored in dataset_dir.
file_path = os.path.join(dataset_dir, "password-strength-classifier-dataset.zip")

# The .zip is handed to read_csv directly (no manual extraction step).
# on_bad_lines='skip' drops rows the CSV parser cannot tokenize instead of
# raising, so the frame may hold fewer rows than the raw file.
pswd_df = pd.read_csv(file_path, on_bad_lines='skip')
print(pswd_df)

            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669640 rows x 2 columns]


In [68]:
def word_split(inputs):
    """Return the elements of `inputs` as a list (one character per item for a string)."""
    return [element for element in inputs]

# unique values of strength feature
# (bare expression that is not the cell's last statement: evaluated, not displayed)
pswd_df['strength'].unique()

# number of missing values in dataset (not displayed either, same reason)
pswd_df.isnull().sum()

# remove missing values
# NOTE: dropna(inplace=True) mutates pswd_df, so the frame printed by the
# previous cell no longer matches the one used from here on.
pswd_df.dropna(inplace=True)
pswd_df.isnull().sum()

print(pswd_df)

# 2-D object array built from the frame: column 0 = password, column 1 = strength.
psw_array = np.array(pswd_df)
print(psw_array)

# Shuffling is disabled because random.shuffle() swaps items with
# `x[i], x[j] = x[j], x[i]`; on a 2-D ndarray both operands are row *views*,
# so rows get overwritten and the result contains duplicated passwords.
# Use np.random.shuffle(psw_array) (or an index permutation) to shuffle rows.
#random.shuffle(psw_array)

# Split the two columns into separate 1-D arrays.
labels = np.array([p[1] for p in psw_array])
passwords = np.array([p[0] for p in psw_array])

# Sanity check: equal counts mean no password appears twice.
print("Len of passwords: ", len(passwords))
print("Len of UNIQUE passwords: ", len(np.unique(passwords)))

# n_samples is read as a global by create_feature_vector further down.
n_samples = len(passwords)
# NOTE(review): n_features is not referenced by any cell visible here.
n_features = 8

# Disabled TF-IDF experiment kept as a string literal. Being the last
# expression of the cell, the string itself is echoed as the cell's output.
"""vectorizer = TfidfVectorizer(tokenizer=word_split)
X = vectorizer.fit_transform(passwords)

print(vectorizer.get_feature_names_out())"""


            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669639 rows x 2 columns]
[['kzde5577' 1]
 ['kino3434' 1]
 ['visi7k1yr' 1]
 ...
 ['184520socram' 1]
 ['marken22a' 1]
 ['fxx4pw4g' 1]]
Len of passwords:  669639
Len of UNIQUE passwords:  669639


'vectorizer = TfidfVectorizer(tokenizer=word_split)\nX = vectorizer.fit_transform(passwords)\n\nprint(vectorizer.get_feature_names_out())'

Manuel - Normalization function

In [69]:
def normalize(value, min, max):
  """Min-max scale `value` so that `min` maps to 0 and `max` maps to 1.

  The parameter names shadow the built-ins inside this function only.
  """
  offset = value - min
  span = max - min
  return offset / span

Manuel - Calculate character weights

In [70]:
# Create a dictionary containing occurrencies of all the characters in the dataset
def calculate_occurrencies(passwords):
  """Count how many times each character appears across all passwords.

  Returns:
    (counts, total): `counts` maps each character to its number of
    appearances, `total` is the number of characters seen overall.
  """
  char_counts = {}
  n_chars = 0
  for pwd in passwords:
    for ch in pwd:
      char_counts[ch] = char_counts.get(ch, 0) + 1
      n_chars += 1
  return char_counts, n_chars

# Converts occurrencies to weights
def calculate_weights(occurrencies_dict, tot_occurrencies):
  """Turn character counts into rarity weights: 1 - relative frequency.

  A character that appears often gets a weight close to 0, a rare one a
  weight close to 1.
  """
  return {
    char: 1 - (count / tot_occurrencies)
    for char, count in occurrencies_dict.items()
  }

def normalize_weights_dict(weights_dict):
  """Min-max rescale the weights so the smallest becomes 0 and the largest 1.

  Args:
    weights_dict: mapping character -> weight.

  Returns:
    A new dict with the same keys and rescaled values. An empty input gives
    an empty dict. When every weight is identical there is no range to
    rescale, so every value becomes 0.0 instead of dividing by zero.
  """
  if not weights_dict:
    return {}

  maximum = max(weights_dict.values())
  minimum = min(weights_dict.values())
  if maximum == minimum:
    return {key: 0.0 for key in weights_dict}

  return {
    key: normalize(weight, minimum, maximum)
    for key, weight in weights_dict.items()
  }

# Pipeline: raw character counts -> rarity weights -> weights rescaled to [0, 1].
occurrencies_dict, tot_occurrencies = calculate_occurrencies(passwords)
weights_dict = calculate_weights(occurrencies_dict, tot_occurrencies)
# normalized_weights_dict is read as a global by calculate_password_scores.
normalized_weights_dict = normalize_weights_dict(weights_dict)
print(normalized_weights_dict)



{'k': 0.6657636716788613, 'z': 0.7784853687869094, 'd': 0.680859037257116, 'e': 0.2982357306788878, '5': 0.5838229999191711, '7': 0.6315560007935909, 'i': 0.37886109961275827, 'n': 0.4990337300469549, 'o': 0.3892096181212973, '3': 0.47419728562645974, '4': 0.5838548415145818, 'v': 0.8009630857935044, 's': 0.5261848135184723, '1': 0.055509698215155584, 'y': 0.6797029424083516, 'r': 0.5116895395460378, 'm': 0.6072412686671358, 'g': 0.6905658251504523, '2': 0.312214191064269, 'l': 0.5871467726093695, 'a': 0.0, 'b': 0.7437045491842426, 'h': 0.7130508902175275, 'A': 0.8941389420262521, 'V': 0.9695202451312972, 'Y': 0.9480688072383304, 'q': 0.8649475470949451, 'D': 0.9276583445799487, 'E': 0.9401794396369086, 'M': 0.9007007600343888, 'Z': 0.9692263227121206, 'f': 0.8096288984522538, 'N': 0.9141256665303191, 't': 0.6096514325043904, 'u': 0.6063570520561109, '6': 0.6317788919614675, 'c': 0.6825099015114968, '8': 0.6026805724628994, 'w': 0.761756284428726, '9': 0.5184987422569818, '0': 0.387007

In [71]:
# LENGTH feature
# lengths = np.array([len(p) for p in passwords]).reshape(-1, 1)
lengths = np.array([len(p) for p in passwords])
# max_len / min_len are only used for the diagnostic print below.
max_len = np.max(lengths)
min_len = np.min(lengths)

print("Lengths shape: ", lengths.shape)

print(lengths)
print(f"Max: {max_len} --> {passwords[lengths == max_len]} \nMin: {min_len} --> {passwords[lengths == min_len]}")

# normalize() from sklearn accept 2D arrays only --> reshape so that we have 1 row (1 "sample")
# and make sklearn compute the remaining number of columns for us
# NOTE(review): with a single row and sklearn's default norm, every length is
# divided by the L2 norm of the *whole* length vector, so the feature scale
# depends on how many passwords are in the dataset. This is not the min-max
# scaling done by the local normalize() helper -- confirm it is intended.
# The result keeps the shape (1, n_samples).
feat_length = nrm(lengths.reshape(1, -1))

#feat_length = feat_length.flatten()
print(feat_length)


# ROCKYOU feature

# numpy array of 1 and 0 (0 if found, otherwise is 1)
# int() used to convert boolean into number
feat_rockyou = np.array([int(p not in data_rockyou) for p in passwords])
print(feat_rockyou)

# passwords found in rockyou file
print(passwords[feat_rockyou == 0])


# JTR feature
# Same encoding as the rockyou feature: 0 if the password is in the John the
# Ripper list, 1 otherwise.
feat_jtr = np.array([int(p not in data_jtr) for p in passwords])
print(feat_jtr)
print(passwords[feat_jtr == 0])




Lengths shape:  (669639,)
[ 8  8  9 ... 12  9  8]
Max: 220 --> ['In0LnUoff8wfayJGqzelyDqg4AMl9gBhgl3T2iZeONzh5gPqTyP8IVLsQ960aZwlZcdSjE1XCi8taVT5dWSB3wNJwMqpzmlSIKh21A8TNxpSJ5nu2hULRgjHZF6fubMkwhjPNRryi0BOyas9zlp6JUsNN0RQ4KRma8satN1JwEOAxlhMgJ7OwgRBbwuqCCiwhdylowbq0xpBsXZbhexgZnq4yOUb'] 
Min: 1 --> ['M' '9' '1']
[[0.00094165 0.00094165 0.00105936 ... 0.00141248 0.00105936 0.00094165]]
[1 1 1 ... 1 1 1]
['megzy123' 'intel1' 'schalke04' ... 'jenny1989' 'skyline123' 'hattrick9']
[1 1 1 ... 1 1 1]
['martin1' 'harley1' 'star69' 'dagger1' 'c00per' 'family1' 'michael1'
 'ashley1' 'matti1' 'rocket1' 'florida1' 'scott1' 'front242' 'teddy1'
 'viper1' 'amanda1' 'phoenix1' 'daniel1' 'rasta1' 'david1' 'rocky1'
 'hello123' 'randy1' 'justin1' 'seven7' 'saturn5' 'vampire' 'lucky1'
 'master1' 'babylon5' 'xxx123' 'mickey1' 'montana3' '1234qwer' 'happy123'
 'cindy1' 'terry1' 'chester1' 'steph1' 'roger1' 'carol1' 'Golden' '654321'
 'trustno1' 'pussy1' 'parola' 'simba1' 'peter1' 'william1' 'billy1'
 'rambo

Manuel - Password structure feature

In [72]:
# Define macros
# Column index of each character class in a structure vector.
numbers = 0
lower_case = 1
upper_case = 2
special_char = 3

def calculate_password_structure(passwords):
  """Compute, for every password, the share of each character class.

  Each password yields four fractions -- numeric, lowercase, uppercase and
  everything else -- obtained by dividing the class counts by the password
  length. The rows are stacked into a single array.
  """
  rows = []
  for pwd in passwords:
    tally = np.zeros(4, dtype=int)
    for ch in pwd:
      # The first matching class wins; the order of the checks matters.
      if ch.isnumeric():
        slot = numbers
      elif ch.islower():
        slot = lower_case
      elif ch.isupper():
        slot = upper_case
      else:
        slot = special_char
      tally[slot] += 1
    rows.append(tally / len(pwd))
  return np.array(rows)

feat_structure = calculate_password_structure(passwords)
# Preview the first 20 passwords next to their structure vector
# (indexing assumes the dataset has at least 20 rows).
for i in range(20):
  print(passwords[i], feat_structure[i])

kzde5577 [0.5 0.5 0.  0. ]
kino3434 [0.5 0.5 0.  0. ]
visi7k1yr [0.22222222 0.77777778 0.         0.        ]
megzy123 [0.375 0.625 0.    0.   ]
lamborghin1 [0.09090909 0.90909091 0.         0.        ]
AVYq1lDE4MgAZfNt [0.125  0.3125 0.5625 0.    ]
u6c8vhow [0.25 0.75 0.   0.  ]
v1118714 [0.875 0.125 0.    0.   ]
universe2908 [0.33333333 0.66666667 0.         0.        ]
as326159 [0.75 0.25 0.   0.  ]
asv5o9yu [0.25 0.75 0.   0.  ]
612035180tok [0.75 0.25 0.   0.  ]
jytifok873 [0.3 0.7 0.  0. ]
WUt9IZzE0OQ7PkNE [0.1875 0.1875 0.625  0.    ]
jerusalem393 [0.25 0.75 0.   0.  ]
g067057895 [0.9 0.1 0.  0. ]
52558000aaa [0.72727273 0.27272727 0.         0.        ]
idofo673 [0.375 0.625 0.    0.   ]
6975038lp [0.77777778 0.22222222 0.         0.        ]
sbl571017 [0.66666667 0.33333333 0.         0.        ]


Manuel - Password score feature based on character frequency

In [73]:
# Calculate passwords score with character weights
def calculate_password_scores(passwords, weights=None):
  """Score each password as the mean rarity weight of its characters.

  Args:
    passwords: iterable of non-empty strings.
    weights: mapping character -> weight. Defaults to the module-level
      `normalized_weights_dict`.

  Returns:
    A 1-D numpy array with one score per password.

  Raises:
    ZeroDivisionError: if a password is empty.
  """
  if weights is None:
    weights = normalized_weights_dict

  scores = []
  for password in passwords:
    # A character missing from the table is treated as maximally rare and
    # contributes 1. A plain lookup with a default replaces the former bare
    # `except:`, which also swallowed KeyboardInterrupt and any real error.
    score = sum(weights.get(character, 1) for character in password)
    scores.append(score / len(password))

  return np.array(scores)

feat_scores = calculate_password_scores(passwords)
# Preview the first 20 passwords next to their score
# (indexing assumes the dataset has at least 20 rows).
for i in range(20):
  print(passwords[i], feat_scores[i])

kzde5577 0.6067627262284123
kino3434 0.5061215467177443
visi7k1yr 0.5143435501310545
megzy123 0.48701903882470265
lamborghin1 0.47054663557917564
AVYq1lDE4MgAZfNt 0.7849414053410604
u6c8vhow 0.6485382870691287
v1118714 0.35513666167814983
universe2908 0.47833016142056034
as326159 0.38777582782024717
asv5o9yu 0.5130924067592362
612035180tok 0.42952944653592046
jytifok873 0.6007950601438756
WUt9IZzE0OQ7PkNE 0.8146300190314772
jerusalem393 0.47239027508689135
g067057895 0.5648297331919243
52558000aaa 0.3479442464797782
idofo673 0.5481625562432801
6975038lp 0.5715955551310014
sbl571017 0.4668886869533008


In [74]:
from sklearn.model_selection import train_test_split
# Create the feature vector and split dataset
# Currently we are not including "feat_score"

# divide "feat_structure" in its sub-features since hstack() needs the same
# dimension for every array

def create_feature_vector(features_list, n_feat, n_rows=None):
  """Stack several features side by side into one (n_rows, n_feat) matrix.

  Args:
    features_list: arrays holding n_rows values per column, e.g.
      [length, rockyou, jtr, structure]. Each is reshaped to (n_rows, k),
      so 1-D arrays, (1, n_rows) rows and (n_rows, k) blocks are all accepted.
    n_feat: total number of columns of the result.
    n_rows: number of samples. Defaults to the module-level `n_samples`.

  Returns:
    A float array of shape (n_rows, n_feat). Columns not covered by any
    feature stay at zero.

  Raises:
    ValueError: if the features need more than `n_feat` columns.
  """
  if n_rows is None:
    n_rows = n_samples
  feature_vector = np.zeros(shape=(n_rows, n_feat))

  # Track the next free column explicitly. Using the list index as the
  # offset is only correct while every earlier feature is one column wide.
  next_col = 0
  for feature in features_list:
    # "-1" lets numpy infer the number of columns (4 for the structure feature).
    block = np.asarray(feature).reshape(n_rows, -1)
    width = block.shape[1]
    if next_col + width > n_feat:
      raise ValueError(
        "features need at least {} columns but n_feat is {}".format(next_col + width, n_feat))
    feature_vector[:, next_col:next_col + width] = block
    next_col += width

  return feature_vector




# Show every individual feature before stacking them.
print("\nFeature length: ", feat_length)
print("\nFeature rockyou: ", feat_rockyou)
print("\nFeature jtr: ", feat_jtr)
print("\nFeature structure --> numbers: ", feat_structure[:, 0])
print("\nFeature structure --> lowercase: ", feat_structure[:, 1])
print("\nFeature structure --> uppercase: ", feat_structure[:, 2])
print("\nFeature structure --> special: ", feat_structure[:, 3])

# 1 (length) + 1 (rockyou) + 1 (jtr) + 4 (structure) = 7 columns.
# feat_scores is not part of the list.
# NOTE(review): any model built later must use this same number of inputs.
feat_list = [feat_length, feat_rockyou, feat_jtr, feat_structure]
feature_vector = create_feature_vector(feat_list, n_feat=7)
print("\n")
print(feature_vector.shape)
print(feature_vector)

# 80% of the dataset is used for training, 20% for testing
# (random_state is fixed so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(feature_vector, labels, test_size=0.20, random_state=42)
print(X_train.shape)





Feature length:  [[0.00094165 0.00094165 0.00105936 ... 0.00141248 0.00105936 0.00094165]]

Feature rockyou:  [1 1 1 ... 1 1 1]

Feature jtr:  [1 1 1 ... 1 1 1]

Feature structure --> numbers:  [0.5        0.5        0.22222222 ... 0.5        0.22222222 0.25      ]

Feature structure --> lowercase:  [0.5        0.5        0.77777778 ... 0.5        0.77777778 0.75      ]

Feature structure --> uppercase:  [0. 0. 0. ... 0. 0. 0.]

Feature structure --> special:  [0. 0. 0. ... 0. 0. 0.]


(669639, 7)
[[9.41651385e-04 1.00000000e+00 1.00000000e+00 ... 5.00000000e-01
  0.00000000e+00 0.00000000e+00]
 [9.41651385e-04 1.00000000e+00 1.00000000e+00 ... 5.00000000e-01
  0.00000000e+00 0.00000000e+00]
 [1.05935781e-03 1.00000000e+00 1.00000000e+00 ... 7.77777778e-01
  0.00000000e+00 0.00000000e+00]
 ...
 [1.41247708e-03 1.00000000e+00 1.00000000e+00 ... 5.00000000e-01
  0.00000000e+00 0.00000000e+00]
 [1.05935781e-03 1.00000000e+00 1.00000000e+00 ... 7.77777778e-01
  0.00000000e+00 0.00000000e+

In [75]:
# SVC
from sklearn.svm import LinearSVC
# Note: SVC may be impractical beyond ten of thousands of samples.
# Use "LinearSVC" or "SGDClassifier" instead

# Baseline 1: linear SVM fitted on the training split.
# `clf` and `accuracy` are rebound by the logistic-regression cell below.
clf = LinearSVC(max_iter=1000)
clf.fit(X_train, y_train)
# score() returns the mean accuracy on the held-out split (a fraction in [0, 1]).
accuracy = clf.score(X_test, y_test)
print("Accuracy: ", accuracy * 100)



Accuracy:  82.86168687653067


In [76]:
from sklearn.linear_model import LogisticRegression

# Baseline 2: logistic regression on the same split.
# This rebinds `clf` and `accuracy`, replacing the LinearSVC results.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
# score() returns the mean accuracy on the held-out split (a fraction in [0, 1]).
accuracy = clf.score(X_test, y_test)
print("Accuracy: ", accuracy * 100)


Accuracy:  82.8594468669733


Manuel - Dataset Creation

In [77]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  """Map-style dataset pairing one feature row with one class label.

  Features are stored as float32 and labels as int64 tensors. numpy builds
  float64 arrays by default, while nn.Linear parameters are float32; feeding
  float64 batches to the network raises a dtype-mismatch RuntimeError, so
  the conversion is done once here.
  """

  def __init__(self, features, labels):
    """Store the samples.

    Args:
      features: array-like of shape (n_samples, n_features).
      labels: array-like of n_samples integer class indices.

    Raises:
      ValueError: if features and labels have different lengths.
    """
    self.features = torch.as_tensor(features, dtype=torch.float32)
    self.labels = torch.as_tensor(labels, dtype=torch.long)
    if len(self.features) != len(self.labels):
      raise ValueError(
        "features and labels differ in length: {} vs {}".format(
          len(self.features), len(self.labels)))

  def __len__(self):
    """Number of samples."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return the (features, label) pair stored at position `idx`."""
    return self.features[idx], self.labels[idx]


# Wrap the two splits produced by train_test_split so that a DataLoader can batch them.
trainset = CustomDataset(X_train, y_train)
testset = CustomDataset(X_test, y_test)

Manuel - DataLoader creation (train and test)

In [81]:
def create_DataLoader(trainset, testset, batch_size=64):
  """Build the training and test DataLoaders.

  Args:
    trainset: dataset used for training; reshuffled at every epoch.
    testset: dataset used for evaluation; kept in its original order.
    batch_size: samples per batch for both loaders (default 64).

  Returns:
    (trainloader, testloader)
  """
  # The loader takes care of shuffling the data and building the batches.
  trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
  # No need to shuffle test data.
  testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)
  return trainloader, testloader

# Batched views of the two datasets, used to train and evaluate the network.
trainloader, testloader = create_DataLoader(trainset, testset)

Manuel - Neural Network

In [87]:
class NeuralNetwork(nn.Module):
  """Small fully connected classifier: n_input -> mid_n -> n_output -> n_output.

  The first two layers are followed by ReLU and the last by LogSoftmax, so
  the network outputs log-probabilities and must be trained with nn.NLLLoss.
  """

  def __init__(self, n_input, n_output):
    """Create and initialise the layers.

    Args:
      n_input: number of input features.
      n_output: number of classes.
    """
    super().__init__()
    self.n_input = n_input
    self.n_output = n_output
    # Hidden width: midpoint between input and output sizes, rounded down.
    self.mid_n = int((n_input + n_output) / 2)

    # Define Layers:
    self.l1 = nn.Linear(self.n_input, self.mid_n) # layer 1
    self.l2 = nn.Linear(self.mid_n, self.n_output) # layer 2
    self.l3 = nn.Linear(self.n_output, self.n_output) # layer 3

    # Define Activation functions:
    self.relu = nn.ReLU()
    self.softmax = nn.LogSoftmax(dim = 1)

    # Weights initialization
    nn.init.kaiming_normal_(self.l1.weight, mode='fan_in', nonlinearity='relu') # He init suits ReLU-activated layers
    nn.init.zeros_(self.l1.bias)
    nn.init.kaiming_normal_(self.l2.weight, mode='fan_in', nonlinearity='relu')
    nn.init.zeros_(self.l2.bias)
    nn.init.normal_(self.l3.weight) # Normal distribution for the LogSoftmax-activated layer
    nn.init.normal_(self.l3.bias)


  def forward(self, x):
    '''
    Layers: 3
    Activation Functions:
    RELU for first two layers
    Log Softmax for last layer

    x has shape (batch, n_input); the result has shape (batch, n_output)
    and holds log-probabilities.
    '''
    # Batches built from numpy arrays arrive as float64; align them with the
    # parameter dtype, otherwise the first Linear layer raises RuntimeError.
    x = x.to(self.l1.weight.dtype)
    x = self.l1(x)
    x = self.relu(x)
    x = self.l2(x)
    x = self.relu(x)
    x = self.l3(x)
    x = self.softmax(x)
    return x


  def train_model(self, tr_loader, n_epochs, criterion, optimizer):
    """Train the network for `n_epochs` passes over `tr_loader`.

    Args:
      tr_loader: iterable of (features, labels) batches.
      n_epochs: number of passes over the loader.
      criterion: loss taking (log-probabilities, int64 labels).
      optimizer: optimizer built on this module's parameters.

    Returns:
      {epoch_index: mean batch loss of that epoch}. The mean is recorded
      rather than the loss of the last mini-batch only, which is a noisy
      estimate. An epoch without batches adds no entry.
    """
    losses = {}
    self.train()
    for e in range(n_epochs):
      running_loss = 0.0
      n_batches = 0
      for features, labels in tr_loader:
        optimizer.zero_grad() # reset the gradients accumulated by the previous step
        output = self(features) # forward pass
        loss = criterion(output, labels.long()) # loss calculation
        loss.backward() # backpropagate the gradients
        optimizer.step() # update the weights
        running_loss += loss.item()
        n_batches += 1
      if n_batches:
        losses[e] = running_loss / n_batches

    return losses


  def test_model(self, ts_loader):
    """Return the accuracy on `ts_loader` as a 0-dim tensor in [0, 1].

    Raises:
      ZeroDivisionError: if the loader yields no samples.
    """
    correct = 0
    total = 0
    was_training = self.training
    self.eval()
    try:
      # Evaluation needs no gradients: skipping the autograd graph saves
      # memory and time.
      with torch.no_grad():
        for features, labels in ts_loader:
          outputs = self(features)
          _, predicted = torch.max(outputs, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum()
    finally:
      # Leave the module in the mode it was in before the evaluation.
      self.train(was_training)

    return correct / total



# Define macros.
# The input width is taken from the training matrix instead of being
# hard-coded: the feature vector currently has 7 columns (length, rockyou,
# jtr and the four structure ratios), and a fixed 8 made the first Linear
# layer fail with a shape-mismatch RuntimeError.
number_of_features = X_train.shape[1]
number_of_outputs = 3 # Unsecure, intermediate, secure

# Initialize NN
NN = NeuralNetwork(number_of_features, number_of_outputs)
# NLLLoss pairs with the LogSoftmax output of the network.
criterion = nn.NLLLoss()
optimizer = optim.Adam(NN.parameters(), lr=0.001)

Manuel - Network training

In [86]:
# Training and evaluation are currently disabled: everything below is a
# string literal, so none of it runs. The output saved with this cell is a
# RuntimeError left over from an earlier run.
'''
# Train the model
n_epochs = 5
losses = NN.train_model(trainloader, n_epochs, criterion, optimizer)
print(losses)

# Test the models
accuracy = NN.test_model(testloader)
print(f"Accuracy of the model on the test images: {accuracy * 100} %")
'''

RuntimeError: ignored