<a href="https://colab.research.google.com/github/matteoturnu/ML_Project/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import random
from importlib.util import find_spec
import json
import os
import shutil
import requests
import sys
project_folder = "/content/password_strength_classifier"

Creation of useful directories and download of password dictionaries

In [2]:
def download_file(url, dest_folder_name):
  local_filename = url.split('/')[-1]
  path = os.path.join("/{}/{}".format(dest_folder_name, local_filename))
  """with requests.get(url, stream=True) as r:
      with open(path, 'wb') as f:
          shutil.copyfileobj(r.raw, f)"""

  with open(path, 'wb') as f:
    f.write(requests.get(url, stream=True).content)

  # return local_filename
  return path

def read_file(filepath):
  with open(filepath, errors='replace', encoding='utf-8') as f:
    data = {line.split('\n')[0] for line in f.readlines()}
  return data


if os.path.exists(project_folder) is False:
  dict_dir = project_folder + "/dictionaries/"
  dataset_dir = project_folder + "/dataset/"

  os.mkdir(project_folder)
  os.mkdir(dataset_dir)

  os.mkdir(dict_dir)

  f_rockyou = download_file(
    "https://github.com/brannondorsey/naive-hashcat/releases/download/data/rockyou.txt",
    dict_dir)

  f_jtr = download_file(
      "https://raw.githubusercontent.com/danielmiessler/SecLists/master/Passwords/Software/john-the-ripper.txt",
      dict_dir)


data_rockyou = read_file(f_rockyou)
data_jtr = read_file(f_jtr)

print("\nRock You")
import itertools
print([val for i, val in enumerate(itertools.islice(data_rockyou, 5))])
print("\nJohn The Ripper")
print([val for i, val in enumerate(itertools.islice(data_jtr, 5))])



Rock You
['', 'katnica', '9285515', 'netving', '77sunsetstrip']

John The Ripper
['sparky', 'fiction', 'Master', 'autumn', 'smiles']


In [3]:
# download the dataset

if find_spec("kaggle") is None:
  ! pip install -q kaggle

if os.path.isdir("/root/.kaggle") is False:
  ! mkdir ~/.kaggle
  ! touch "/root/.kaggle/kaggle.json"

  token = {"username":"matteoturnu","key":"79ea644685a3e574038b40e4019b0927"}
  with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
  !chmod 600 /root/.kaggle/kaggle.json

  ! kaggle datasets download -d bhavikbb/password-strength-classifier-dataset -p $dataset_dir

Downloading password-strength-classifier-dataset.zip to /content/password_strength_classifier/dataset
  0% 0.00/5.01M [00:00<?, ?B/s]
100% 5.01M/5.01M [00:00<00:00, 52.8MB/s]


In [4]:
# read file

file_path = os.path.join(dataset_dir, "password-strength-classifier-dataset.zip")

pswd_df = pd.read_csv(file_path, on_bad_lines='skip')
print(pswd_df)

            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669640 rows x 2 columns]


In [5]:
def word_split(inputs):
    character=[]
    for i in inputs:
        character.append(i)
    return character

# unique values of strength feature
pswd_df['strength'].unique()

# number of missing values in dataset
pswd_df.isnull().sum()

# remove missing values
pswd_df.dropna(inplace=True)
pswd_df.isnull().sum()

print(pswd_df)

psw_array = np.array(pswd_df)
print(psw_array)

# ??? PROBLEM: it is shown that if shuffled then
# there are several duplicates of a password
#random.shuffle(psw_array)

labels = np.array([p[1] for p in psw_array])
passwords = np.array([p[0] for p in psw_array])

print("Len of passwords: ", len(passwords))
print("Len of UNIQUE passwords: ", len(np.unique(passwords)))


"""vectorizer = TfidfVectorizer(tokenizer=word_split)
X = vectorizer.fit_transform(passwords)

print(vectorizer.get_feature_names_out())"""


            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669639 rows x 2 columns]
[['kzde5577' 1]
 ['kino3434' 1]
 ['visi7k1yr' 1]
 ...
 ['184520socram' 1]
 ['marken22a' 1]
 ['fxx4pw4g' 1]]
Len of passwords:  669639
Len of UNIQUE passwords:  669639


'vectorizer = TfidfVectorizer(tokenizer=word_split)\nX = vectorizer.fit_transform(passwords)\n\nprint(vectorizer.get_feature_names_out())'

In [6]:
# LENGTH feature
lengths = np.array([len(p) for p in passwords])
max_len = np.max(lengths)
min_len = np.min(lengths)


print(lengths)
print(f"Max: {max_len} --> {passwords[lengths == max_len]} \nMin: {min_len} --> {passwords[lengths == min_len]}")

# normalize() from sklearn accept 2D arrays only --> reshape so that we have 1 row (1 "sample")
# and make sklearn compute the remaining number of columns for us
feat_length = normalize(lengths.reshape(1, -1))
print(feat_length)


# ROCKYOU feature

# numpy array of 1 and 0 (1 if found, otherwise is 0)
# int() used to convert boolean into number
feat_rockyou = np.array([int(p in data_rockyou) for p in passwords])
print(feat_rockyou)

# passwords found in rockyou file
print(passwords[feat_rockyou == 1])


# JTR feature
feat_jtr = np.array([int(p in data_jtr) for p in passwords])
print(feat_jtr)
print(passwords[feat_jtr == 1])




[ 8  8  9 ... 12  9  8]
Max: 220 --> ['In0LnUoff8wfayJGqzelyDqg4AMl9gBhgl3T2iZeONzh5gPqTyP8IVLsQ960aZwlZcdSjE1XCi8taVT5dWSB3wNJwMqpzmlSIKh21A8TNxpSJ5nu2hULRgjHZF6fubMkwhjPNRryi0BOyas9zlp6JUsNN0RQ4KRma8satN1JwEOAxlhMgJ7OwgRBbwuqCCiwhdylowbq0xpBsXZbhexgZnq4yOUb'] 
Min: 1 --> ['M' '9' '1']
[[0.00094165 0.00094165 0.00105936 ... 0.00141248 0.00105936 0.00094165]]
[0 0 0 ... 0 0 0]
['megzy123' 'intel1' 'schalke04' ... 'jenny1989' 'skyline123' 'hattrick9']
[0 0 0 ... 0 0 0]
['martin1' 'harley1' 'star69' 'dagger1' 'c00per' 'family1' 'michael1'
 'ashley1' 'matti1' 'rocket1' 'florida1' 'scott1' 'front242' 'teddy1'
 'viper1' 'amanda1' 'phoenix1' 'daniel1' 'rasta1' 'david1' 'rocky1'
 'hello123' 'randy1' 'justin1' 'seven7' 'saturn5' 'vampire' 'lucky1'
 'master1' 'babylon5' 'xxx123' 'mickey1' 'montana3' '1234qwer' 'happy123'
 'cindy1' 'terry1' 'chester1' 'steph1' 'roger1' 'carol1' 'Golden' '654321'
 'trustno1' 'pussy1' 'parola' 'simba1' 'peter1' 'william1' 'billy1'
 'rambo1' 'Lindsay' 'james1' 'apo

Password structure feature

In [7]:
# Define macros
numbers = 0
lower_case = 1
upper_case = 2
special_char = 3

# Counts numbers, lowercases, uppercases and other characters in a password
def calculate_password_structure(passwords):
  passwords_structure = []
  for password in passwords:
    counts = [0, 0, 0, 0]
    for character in password:
      if character.isnumeric():
        counts[numbers] += 1
      elif character.islower():
        counts[lower_case] += 1
      elif character.isupper():
        counts[upper_case] += 1
      else:
        counts[special_char] += 1
    passwords_structure.append(counts)
  return passwords_structure


passwords_structure = calculate_password_structure(passwords)
for i in range(20):
  print(passwords[i],passwords_structure[i])

kzde5577 [4, 4, 0, 0]
kino3434 [4, 4, 0, 0]
visi7k1yr [2, 7, 0, 0]
megzy123 [3, 5, 0, 0]
lamborghin1 [1, 10, 0, 0]
AVYq1lDE4MgAZfNt [2, 5, 9, 0]
u6c8vhow [2, 6, 0, 0]
v1118714 [7, 1, 0, 0]
universe2908 [4, 8, 0, 0]
as326159 [6, 2, 0, 0]
asv5o9yu [2, 6, 0, 0]
612035180tok [9, 3, 0, 0]
jytifok873 [3, 7, 0, 0]
WUt9IZzE0OQ7PkNE [3, 3, 10, 0]
jerusalem393 [3, 9, 0, 0]
g067057895 [9, 1, 0, 0]
52558000aaa [8, 3, 0, 0]
idofo673 [3, 5, 0, 0]
6975038lp [7, 2, 0, 0]
sbl571017 [6, 3, 0, 0]
