<a href="https://colab.research.google.com/github/matteoturnu/ML_Project/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import random
from importlib.util import find_spec
import json
import os
import shutil
import requests

project_folder = "/content/password_strength_classifier"

Creation of useful directories and download of password dictionaries

In [5]:
def download_file(url, dest_folder_name):
  local_filename = url.split('/')[-1]
  path = os.path.join("/{}/{}".format(dest_folder_name, local_filename))
  with requests.get(url, stream=True) as r:
      with open(path, 'wb') as f:
          shutil.copyfileobj(r.raw, f)

  # return local_filename
  return path

if os.path.exists(project_folder) is False:
  dict_dir = project_folder + "/dictionaries/"
  dataset_dir = project_folder + "/dataset/"

  os.mkdir(project_folder)
  os.mkdir(dict_dir)
  os.mkdir(dataset_dir)

  url = "https://github.com/brannondorsey/naive-hashcat/releases/download/data/rockyou.txt"
  f_rockyou = download_file(url, dict_dir)


with open(f_rockyou, errors='replace') as f:
  data_rockyou = [line.split('\n')[0] for line in f.readlines()]

print(data_rockyou[0:4])




['123456', '12345', '123456789', 'password']


In [3]:
# download the dataset

if find_spec("kaggle") is None:
  ! pip install -q kaggle

if os.path.isdir("/root/.kaggle") is False:
  ! mkdir ~/.kaggle
  ! touch "/root/.kaggle/kaggle.json"

  token = {"username":"matteoturnu","key":"79ea644685a3e574038b40e4019b0927"}
  with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
  !chmod 600 /root/.kaggle/kaggle.json

  ! kaggle datasets download -d bhavikbb/password-strength-classifier-dataset -p $dataset_dir

Downloading password-strength-classifier-dataset.zip to /content/password_strength_classifier/dataset
  0% 0.00/5.01M [00:00<?, ?B/s]
100% 5.01M/5.01M [00:00<00:00, 85.8MB/s]


In [3]:
# read file
file_path = '/content/password-strength-classifier-dataset.zip'

pswd_df = pd.read_csv(file_path, on_bad_lines='skip')
print(pswd_df)

            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669640 rows x 2 columns]


In [10]:
def word_split(inputs):
    character=[]
    for i in inputs:
        character.append(i)
    return character

# unique values of strength feature
pswd_df['strength'].unique()

# number of missing values in dataset
pswd_df.isnull().sum()

# remove missing values
pswd_df.dropna(inplace=True)
pswd_df.isnull().sum()

print(pswd_df)

psw_array = np.array(pswd_df)
print(psw_array)

random.shuffle(psw_array)

labels = np.array([p[1] for p in psw_array])
passwords = np.array([p[0] for p in psw_array])

"""vectorizer = TfidfVectorizer(tokenizer=word_split)
X = vectorizer.fit_transform(passwords)

print(vectorizer.get_feature_names_out())"""


            password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669639 rows x 2 columns]
[['kzde5577' 1]
 ['kino3434' 1]
 ['visi7k1yr' 1]
 ...
 ['184520socram' 1]
 ['marken22a' 1]
 ['fxx4pw4g' 1]]


'vectorizer = TfidfVectorizer(tokenizer=word_split)\nX = vectorizer.fit_transform(passwords)\n\nprint(vectorizer.get_feature_names_out())'

In [16]:
lengths = np.array([len(p) for p in passwords])
max_len = np.max(lengths)
min_len = np.min(lengths)


print(lengths)
print(f"Max: {max_len} --> {passwords[lengths == max_len]} \nMin: {min_len} --> {passwords[lengths == min_len]}")

# normalize() from sklearn accept 2D arrays only --> reshape so that we have 1 row (1 "sample")
# and make sklearn compute the remaining number of columns for us
feat_length = normalize(lengths.reshape(1, -1))
print(feat_length)




[ 8  8  9 ...  9  7 14]
Max: 220 --> ['In0LnUoff8wfayJGqzelyDqg4AMl9gBhgl3T2iZeONzh5gPqTyP8IVLsQ960aZwlZcdSjE1XCi8taVT5dWSB3wNJwMqpzmlSIKh21A8TNxpSJ5nu2hULRgjHZF6fubMkwhjPNRryi0BOyas9zlp6JUsNN0RQ4KRma8satN1JwEOAxlhMgJ7OwgRBbwuqCCiwhdylowbq0xpBsXZbhexgZnq4yOUb'] 
Min: 1 --> ['1']
[[0.0009419  0.0009419  0.00105964 ... 0.00105964 0.00082416 0.00164833]]
