# Password Strength Model

Content:

Explanation:

## Import Libraries 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle

## Import Dataset

In [2]:
data = pd.read_csv('password_strength.csv',',',error_bad_lines=False)

b'Skipping line 2810: expected 2 fields, saw 5\nSkipping line 4641: expected 2 fields, saw 5\nSkipping line 7171: expected 2 fields, saw 5\nSkipping line 11220: expected 2 fields, saw 5\nSkipping line 13809: expected 2 fields, saw 5\nSkipping line 14132: expected 2 fields, saw 5\nSkipping line 14293: expected 2 fields, saw 5\nSkipping line 14865: expected 2 fields, saw 5\nSkipping line 17419: expected 2 fields, saw 5\nSkipping line 22801: expected 2 fields, saw 5\nSkipping line 25001: expected 2 fields, saw 5\nSkipping line 26603: expected 2 fields, saw 5\nSkipping line 26742: expected 2 fields, saw 5\nSkipping line 29702: expected 2 fields, saw 5\nSkipping line 32767: expected 2 fields, saw 5\nSkipping line 32878: expected 2 fields, saw 5\nSkipping line 35643: expected 2 fields, saw 5\nSkipping line 36550: expected 2 fields, saw 5\nSkipping line 38732: expected 2 fields, saw 5\nSkipping line 40567: expected 2 fields, saw 5\nSkipping line 40576: expected 2 fields, saw 5\nSkipping line 

b'Skipping line 525174: expected 2 fields, saw 5\nSkipping line 526251: expected 2 fields, saw 5\nSkipping line 529611: expected 2 fields, saw 5\nSkipping line 531398: expected 2 fields, saw 5\nSkipping line 534146: expected 2 fields, saw 5\nSkipping line 544954: expected 2 fields, saw 5\nSkipping line 553002: expected 2 fields, saw 5\nSkipping line 553883: expected 2 fields, saw 5\nSkipping line 553887: expected 2 fields, saw 5\nSkipping line 553915: expected 2 fields, saw 5\nSkipping line 554172: expected 2 fields, saw 5\nSkipping line 563534: expected 2 fields, saw 5\nSkipping line 565191: expected 2 fields, saw 5\nSkipping line 574108: expected 2 fields, saw 5\nSkipping line 574412: expected 2 fields, saw 5\nSkipping line 575985: expected 2 fields, saw 5\nSkipping line 580091: expected 2 fields, saw 5\nSkipping line 582682: expected 2 fields, saw 5\nSkipping line 585885: expected 2 fields, saw 5\nSkipping line 590171: expected 2 fields, saw 5\nSkipping line 591924: expected 2 field

In [3]:
# Display dataset
data.head(3)

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1


## Data Pre-processing

In [28]:
# Check for null values and drop them
data[data['password'].isnull()]
data.dropna(inplace=True)

# Shuffling the dataset and reset the jumblled indices to new order
data = shuffle(data)
data.reset_index(drop=True,inplace=True)

# Breaking the dataset into features and response
X = data['password'] # features
y = data['strength'] # response

In [29]:
# Tokenize the words
def words_to_char(inputs):
    characters=[]
    for i in inputs:
        characters.append(i)
    return characters

# Convert characters to vectors
vectorizer=TfidfVectorizer(tokenizer=words_to_char)
X=vectorizer.fit_transform(X)

# Exported trained vectorizer
filename = 'tfidfvectorizer.pickle'
pickle.dump(vectorizer, open(filename, 'wb'))

X.todense()
vectorizer.vocabulary_

{'i': 64,
 '3': 37,
 'l': 67,
 'o': 70,
 'c': 58,
 'y': 80,
 'd': 59,
 'm': 68,
 'w': 78,
 'q': 72,
 'n': 69,
 'x': 79,
 'z': 81,
 'f': 61,
 'e': 60,
 'k': 66,
 '1': 35,
 '2': 36,
 's': 74,
 '0': 34,
 '5': 39,
 '9': 43,
 'a': 56,
 't': 75,
 'u': 76,
 'v': 77,
 '6': 40,
 'r': 73,
 '8': 42,
 'h': 63,
 '7': 41,
 'b': 57,
 'g': 62,
 '4': 38,
 'j': 65,
 'p': 71,
 '!': 21,
 '.': 32,
 '@': 49,
 '-': 31,
 '$': 24,
 ' ': 20,
 '+': 30,
 '_': 54,
 '(': 27,
 ')': 28,
 '/': 33,
 '#': 23,
 '>': 47,
 '*': 29,
 '"': 22,
 '&': 26,
 '=': 46,
 'ä': 119,
 '°': 98,
 '?': 48,
 ']': 52,
 '[': 50,
 'þ': 142,
 '|': 83,
 '^': 53,
 ';': 44,
 'ú': 138,
 '±': 99,
 '<': 45,
 '%': 25,
 '{': 82,
 '~': 85,
 'ö': 135,
 '}': 84,
 '´': 102,
 '\\': 51,
 'â': 117,
 'í': 126,
 '—': 146,
 'ý': 141,
 'å': 120,
 'ÿ': 143,
 'õ': 134,
 'º': 107,
 '¨': 95,
 'é': 124,
 '\x06': 4,
 'ð': 129,
 '¤': 92,
 'á': 116,
 '\x17': 13,
 'à': 115,
 'ß': 114,
 '\x05': 3,
 '\x1b': 16,
 '\x1c': 17,
 '\x08': 5,
 'ê': 125,
 'î': 127,
 '÷': 136,
 '‚

In [6]:
feature_names = vectorizer.get_feature_names()
 
# Get TFIDF vector for first document
first_document_vector=X[0]
 
# Print the scores
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
_,0.516616
y,0.411102
a,0.398603
o,0.327215
v,0.238949
...,...
8,0.000000
7,0.000000
6,0.000000
5,0.000000


## Build Model

In [9]:
# Split processed datset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Model creation
log_class=LogisticRegression(penalty='l2' ,max_iter = 1000 ,multi_class='ovr')

# Model Compilation
log_class.fit(X_train,y_train)

# Model Test score
print('Accuracy: {:.3f}'.format(log_class.score(X_test,y_test)*100))

Accuracy: 81.235


## Export Model 

In [8]:
# To export model
filename = 'finalized_model.sav'
pickle.dump(log_class, open(filename, 'wb'))