# Predicting Password Strength

We will examine what makes a password easy or hard to guess. With that knowledge we can check whether our own passwords are too easy to guess and change them if they are. Each password is given a score that indicates how easy it is to guess.

In [None]:
# Importing modules
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


First we must import our data. We pass `on_bad_lines='skip'` so that any malformed rows that would otherwise raise a parsing error are skipped.

In [None]:
data = pd.read_csv("drive/MyDrive/CSCI405/finalProject/data.csv", on_bad_lines='skip')

In [None]:
data.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [None]:
data.isnull().sum()

password    1
strength    0
dtype: int64

In [None]:
data = data.dropna()

In [None]:
data.isnull().sum()

password    0
strength    0
dtype: int64

In [None]:
data

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
...,...,...
669635,10redtux10,1
669636,infrared1,1
669637,184520socram,1
669638,marken22a,1


# Feature Engineering

In [None]:
# len(data['password']) is the number of rows in the column, not the length of
# each password; .str.len() measures every password individually.
data['length'] = data['password'].str.len()

In [None]:
data.reset_index(drop=True,inplace = True)

In [None]:
data

Unnamed: 0,password,strength,length
0,kzde5577,1,669639
1,kino3434,1,669639
2,visi7k1yr,1,669639
3,megzy123,1,669639
4,lamborghin1,1,669639
...,...,...,...
669634,10redtux10,1,669639
669635,infrared1,1,669639
669636,184520socram,1,669639
669637,marken22a,1,669639


In [None]:
#data['length'][669639] = len(data['password'][669639])

In [None]:
# Vectorised per-password length. This replaces the row-by-row loop, whose
# chained assignment (data['length'][i] = ...) raised SettingWithCopyWarning
# and is not guaranteed to write through to `data`.
data['length'] = data['password'].str.len()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['length'][i] = len(data['password'][i])


In [None]:
np.count_nonzero(data['length']==669639)

0

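No row holds the value 669639 (the total number of rows), so the `length` column now contains the length of each individual password.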

In [None]:
data

Unnamed: 0,password,strength,length
0,kzde5577,1,8
1,kino3434,1,8
2,visi7k1yr,1,9
3,megzy123,1,8
4,lamborghin1,1,11
...,...,...,...
669634,10redtux10,1,10
669635,infrared1,1,9
669636,184520socram,1,12
669637,marken22a,1,9


In [None]:
data['Char&Num'] = 0

In [None]:
data

Unnamed: 0,password,strength,length,Char&Num
0,kzde5577,1,8,0
1,kino3434,1,8,0
2,visi7k1yr,1,9,0
3,megzy123,1,8,0
4,lamborghin1,1,11,0
...,...,...,...,...
669634,10redtux10,1,10,0
669635,infrared1,1,9,0
669636,184520socram,1,12,0
669637,marken22a,1,9,0


In [None]:
# Char&Num is 1 when the password is neither purely alphabetic nor purely
# numeric, and 0 otherwise.
# The previous condition `isalpha()==0 & isdigit()==0` was parsed as
# `isalpha() == (0 & isdigit()) == 0` because `&` binds tighter than `==`,
# so the isdigit() test was ignored and digit-only passwords were flagged as 1.
# Assigning the whole column at once also avoids the chained-assignment
# SettingWithCopyWarning.
data['Char&Num'] = (
    ~data['password'].str.isalpha() & ~data['password'].str.isdigit()
).astype(int)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Char&Num'][i] = 1


In [None]:
data

Unnamed: 0,password,strength,length,Char&Num
0,kzde5577,1,8,1
1,kino3434,1,8,1
2,visi7k1yr,1,9,1
3,megzy123,1,8,1
4,lamborghin1,1,11,1
...,...,...,...,...
669634,10redtux10,1,10,1
669635,infrared1,1,9,1
669636,184520socram,1,12,1
669637,marken22a,1,9,1


In [None]:
np.count_nonzero(data['Char&Num']==0)

330

In [None]:
OnlyNumOrChar = np.where(data['Char&Num'] == 0)
OnlyNumOrChar

(array([   231,   4602,   6096,   6500,  10740,  16564,  18478,  21272,
         23039,  27009,  27563,  28647,  28663,  32580,  33578,  42532,
         42989,  44144,  48133,  48300,  48872,  53268,  55197,  59942,
         64530,  64752,  69708,  70912,  71461,  72762,  75562,  77903,
         79551,  81731,  82029,  82675,  85409,  85642,  86761,  86848,
         88516,  92066,  92258, 103802, 105761, 108761, 108866, 109052,
        109895, 113554, 113925, 114736, 116625, 119100, 120593, 120908,
        121935, 122154, 124254, 125085, 132678, 133187, 134210, 134937,
        135149, 135732, 141932, 146111, 149175, 151032, 153411, 153515,
        160243, 164274, 165044, 166673, 170210, 171889, 177597, 180581,
        181918, 185641, 195059, 196314, 198397, 199559, 200725, 202530,
        207369, 207516, 207902, 209849, 213825, 216324, 219297, 223203,
        223278, 224118, 224753, 227191, 228559, 229030, 231272, 232342,
        233542, 235216, 236606, 236773, 236830, 237651, 240131, 

In [None]:
data.iloc[666327,:]

password    PomarancaÇ
strength             1
length              10
Char&Num             0
Name: 666327, dtype: object

In [None]:
np.count_nonzero(data['strength']==1)

496801

In [None]:
np.count_nonzero(data['strength']==0)

89701

In [None]:
np.count_nonzero(data['strength']==2)

83137

In [None]:
data.iloc[265003,:]

password    wiHiGuqYQUd007
strength                 2
length                  14
Char&Num                 1
Name: 265003, dtype: object

In [None]:
data['CapLetter'] = 0

In [None]:
data

Unnamed: 0,password,strength,length,Char&Num,CapLetter
0,kzde5577,1,8,1,0
1,kino3434,1,8,1,0
2,visi7k1yr,1,9,1,0
3,megzy123,1,8,1,0
4,lamborghin1,1,11,1,0
...,...,...,...,...,...
669634,10redtux10,1,10,1,0
669635,infrared1,1,9,1,0
669636,184520socram,1,12,1,0
669637,marken22a,1,9,1,0


In [None]:
# Code that doesn't work now but who knows:
# countCap = 0
  #countLow = 0
  #for j in range(0,len(data['password'][i])):
  #  print(countCap)
  #  if(data['password'][i][j].isupper()==1):
  #    countCap = countCap +1
  #  if(data['password'][i][j].islower()==1):
  #    countLow = countLow +1
  #  if(countCap != 0 & countLow !=0):
  #    data['CapLetter'][i] = 1

In [None]:
# CapLetter is 1 when the password mixes upper- and lower-case letters, i.e. it
# changes under both .upper() and .lower(); otherwise 0.
# Computed for the whole column at once instead of looping over every row with
# chained assignment (data['CapLetter'][i] = ...), which raised
# SettingWithCopyWarning.
data['CapLetter'] = (
    (data['password'].str.upper() != data['password'])
    & (data['password'].str.lower() != data['password'])
).astype(int)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CapLetter'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CapLetter'][i] = 1


In [None]:
data['CapLetter'].value_counts()

0    592953
1     76686
Name: CapLetter, dtype: int64

In [None]:
# `password` is a text column and cannot be averaged; restrict the aggregation
# to numeric columns explicitly instead of relying on the deprecated default.
data.groupby('CapLetter').mean(numeric_only=True)

  data.groupby('CapLetter').mean()


Unnamed: 0_level_0,strength,length,Char&Num
CapLetter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.860092,9.232894,0.999538
1,1.996205,15.858501,0.99927


Next we create a column that flags passwords containing at least one special (non-alphanumeric) character.

In [None]:
data['SpecialChar'] = 1

In [None]:
data

Unnamed: 0,password,strength,length,Char&Num,CapLetter,SpecialChar
0,kzde5577,1,8,1,0,1
1,kino3434,1,8,1,0,1
2,visi7k1yr,1,9,1,0,1
3,megzy123,1,8,1,0,1
4,lamborghin1,1,11,1,0,1
...,...,...,...,...,...,...
669634,10redtux10,1,10,1,0,1
669635,infrared1,1,9,1,0,1
669636,184520socram,1,12,1,0,1
669637,marken22a,1,9,1,0,1


In [None]:
# SpecialChar is 0 when the password is purely alphanumeric and 1 otherwise.
# Computed for the whole column at once instead of looping over every row with
# chained assignment (data['SpecialChar'][i] = ...), which raised
# SettingWithCopyWarning.
data['SpecialChar'] = (~data['password'].str.isalnum()).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SpecialChar'][i] = 0


In [None]:
data[data['strength'] == 2]

Unnamed: 0,password,strength,length,Char&Num,CapLetter,SpecialChar
5,AVYq1lDE4MgAZfNt,2,16,1,1,0
13,WUt9IZzE0OQ7PkNE,2,16,1,1,0
20,elyass15@ajilent-ci,2,19,1,0,1
22,klara-tershina3H,2,16,1,1,1
41,pHyqueDIyNQ8vmhb,2,16,1,1,0
...,...,...,...,...,...,...
669617,juanpaganini588@gmail.com,2,25,1,0,1
669618,tYAam8zg3Mg2AZ7a,2,16,1,1,0
669621,weslley.06888524,2,16,1,0,1
669626,sakaryal&#305;,2,14,1,0,1


In [None]:
datafinal = data.drop(['password'],axis =1)
datafinal

Unnamed: 0,strength,length,Char&Num,CapLetter,SpecialChar
0,1,8,1,0,0
1,1,8,1,0,0
2,1,9,1,0,0
3,1,8,1,0,0
4,1,11,1,0,0
...,...,...,...,...,...
669634,1,10,1,0,0
669635,1,9,1,0,0
669636,1,12,1,0,0
669637,1,9,1,0,0


In [None]:
dataResponse = datafinal['strength']
dataResponse

0         1
1         1
2         1
3         1
4         1
         ..
669634    1
669635    1
669636    1
669637    1
669638    1
Name: strength, Length: 669639, dtype: int64

In [None]:
# Average each engineered feature per strength class. `password` is text and
# cannot be averaged, so restrict the aggregation to numeric columns.
data.groupby('strength').mean(numeric_only=True)

  data.groupby('strength').mean()


Unnamed: 0_level_0,length,Char&Num,CapLetter,SpecialChar
strength,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6.549604,0.996533,0.000814,0.016466
1,9.618964,0.999962,0.000292,0.011983
2,15.932497,1.0,0.919783,0.136125


We will make a correlation matrix

In [None]:
# Pairwise correlations between the label and the engineered features.
# `password` is a text column and cannot be correlated, so only numeric columns
# are included.
correlation_matrix = data.corr(numeric_only=True)
correlation_matrix

  correlation_matrix = data.corr()


Unnamed: 0,strength,length,Char&Num,CapLetter,SpecialChar
strength,1.0,0.836817,0.040769,0.712247,0.178645
length,0.836817,1.0,0.030071,0.748189,0.231027
Char&Num,0.040769,0.030071,1.0,-0.003848,0.003768
CapLetter,0.712247,0.748189,-0.003848,1.0,0.074912
SpecialChar,0.178645,0.231027,0.003768,0.074912,1.0


As we can see, the features most strongly correlated with password strength are `length` (0.84) and `CapLetter` (0.71), the latter indicating a mix of capital and lowercase letters.

In [None]:
from pandas.plotting import scatter_matrix
attributes = ['strength', 'length', 'Char&Num', 'CapLetter', 'SpecialChar']

# Drawing a 5x5 grid from every row (~670k) did not finish and had to be
# interrupted. A fixed-seed random sample is plotted instead, so the figure
# renders quickly and is reproducible.
scatter_sample = data[attributes].sample(n = min(len(data), 5000), random_state = 42)
scatter_matrix(scatter_sample, figsize = (12,8));

array([[<Axes: xlabel='strength', ylabel='strength'>,
        <Axes: xlabel='length', ylabel='strength'>,
        <Axes: xlabel='Char&Num', ylabel='strength'>,
        <Axes: xlabel='CapLetter', ylabel='strength'>,
        <Axes: xlabel='SpecialChar', ylabel='strength'>],
       [<Axes: xlabel='strength', ylabel='length'>,
        <Axes: xlabel='length', ylabel='length'>,
        <Axes: xlabel='Char&Num', ylabel='length'>,
        <Axes: xlabel='CapLetter', ylabel='length'>,
        <Axes: xlabel='SpecialChar', ylabel='length'>],
       [<Axes: xlabel='strength', ylabel='Char&Num'>,
        <Axes: xlabel='length', ylabel='Char&Num'>,
        <Axes: xlabel='Char&Num', ylabel='Char&Num'>,
        <Axes: xlabel='CapLetter', ylabel='Char&Num'>,
        <Axes: xlabel='SpecialChar', ylabel='Char&Num'>],
       [<Axes: xlabel='strength', ylabel='CapLetter'>,
        <Axes: xlabel='length', ylabel='CapLetter'>,
        <Axes: xlabel='Char&Num', ylabel='CapLetter'>,
        <Axes: xlabel='CapLe

Error in callback <function flush_figures at 0x79f5fff592d0> (for post_execute):


KeyboardInterrupt: ignored

In [None]:
data[data['strength'] == 2]

Unnamed: 0,password,strength,length,Char&Num,CapLetter,SpecialChar
5,AVYq1lDE4MgAZfNt,2,16,1,1,0
13,WUt9IZzE0OQ7PkNE,2,16,1,1,0
20,elyass15@ajilent-ci,2,19,1,0,1
22,klara-tershina3H,2,16,1,1,1
41,pHyqueDIyNQ8vmhb,2,16,1,1,0
...,...,...,...,...,...,...
669617,juanpaganini588@gmail.com,2,25,1,0,1
669618,tYAam8zg3Mg2AZ7a,2,16,1,1,0
669621,weslley.06888524,2,16,1,0,1
669626,sakaryal&#305;,2,14,1,0,1


# Training Models

First let's look at our features set

In [None]:
X = data.drop(['strength', 'password'],axis=1)
X

Unnamed: 0,length,Char&Num,CapLetter,SpecialChar
0,8,1,0,0
1,8,1,0,0
2,9,1,0,0
3,8,1,0,0
4,11,1,0,0
...,...,...,...,...
669634,10,1,0,0
669635,9,1,0,0
669636,12,1,0,0
669637,9,1,0,0


Next let's look at our response set

In [None]:
y = data['strength']
y

0         1
1         1
2         1
3         1
4         1
         ..
669634    1
669635    1
669636    1
669637    1
669638    1
Name: strength, Length: 669639, dtype: int64

Stratified sampling

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split, seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(data, data['strength']):
  # split() yields positional row indices, so select with .iloc; .loc is
  # label-based and only matches when the index is exactly 0..n-1.
  strat_train_set = data.iloc[train_index]
  strat_test_set = data.iloc[test_index]

In [None]:
strat_test_set['strength'].value_counts() / len(strat_test_set)

1    0.741891
0    0.133953
2    0.124156
Name: strength, dtype: float64

In [None]:
data['strength'].value_counts() / len(data)

1    0.741894
0    0.133954
2    0.124152
Name: strength, dtype: float64

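As we can see, the class proportions in the test set closely match those of the full dataset, so the stratified split worked as intended.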

In [None]:
strat_test_set

Unnamed: 0,password,strength,length,Char&Num,CapLetter,SpecialChar
578013,ocarina01,1,9,1,0,0
630921,9171327887mar,1,13,1,0,0
572795,8425262u,1,8,1,0,0
68113,az1titok,1,8,1,0,0
439139,b19851218,1,9,1,0,0
...,...,...,...,...,...,...
199550,merva042011,1,11,1,0,0
600111,azeqsdf1258,1,11,1,0,0
587987,passpwp123,1,10,1,0,0
626041,15freaks2,1,9,1,0,0


In [None]:
# Held-out test set: the label vector first, then the feature matrix
# (everything except the label and the raw password text).
strat_test_sety = strat_test_set['strength']
strat_test_setX = strat_test_set.drop(columns = ['strength', 'password'])


# Training set: same separation into label vector and feature matrix.
strat_train_sety = strat_train_set['strength']
strat_train_setX = strat_train_set.drop(columns = ['strength', 'password'])



In [None]:
strat_train_setX

Unnamed: 0,length,Char&Num,CapLetter,SpecialChar
94842,15,1,1,0
17266,8,1,0,0
248806,15,1,1,0
121655,10,1,0,0
339450,10,1,0,0
...,...,...,...,...
331856,16,1,1,0
527,10,1,0,0
230358,11,1,0,0
411658,10,1,0,0


Let's train the stochastic gradient descent classifier

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter = 100, tol = 1e-3, random_state = 42)

In [None]:
# Fitting the model
sgd_clf.fit(strat_train_setX, strat_train_sety)

In [None]:
sgd_clfPreds = sgd_clf.predict(strat_test_setX)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Score the SGD classifier's predictions against the held-out test labels.
sgd_accuracy = accuracy_score(strat_test_sety, sgd_clfPreds)
sgd_error = 1 - sgd_accuracy  # error rate is the complement of accuracy
sgd_precision = precision_score(strat_test_sety, sgd_clfPreds, average = 'weighted')  # We use a weighted average, as the classes are quite imbalanced
sgd_recall = recall_score(strat_test_sety, sgd_clfPreds, average = 'weighted')

# This is a multiclass problem (strength is 0, 1 or 2), so precision and recall
# are reported as a single number by passing average = 'weighted' above.

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so its metrics are reproducible across runs, in line with the
# seeded split and SGD classifier in this notebook.
rfc = RandomForestClassifier(n_estimators = 15, random_state = 42)

In [None]:
rfc.fit(strat_train_setX.values, strat_train_sety)

In [None]:
# The forest is fitted on a plain array (strat_train_setX.values), so predict on
# a plain array as well to avoid a feature-name mismatch warning.
rfcPreds = rfc.predict(strat_test_setX.values)



In [None]:
rfc_accuracy = accuracy_score(strat_test_sety,rfcPreds)
rfc_error = 1 - rfc_accuracy
rfc_precision = precision_score(strat_test_sety, rfcPreds, average = 'weighted')
rfc_recall = recall_score(strat_test_sety, rfcPreds, average = 'weighted')

# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 3)

In [None]:
knn.fit(strat_train_setX, strat_train_sety)

In [None]:
knnPreds = knn.predict(strat_test_setX)

In [None]:
knn_accuracy = accuracy_score(strat_test_sety, knnPreds)
knn_error = 1 - knn_accuracy
knn_precision = precision_score(strat_test_sety, knnPreds, average = 'weighted')
knn_recall = recall_score(strat_test_sety, knnPreds, average = 'weighted')

# One-Vs-The-Rest Classifier using a support vector machine

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
oneVrest = OneVsRestClassifier(SVC())

In [None]:
oneVrest.fit(strat_train_setX, strat_train_sety)

In [None]:
# Keep the one-vs-rest SVM predictions so the metrics below score this model.
# Previously the predictions were discarded and every metric was computed from
# knnPreds / knn_accuracy, so the reported scores duplicated the KNN ones.
oneVrestPreds = oneVrest.predict(strat_test_setX)

oneVrest_accuracy = accuracy_score(strat_test_sety, oneVrestPreds)
oneVrest_error = 1 - oneVrest_accuracy
oneVrest_precision = precision_score(strat_test_sety, oneVrestPreds, average = 'weighted')
oneVrest_recall = recall_score(strat_test_sety, oneVrestPreds, average = 'weighted')

# Dataframe
Let's make a dataframe comparing the accuracy, error, precision, and recall of all of these

In [None]:
# One dictionary of scores per model, each with the same four keys.
sgdDict = dict(Accuracy = sgd_accuracy,
               Error = sgd_error,
               Precision = sgd_precision,
               Recall = sgd_recall)

rfcDict = dict(Accuracy = rfc_accuracy,
               Error = rfc_error,
               Precision = rfc_precision,
               Recall = rfc_recall)

knnDict = dict(Accuracy = knn_accuracy,
               Error = knn_error,
               Precision = knn_precision,
               Recall = knn_recall)

oneVrestDict = dict(Accuracy = oneVrest_accuracy,
                    Error = oneVrest_error,
                    Precision = oneVrest_precision,
                    Recall = oneVrest_recall)

# Columns are models and rows are metrics; transpose before printing so that
# each model is shown on its own row.
metricScores = pd.DataFrame(dict(sgd = sgdDict,
                                 rfc = rfcDict,
                                 knn = knnDict,
                                 oneVrest = oneVrestDict))

print(metricScores.transpose())


# Predicting passwords

We need a function that transforms a given password into an array of the same features (length, Char&Num, CapLetter, SpecialChar) that the models were trained on.

In [None]:
def transformPassword(pWord):
  """Turn a password string into a single-row feature array.

  The features are, in order: length, Char&Num, CapLetter, SpecialChar.

  Parameters
  ----------
  pWord : str
      The password to featurise.

  Returns
  -------
  numpy.ndarray
      Array of shape (1, 4) holding [length, charNum, capLetter, specialChar].
  """
  length = len(pWord)

  # 1 when the password is neither purely alphabetic nor purely numeric.
  # The earlier `isalpha()==0 & isdigit()==0` was parsed as
  # `isalpha() == (0 & isdigit()) == 0`, which ignored the isdigit() test and
  # flagged digit-only passwords as 1.
  charNum = int(not pWord.isalpha() and not pWord.isdigit())

  # 1 when the password changes under both upper() and lower(), i.e. it mixes
  # upper- and lower-case letters.
  capLetter = int(pWord.upper() != pWord and pWord.lower() != pWord)

  # 1 unless the password is purely alphanumeric.
  specialChar = int(not pWord.isalnum())

  return np.array([length, charNum, capLetter, specialChar]).reshape(1, -1)

In [None]:
helloPred = transformPassword('hell1asf3sdafas#@o')
rfc.predict(helloPred)

# Exporting this function
In order to use this function with AWS Amplify and PyScript, we're going to export it with joblib.

ModuleNotFoundError: No module named 'xgboost'

In [None]:
!pwd

In [None]:
!python --version