For my project I chose the '10000 Most Common Passwords' dataset from SecLists. The dataset includes each password, its length, and the number of characters, digits, uppercase/lowercase letters, special characters, vowels, and syllables.

In [53]:
import numpy as np 
import pandas as pd
import graphviz
import seaborn as sn
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
from sklearn import cluster, tree
from scipy.cluster import hierarchy
from mlxtend.preprocessing import TransactionEncoder
from IPython.display import Image
%matplotlib inline

In [106]:
# Load the password dataset, give the columns readable names,
# report its dimensions, and preview the first 5 rows.
column_names = ['password','length','num_chars','num_digits','num_upper','num_lower','num_special','num_vowels','num_syllables']
data = pd.read_csv("../input/10000-most-common-passwords/common_passwords.csv")
data.columns = column_names
n_rows, n_cols = data.shape
print('Number of instances = %d' % (n_rows))
print('Number of attributes = %d' % (n_cols))
data.head()

In [93]:
# Print mean, standard deviation, minimum and maximum for every numeric column.
for name in data.columns:
    values = data[name]
    if not is_numeric_dtype(values):
        continue  # text columns (e.g. the password itself) have no numeric summary
    print('%s:' % (name))
    summary = (
        ('\t Mean = %.3f', values.mean()),
        ('\t Standard deviation = %.2f', values.std()),
        ('\t Minimum = %.2f', values.min()),
        ('\t Maximum = %.2f', values.max()),
    )
    for template, statistic in summary:
        print(template % statistic)

Num_chars, num_digits, and num_lower have the largest standard deviations because their values vary the most from password to password. The means for num_special and num_upper are extremely low, which means that most passwords containing special characters or uppercase letters will probably be outliers.

In [35]:
#this is just neat
print('number of passwords with special characters : ')
print(data.shape[0] - data['num_special'].value_counts()[0])
print('number of passwords with uppercase letters : ')
print(data.shape[0] - data['num_upper'].value_counts()[0])

In [36]:
# Correlation heatmap of the numeric attributes.
# The text 'password' column is excluded explicitly: DataFrame.corr() no longer
# drops non-numeric columns silently on pandas >= 2.0 and raises instead.
corrMatrix = data.select_dtypes(include='number').corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

The highest correlation is between num_lower and num_chars. One of the main trends in the data is that most passwords are either entirely characters or entirely digits. This causes an extremely high negative correlation between the number of characters and the number of digits (-.89), and it's almost the same between lowercase characters and digits (-.88). However, there is barely any correlation between the number of uppercase letters and the number of characters (.04).
There is also an extremely high correlation between the number of characters and the number of lowercase characters (.99) because most of the passwords that contain characters are exclusively lowercase.

In [37]:
# Pairwise scatter plots of the first four numeric attributes
# (4 attributes -> 6 pairs, laid out on a 3x2 grid).
# Columns are picked by dtype rather than by position so the text 'password'
# column (position 0) is never used as a plot axis.
numeric_cols = data.select_dtypes(include='number').columns[:4]
fig, axes = plt.subplots(3, 2, figsize=(12,12))
index = 0
for i in range(len(numeric_cols) - 1):
    for j in range(i + 1, len(numeric_cols)):
        # Fill the grid row by row: index 0,1 -> row 0; 2,3 -> row 1; 4,5 -> row 2.
        ax = axes[index // 2][index % 2]
        ax.scatter(data[numeric_cols[i]], data[numeric_cols[j]], color='red')
        ax.set_xlabel(numeric_cols[i])
        ax.set_ylabel(numeric_cols[j])
        index = index + 1

# Preprocessing

First I checked whether there were any missing values, and luckily there weren't any. Then I removed the duplicate data (only two rows).

In [31]:
# Treat '?' placeholders as missing, then report the number of missing
# values in each column (all zero for this dataset).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)
print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))
print('Number of missing values:')
for col in data.columns:
    print('\t%s: %d' % (col, data[col].isna().sum()))

In [33]:
# Remove exact duplicate rows and report the row count before and after.
rows_before = len(data)
print('Number of rows before discarding duplicates = %d' % (rows_before))
data = data.drop_duplicates()
rows_after = len(data)
print('Number of rows after discarding duplicates = %d' % (rows_after))

In [38]:
# Keep only the numeric attributes (everything except the password text)
# and draw one box plot per attribute to make the outliers visible.
data2 = data.drop(columns=['password'])
data2.boxplot(figsize=(20,3))

There are quite a few outliers in the data. There's no box for num_upper and num_special because the minimum, 1st quartile, median, and 3rd quartile are all 0. Any time there is actually a password with an uppercase letter or a special character, it is automatically an outlier. Since all of the columns have outliers, the z-score will be computed for each attribute, and the rows with a z-score above 3 or below -3 in any attribute will be removed.

In [39]:
# Standardise every numeric attribute to z-scores (subtract the column mean,
# divide by the column standard deviation) and show the first 10 rows.
column_means = data2.mean()
column_stds = data2.std()
Z = data2.sub(column_means, axis='columns').div(column_stds, axis='columns')
Z.head(10)

In [40]:
# Discard outliers: keep only the rows whose z-score lies in (-3, 3] for
# every attribute. Using .all(axis=1) instead of comparing a row sum with a
# hard-coded 8 keeps the filter correct if the number of columns changes.
print('Number of rows before discarding outliers = %d' % (Z.shape[0]))
within_bounds = ((Z > -3) & (Z <= 3)).all(axis=1)
Z2 = Z.loc[within_bounds, :]
print('Number of rows after discarding outliers = %d' % (Z2.shape[0]))

# Clustering

For this dataset, aggregation didn't make sense because all of the passwords are unique. However, 10,000 tuples is quite a lot, so I took a sample of 50 tuples to make it more manageable.

In [41]:
# Draw a reproducible random sample of 0.5% of the rows and display it.
sampling_options = {'frac': 0.005, 'random_state': 1}
sample = data.sample(**sampling_options)
sample

For the clustering, I only wanted to look at the password, length, and the number of digits, so I only selected those columns.

In [42]:
# Keep only the columns used for clustering and draw 50 random rows.
# random_state is fixed (as in the neighbouring cells) so that the clusters
# discussed in the text are the same on every run.
sample = data[['password','length','num_digits']].sample(n=50, random_state=1)
sample

In [43]:
# Cluster the sampled passwords into two groups on (length, num_digits)
# and list the cluster assigned to each password.
clustering_data = sample.drop(columns='password')
k_means = cluster.KMeans(n_clusters=2, max_iter=50, random_state=1).fit(clustering_data)
labels = k_means.labels_
pd.DataFrame({'Cluster ID': labels}, index=sample['password'])

The clustering was pretty simple because almost all of the passwords followed the trend of being either all characters or all numbers, so those are the two groups they were clustered into: cluster 0 is the cluster with majority characters, and cluster 1 is the cluster with majority digits.

In [44]:
# Show the coordinates of each cluster centre as a small table.
centroids = k_means.cluster_centers_
centroid_table = pd.DataFrame(data=centroids, columns=['length','num_digits'])
centroid_table

In [46]:
# Elbow plot: fit k-means for k = 1..6 and plot the sum of squared errors
# (inertia) against the number of clusters.
# Each model is seeded so the curve is reproducible, and the models are built
# inside the comprehension so the fitted 2-cluster `k_means` is not overwritten.
numClusters = [1,2,3,4,5,6]
SSE = [
    cluster.KMeans(n_clusters=k, random_state=1).fit(clustering_data).inertia_
    for k in numClusters
]

plt.plot(numClusters, SSE)
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')

This graph shows that 2 is the most effective number of clusters for this dataset.

In [47]:
# Hierarchical clustering (single linkage) on 15 of the sampled passwords,
# drawn as a dendrogram labelled with the passwords themselves.
# random_state is fixed so the dendrogram discussed in the text is reproducible.
smaller_sample = sample.sample(n=15, random_state=1)
passwords = smaller_sample['password']
data_matrix = smaller_sample.drop(['password'], axis=1)
linkage = hierarchy.linkage(data_matrix.values, 'single')
dn = hierarchy.dendrogram(linkage, labels=passwords.tolist(), orientation='right')

This shows that there's a lot of similarity among the passwords with only digits and among the passwords with only characters, and the mixed passwords are somewhere in between.

In [48]:
# Same 15 passwords, clustered with complete linkage instead of single linkage.
# The linkage matrix gets its own name so it does not overwrite `Z`, which
# holds the z-score table from the preprocessing section.
complete_linkage = hierarchy.linkage(data_matrix.values, 'complete')
dn = hierarchy.dendrogram(complete_linkage, labels=passwords.tolist(), orientation='right')

This shows basically the same thing as the graph before it, but it separates the mixed passwords into a third group instead of only having connections between the all-digit and all-character passwords. It also shows that the mixed passwords are closer to the digits.

# Classification

In [107]:
# Binarise the count attributes: 1 if the password contains at least one
# occurrence of that kind of character, 0 otherwise.
# A single `> 0` comparison replaces seven copies of replace([1..16], 1),
# which left any count above 16 unchanged. num_syllables is left as a count.
count_columns = ['length','num_chars','num_digits','num_upper','num_lower','num_special','num_vowels']
data[count_columns] = (data[count_columns] > 0).astype(int)

# Shorter names now that the columns are presence flags rather than counts.
data = data.rename(columns={
    'num_chars': 'chars',
    'num_digits': 'digits',
    'num_upper': 'upper',
    'num_lower': 'lower',
    'num_special': 'special',
    'num_vowels': 'vowels',
})
data.head()

Instead of the number of occurrences, each attribute now holds 1 (true) if it occurs in the password and remains 0 if it doesn't. I also changed the titles of the columns to make them easier to understand. It would have been easier to do this with a transaction encoder; however, every time I tried to do it, it messed up the data, so here's a failed attempt at that.


In [65]:
# One-hot encode the attribute flags with TransactionEncoder.
# TransactionEncoder expects one list of items per row. Passing the DataFrame
# itself makes it iterate over the column labels (and then over the characters
# of each label), which is what garbled the output before. Here every row is
# first turned into the list of attribute names that are present (non-zero).
flag_columns = [col for col in data.columns if col not in ('password', 'num_syllables')]
transactions = [
    [col for col, present in zip(flag_columns, row) if present]
    for row in data[flag_columns].to_numpy()
]
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
df

In [102]:
# Fit decision trees that predict the password from its attribute flags and
# render the unrestricted tree with graphviz.
Y = data['password']
X = data.drop(['password','num_syllables','upper','lower','vowels'], axis=1)

# Depth-limited entropy tree; it is fitted here but not the one drawn below.
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3)
clf = clf.fit(X, Y)

dtree = tree.DecisionTreeClassifier()
dtree = dtree.fit(X, Y)

# The DOT source gets its own name so the `data` DataFrame is not overwritten,
# and feature names come from X so they always match the training columns.
dot_data = tree.export_graphviz(dtree, feature_names=list(X.columns), out_file=None)
graph = graphviz.Source(dot_data)
graph