In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import math

## Cleaning and Preparing password dataset

**Using Shannon entropy: "The concept of entropy was used by Shannon in information theory for the data communication of computer sciences and is known as Shannon entropy." The source article is linked below. :)**

[Article](https://www.sciencedirect.com/topics/engineering/shannon-entropy#:~:text=The%20concept%20of%20entropy%20was,is%20known%20as%20Shannon%20entropy.)

### Importing CSV of Google Keychain Passwords

In [2]:

# Load the exported passwords; the 'password' column is forced to str so
# digit-only passwords are not parsed as numbers.
# NOTE(review): if these are real credentials, clear the cell outputs before sharing this notebook.
df = pd.read_csv('passwords.csv', encoding='ISO-8859-1', dtype={'password': str})
df

Unnamed: 0,password
0,RXujZkrxUgKhWW2
1,2eXrHHw9S7iU2EN
2,M7nc3G2iwA7gf9M
3,vGA6X7j9ixFj2mH
4,ucX3vskRPzBEKZv
...,...
404,$7ez3DFRc6eJe.E
405,#BG$Qw4GT$zqtj6
406,FmbQd333P3d!G9#
407,*E6!WMT3wM3ejq@


In [4]:
# Select every password that contains a literal dollar sign.
# regex=False treats '$' as a plain character; the previous pattern '\$' relied
# on an invalid string escape, which newer Python versions warn about.
dollar_sign_records = df[df['password'].str.contains('$', regex=False)]
print(dollar_sign_records)

            password
205  dx@hypMHE@Q$DY5
208  #$UWJdrYVQ@idL6
213  Awq$KFA2D4#bd@9
214  7j@WPK$5*URdiZK
215  vX@p$C#UV4$yUXD
..               ...
399  g3BiFa!$2VKUJDf
402  !5EW4h$p!k$ysH8
403  pFt2P*mxKK$ZYgn
404  $7ez3DFRc6eJe.E
405  #BG$Qw4GT$zqtj6

[70 rows x 1 columns]


#### Changing dtypes into string so I can evaluate number, letter, and capitalization frequency

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 409 entries, 0 to 408
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   password  409 non-null    object
dtypes: object(1)
memory usage: 3.3+ KB


In [6]:
# Cast every column to str so the .str accessor checks below operate on plain strings.
df = df.astype(str)

In [7]:
type(df["password"][1])

str

#### Dtypes are string, now going to look at variation of passwords
**Length**  
**Capitalization**  
**Numbers**  
**Letters**  

In [8]:
# Count passwords that consist only of numeric characters.
df["password"].str.isnumeric().sum()

0

In [9]:
# Count passwords whose letters are all uppercase (no lowercase letter present).
df["password"].str.isupper().sum()

0

In [10]:
# Count passwords whose letters are all lowercase (no uppercase letter present).
df["password"].str.islower().sum()

0

In [11]:
# Count passwords that consist only of alphabetic characters.
df["password"].str.isalpha().sum()

0

In [12]:
# Count passwords made up only of letters and digits (204 of the 409 here),
# i.e. roughly half are purely alphanumeric and the rest contain other characters.
df["password"].str.isalnum().sum()

204

In [13]:
# Count passwords written in title case; none are.
df["password"].str.istitle().sum()

0

#### Using a function to check for special characters

In [14]:
def find_special_char(row):
    """Flag whether a password contains at least one punctuation character.

    Args:
        row: The password string to inspect.

    Returns:
        1 if any character of ``row`` is in ``string.punctuation``, otherwise 0.
        (The no-match case previously fell through and returned None, which
        pandas turned into NaN and made the column sum a float.)
    """
    return int(any(char in string.punctuation for char in row))

In [15]:
# find_special_char returns 1 for any password containing a character from
# string.punctuation, so the sum is the number of passwords with a special character.
df["password"].apply(find_special_char).sum()

205.0

In [16]:
# Store each password's character count in a new 'length' column and display it.
df["length"] = df["password"].str.len()
df["length"]

0      15
1      15
2      15
3      15
4      15
       ..
404    15
405    15
406    15
407    15
408    15
Name: length, Length: 409, dtype: int64

In [17]:
# Added a length column to see how long each password is and whether lengths vary;
# every row shown is 15 characters, so there appears to be no variation.
df.head()


Unnamed: 0,password,length
0,RXujZkrxUgKhWW2,15
1,2eXrHHw9S7iU2EN,15
2,M7nc3G2iwA7gf9M,15
3,vGA6X7j9ixFj2mH,15
4,ucX3vskRPzBEKZv,15


In [18]:
#Going to create some functions so that I could measure the frequency of digits, uppercase, and lower case letters
def freq_lowercase(row):
    """Return the fraction of characters in ``row`` that are lowercase letters.

    Args:
        row: The password string to measure.

    Returns:
        A float between 0 and 1. An empty string yields 0.0 instead of raising
        ZeroDivisionError, matching ``freq_special_char``.
    """
    if not row:
        return 0.0
    # Summing booleans counts the matches without building a throwaway list.
    return sum(char.islower() for char in row) / len(row)
def freq_uppercase(row):
    """Return the fraction of characters in ``row`` that are uppercase letters.

    Args:
        row: The password string to measure.

    Returns:
        A float between 0 and 1. An empty string yields 0.0 instead of raising
        ZeroDivisionError, matching ``freq_special_char``.
    """
    if not row:
        return 0.0
    # Summing booleans counts the matches without building a throwaway list.
    return sum(char.isupper() for char in row) / len(row)
def freq_number(row):
    """Return the fraction of characters in ``row`` that are digits.

    Args:
        row: The password string to measure.

    Returns:
        A float between 0 and 1. An empty string yields 0.0 instead of raising
        ZeroDivisionError, matching ``freq_special_char``.
    """
    if not row:
        return 0.0
    # Summing booleans counts the matches without building a throwaway list.
    return sum(char.isdigit() for char in row) / len(row)

In [19]:
def freq_special_char(row):
    """Return the share of characters in ``row`` that are not alphanumeric.

    Any character for which ``str.isalnum()`` is False counts as special.
    An empty ``row`` yields 0.0 rather than dividing by zero.
    """
    if len(row) == 0:
        return 0.0

    non_alnum_count = 0
    for symbol in row:
        if not symbol.isalnum():
            non_alnum_count += 1
    return non_alnum_count / len(row)

In [22]:
# Each new column is paired with the helper that computes it; the columns are
# then filled in the same order as before, rounded to three decimals.
freq_helpers = {
    "lowercase_freq": freq_lowercase,
    "uppercase_freq": freq_uppercase,
    "number_freq": freq_number,
    "special_char_freq": freq_special_char,
}

for freq_column, freq_helper in freq_helpers.items():
    df[freq_column] = np.round(df["password"].apply(freq_helper), 3)

#### Added columns representing frequencies  

**The four frequencies in each row add up to ~1 (up to rounding)**  
**Next, I'll try to create a flag column representing what each password starts with**

In [23]:
df

Unnamed: 0,password,length,lowercase_freq,uppercase_freq,digit_freq,special_char_freq,number_freq
0,RXujZkrxUgKhWW2,15,0.467,0.467,0.067,0.000,0.067
1,2eXrHHw9S7iU2EN,15,0.267,0.467,0.267,0.000,0.267
2,M7nc3G2iwA7gf9M,15,0.400,0.267,0.333,0.000,0.333
3,vGA6X7j9ixFj2mH,15,0.400,0.333,0.267,0.000,0.267
4,ucX3vskRPzBEKZv,15,0.467,0.467,0.067,0.000,0.067
...,...,...,...,...,...,...,...
404,$7ez3DFRc6eJe.E,15,0.333,0.333,0.200,0.133,0.200
405,#BG$Qw4GT$zqtj6,15,0.333,0.333,0.133,0.200,0.133
406,FmbQd333P3d!G9#,15,0.267,0.267,0.333,0.133,0.333
407,*E6!WMT3wM3ejq@,15,0.267,0.333,0.200,0.200,0.200


In [24]:
# Keep the first character of each password in its own column.
df['start_char']= df['password'].str[0]

In [25]:
df

Unnamed: 0,password,length,lowercase_freq,uppercase_freq,digit_freq,special_char_freq,number_freq,start_char
0,RXujZkrxUgKhWW2,15,0.467,0.467,0.067,0.000,0.067,R
1,2eXrHHw9S7iU2EN,15,0.267,0.467,0.267,0.000,0.267,2
2,M7nc3G2iwA7gf9M,15,0.400,0.267,0.333,0.000,0.333,M
3,vGA6X7j9ixFj2mH,15,0.400,0.333,0.267,0.000,0.267,v
4,ucX3vskRPzBEKZv,15,0.467,0.467,0.067,0.000,0.067,u
...,...,...,...,...,...,...,...,...
404,$7ez3DFRc6eJe.E,15,0.333,0.333,0.200,0.133,0.200,$
405,#BG$Qw4GT$zqtj6,15,0.333,0.333,0.133,0.200,0.133,#
406,FmbQd333P3d!G9#,15,0.267,0.267,0.333,0.133,0.333,F
407,*E6!WMT3wM3ejq@,15,0.267,0.333,0.200,0.200,0.200,*


#### I'd like to group passwords by their starting characters as well

In [26]:
def _classify_start_char(char):
    """Label a single starting character by its character class.

    Returns one of 'uppercase', 'lowercase', 'number', 'special' (a
    ``string.punctuation`` character) or 'other' for anything else.
    """
    if not isinstance(char, str) or not char:
        # A missing or empty value has no character class; label it instead of crashing.
        return 'other'
    if char.isupper():
        return 'uppercase'
    if char.islower():
        return 'lowercase'
    if char.isdigit():
        return 'number'
    if char in string.punctuation:
        return 'special'
    # Previously everything that reached this point was labelled 'number',
    # which mislabelled any character that was not an ASCII letter or punctuation.
    return 'other'


# One label per row, in the same order as df['start_char'].
start_type = [_classify_start_char(char) for char in df['start_char']]

In [27]:
# Attach the per-row labels held in start_type as a new column.
df['start_type'] = start_type

In [28]:
df

Unnamed: 0,password,length,lowercase_freq,uppercase_freq,digit_freq,special_char_freq,number_freq,start_char,start_type
0,RXujZkrxUgKhWW2,15,0.467,0.467,0.067,0.000,0.067,R,uppercase
1,2eXrHHw9S7iU2EN,15,0.267,0.467,0.267,0.000,0.267,2,number
2,M7nc3G2iwA7gf9M,15,0.400,0.267,0.333,0.000,0.333,M,uppercase
3,vGA6X7j9ixFj2mH,15,0.400,0.333,0.267,0.000,0.267,v,lowercase
4,ucX3vskRPzBEKZv,15,0.467,0.467,0.067,0.000,0.067,u,lowercase
...,...,...,...,...,...,...,...,...,...
404,$7ez3DFRc6eJe.E,15,0.333,0.333,0.200,0.133,0.200,$,special
405,#BG$Qw4GT$zqtj6,15,0.333,0.333,0.133,0.200,0.133,#,special
406,FmbQd333P3d!G9#,15,0.267,0.267,0.333,0.133,0.333,F,uppercase
407,*E6!WMT3wM3ejq@,15,0.267,0.333,0.200,0.200,0.200,*,special


In [29]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#### Going to use Shannon entropy, as mentioned before

In [30]:

def password_entropy(password):
    """Compute the normalised Shannon entropy of a password's characters.

    The entropy of the character distribution is divided by
    log2(number of distinct characters), so the result lies between 0 and 1
    and reflects how evenly the distinct characters are used.

    Args:
        password: The password string to score.

    Returns:
        The normalised entropy rounded to 3 decimals. Returns 0 for an empty
        password and 0.0 when only one distinct character occurs (log2(1) is
        0, which previously caused a ZeroDivisionError).
    """
    if not password:
        return 0

    distinct_chars = set(password)
    if len(distinct_chars) < 2:
        # A single repeated character has zero entropy and nothing to normalise by.
        return 0.0

    length = len(password)
    entropy = 0.0
    for char in distinct_chars:
        p_char = password.count(char) / length
        entropy -= p_char * math.log2(p_char)

    # Normalize to a range between 0 and 1
    normalized_entropy = entropy / math.log2(len(distinct_chars))

    return round(normalized_entropy, 3)

In [31]:
# Score every password with password_entropy and show the resulting frame.
df['pw_entropy'] = df['password'].apply(password_entropy)
df

Unnamed: 0,password,length,lowercase_freq,uppercase_freq,digit_freq,special_char_freq,number_freq,start_char,start_type,pw_entropy
0,RXujZkrxUgKhWW2,15,0.467,0.467,0.067,0.000,0.067,R,uppercase,0.991
1,2eXrHHw9S7iU2EN,15,0.267,0.467,0.267,0.000,0.267,2,number,0.984
2,M7nc3G2iwA7gf9M,15,0.400,0.267,0.333,0.000,0.333,M,uppercase,0.984
3,vGA6X7j9ixFj2mH,15,0.400,0.333,0.267,0.000,0.267,v,lowercase,0.991
4,ucX3vskRPzBEKZv,15,0.467,0.467,0.067,0.000,0.067,u,lowercase,0.991
...,...,...,...,...,...,...,...,...,...,...
404,$7ez3DFRc6eJe.E,15,0.333,0.333,0.200,0.133,0.200,$,special,0.970
405,#BG$Qw4GT$zqtj6,15,0.333,0.333,0.133,0.200,0.133,#,special,0.984
406,FmbQd333P3d!G9#,15,0.267,0.267,0.333,0.133,0.333,F,uppercase,0.937
407,*E6!WMT3wM3ejq@,15,0.267,0.333,0.200,0.200,0.200,*,special,0.984


In [32]:
# Save the engineered features to CSV without the index column.
# NOTE(review): the exported file still contains the plaintext 'password' column; confirm that is intended before sharing it.
df.to_csv('password_analytics.csv',index = False)