Enriching Authentication data for ML based suspicious login detection

Below we gather authentication information by pulling it from a Pangea Audit log. We then enrich it with Pangea User Intel and IP Intel. Additional Python modules are used to extract features from the included information; specifically, we look up the ASN (autonomous system number) for each IP address.

To save on IP and User Intel lookup fees, we cache the results in JSON files for later reuse.

In [None]:
import ipaddress
import json
import re
import sys
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import pytz
from ipwhois import IPWhois

# Make the sibling "modules" directory (one level above the current working
# directory) importable. This must run before the two imports below, which
# presumably resolve from that directory -- confirm against the repo layout.
current_directory = Path.cwd()
parent_dir = current_directory.parent
modules_dir = str(parent_dir.joinpath('modules'))
sys.path.append(modules_dir)

import pservices as Pangea_Services
# NOTE(review): env_vault_token (used below) is not defined in this file; it
# presumably comes from this star import -- confirm in vaultids.
from vaultids import *

# Shared client used by the enrichment helpers and the log download below.
# The module name `pservices` is rebound here to the client instance.
pservices = Pangea_Services.pservices(url="aws.us.pangea.cloud", VAULT_TOKEN=env_vault_token)

def load_json_cache(cache_path, label):
    """Load a JSON lookup cache from disk, or start with an empty one.

    Args:
        cache_path: Path (str or Path) of the JSON cache file.
        label: Short name used in the "cache file exists" message.

    Returns:
        The decoded JSON content when the file exists, otherwise ``{}``.
    """
    cache_file = Path(cache_path)
    if not cache_file.exists():
        return {}
    print(f"{label} cache file exists")
    with cache_file.open() as file:
        return json.load(file)


# Results of earlier paid lookups, keyed by IP address / email address.
# Both caches are filled by the enrichment helpers and written back to the
# same files once the logs have been processed.
ipcache = load_json_cache('ipcache.json', 'ip')
usercache = load_json_cache('user.json', 'user')

def add_user_intel(email_addr: str):
    """Return the user-intel result for *email_addr*, using the cache first.

    On a cache miss the value is fetched with ``pservices.get_user_intel``
    and stored in ``usercache`` so the lookup is not paid for twice.
    Per the original author's note the result is a 1/0 "breached" flag; the
    exact shape is whatever ``get_user_intel`` returns.
    """
    if email_addr not in usercache:
        # First time we see this address: look it up and remember the answer.
        looked_up = pservices.get_user_intel(email_addr)
        usercache[email_addr] = looked_up
        return looked_up

    print(f"------>> email address {email_addr} found in cache!!!")
    cached_result = usercache[email_addr]
    print(cached_result)
    return cached_result

def add_ip_intel(IP_addr: str):
    """Return enrichment data for *IP_addr* as a ``pd.Series``.

    A cached entry is returned as-is. Otherwise the address is looked up with
    ``pservices.ip_intel``, its ``geo_location`` text is split into country,
    city, latitude and longitude (all ``None`` when the text does not match
    the expected layout), the ASN from ``get_asn`` is added, and the enriched
    record is stored in ``ipcache``.
    """
    if IP_addr in ipcache:
        print(f"IP address {IP_addr} found in cache!!!")
        cached_record = ipcache[IP_addr]
        print(cached_record)
        return pd.Series(cached_record)

    ip_enrich = pservices.ip_intel(IP_addr)

    print(f"IP address NOT {IP_addr} found in cache!!!")
    print(ip_enrich['geo_location'])

    # Expected geo_location layout, e.g.
    #   Country: Republic Of The Philippines, City: manila, Latitude: 14.59, Longitude: 121.0
    #   Country: French Republic, City: roubaix, Latitude: 50.69, Longitude: 3.17
    geo_pattern = r'Country: ([^,]+), City: ([^,]+), Latitude: ([\d.-]+), Longitude: ([\d.-]+)'
    match = re.search(geo_pattern, ip_enrich['geo_location'])
    if match is None:
        country = city = latitude = longitude = None
    else:
        country, city = match.group(1), match.group(2)
        latitude, longitude = float(match.group(3)), float(match.group(4))
        print(f"Country: {country}, City: {city}, Latitude: {latitude}, Longitude: {longitude}")

    # Attach the derived fields in a fixed order so cached records and the
    # returned Series always list them the same way.
    derived_fields = {
        'asn': get_asn(IP_addr),
        'latitude': latitude,
        'longitude': longitude,
        'city': city,
        'country': country,
    }
    for field_name, field_value in derived_fields.items():
        ip_enrich[field_name] = field_value

    # Remember the enriched record for later runs.
    ipcache[IP_addr] = ip_enrich

    return pd.Series(ip_enrich)

def get_asn(ip: str):
    """Return the ASN for *ip*, or 0 when no ASN can be looked up.

    Args:
        ip: IPv4 or IPv6 address in text form.

    Returns:
        The ``'asn'`` entry of the RDAP lookup result for public addresses
        (presumably a string as delivered by ipwhois -- confirm before doing
        arithmetic on it), or ``0`` for private, empty or malformed addresses
        and when the lookup returns nothing.
    """
    try:
        address = ipaddress.ip_address(ip)
    except ValueError:
        # An empty or malformed address has no ASN; report "none" instead of
        # aborting the whole enrichment run on one bad log entry.
        return 0

    # Private addresses are not registered with any RIR, so skip the
    # network lookup entirely.
    if address.is_private:
        return 0

    res = IPWhois(ip).lookup_rdap(depth=1)
    if res is None:
        return 0
    return res['asn']

# Download the authentication events to enrich. The result is bound to
# `auth_logs` rather than `list` so the builtin is not shadowed for the rest
# of the notebook session.
auth_logs = pservices.get_PangeaAuthLogs(maxresults=500)
print (f"Count of list: {len(auth_logs)}")

log_data = []
try:
    for row in auth_logs:
        print(row)
        # external_context holds the event details as a JSON document.
        json_dict = json.loads(row['external_context'])
        username = json_dict['actor']['username']
        user_id = json_dict['actor']['user_id']
        login_ip = json_dict['request']['ip']
        user_agent = json_dict['client']['user_agent']

        print (f"user email - actor {username}")
        #print (f"org id is {row['source']}")
        print (f"Login time {row['timestamp']}")
        print (f"login IP {login_ip}")
        print (f"User Agent {user_agent}")
        print (f"User id {user_id}")

        enriched = add_ip_intel(login_ip)
        breached_account = add_user_intel(username)

        log_data.append({
            'ip': login_ip,
            'useremail': username,
            'userid': user_id,
            'login_date': row['timestamp'],
            'user_agent': user_agent,
            'latitude': enriched['latitude'],
            'longitude': enriched['longitude'],
            'country': enriched['country'],
            'city': enriched['city'],
            'breached': breached_account,
            'org': row['source'],
            'proxy': enriched['proxy'],
            'vpn': enriched['vpn'],
            # NOTE(review): the 'ip' field of the intel record is used as the
            # reputation verdict (later compared with "Safe"/"Malicious") --
            # confirm against pservices.ip_intel.
            'malicious': enriched['ip'],
            'asn': enriched['asn'],
        })

    print(log_data)
finally:
    # Persist the caches even if a row fails part-way through, so lookups
    # that were already paid for are not repeated on the next run.
    with Path('ipcache.json').open('w') as file:
        json.dump(ipcache, file)

    with Path('user.json').open('w') as file:
        json.dump(usercache, file)
    


View a sanitized sample of our data

- We have user email, IP, the useragent, login timestamp, a user id and an org id

In [None]:
# Keep the preview readable: allow up to 30 columns, truncate long cell
# values, and print each row on a single line instead of wrapping.
pd.options.display.max_columns = 30
pd.options.display.max_colwidth = 15
pd.options.display.expand_frame_repr = False

# One row per enriched login event.
df = pd.DataFrame(log_data)

def sanitize(df: pd.DataFrame) -> pd.DataFrame:
    """Return a scrubbed copy of *df* that is safe to show as an example.

    The input frame is left untouched. In the copy:
      * ``ip`` keeps only its first dot-separated part (``1.*.*.*``),
      * ``useremail`` is replaced with a fixed placeholder,
      * ``org`` and ``userid`` are replaced with 8-digit hash pseudonyms,
      * ``asn`` is converted to ``int`` and shifted by a constant.

    Args:
        df: Frame with ``ip``, ``useremail``, ``org``, ``userid`` and ``asn``
            columns.

    Returns:
        A new, sanitized DataFrame.
    """
    def pseudonym(value):
        # NOTE: hash() of a str is salted per interpreter process, so the
        # pseudonyms are stable within one session but differ between runs.
        return hash(value) % (10 ** 8)

    # Work on a copy so the caller's data is not overwritten in place.
    sanitized_df = df.copy()
    sanitized_df['ip'] = sanitized_df['ip'].apply(lambda ip: ip.split('.')[0] + '.*.*.*')
    sanitized_df['useremail'] = "asdf@1234.com"
    sanitized_df['org'] = sanitized_df['org'].apply(pseudonym)
    sanitized_df['userid'] = sanitized_df['userid'].apply(pseudonym)
    sanitized_df['asn'] = sanitized_df['asn'].apply(lambda x: int(x) + 123)
    return sanitized_df

# Scrub a fresh frame built from the enriched logs (for example purposes)
# and show the first few rows of the identifying columns only.
preview_columns = ['useremail', 'ip', 'user_agent', 'login_date', 'userid', 'org']
sanitized_df = sanitize(pd.DataFrame(log_data))
print(sanitized_df.loc[:, preview_columns].head(5))

Simple report on IP and Breached users

- Not super useful because there isn't context or comparison.
- Some users may always use a proxy or VPN.
- Users are flagged as breached fairly regularly; that doesn't mean the authentication is a compromised one.

In [None]:
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', 15)
pd.set_option('display.expand_frame_repr', False)

df = pd.DataFrame(log_data)

# Scrub identifying columns with the shared sanitize() helper instead of a
# hand-copied version of it. The inline copy derived 'org' from the 'userid'
# column, so the org pseudonym was just a duplicate of the user pseudonym.
sanitized_df = sanitize(df)

print(sanitized_df.head(5))
print(sanitized_df.shape)

print("Authentications from Malicious IPs")
print(sanitized_df[sanitized_df['malicious'] != "Safe"])

print("Authentications with Breached users")
# 'breached' is a 1/0 (or boolean) flag, so comparing it with the string
# "True" never matched. Cast to int, as the feature-engineering step also
# does. NOTE(review): confirm the value type returned by the user-intel lookup.
print(sanitized_df[sanitized_df['breached'].astype(int) == 1])

print("Authentications using VPN")
print(sanitized_df[sanitized_df['vpn'] == "Yes"])


Convert the enriched data into something an ML model can use

- numeric values! 
 numeric hashes
 booleans and true/false to 0/1
 label encoder for strings like OS and browser

In [None]:
from sklearn.preprocessing import LabelEncoder
from user_agents import parse

pd.set_option('display.max_colwidth', None)
pd.options.mode.chained_assignment = None  # default='warn'

df = pd.DataFrame(log_data)
encoder = LabelEncoder()

# Time-of-login features.
df['login_date'] = pd.to_datetime(df['login_date'])
df['hour_of_day'] = df['login_date'].dt.hour
df['day_of_week'] = df['login_date'].dt.weekday

# Coarse network feature: sum of the first three octets of the address.
df['ip_octets_sum'] = df['ip'].str.split('.').apply(lambda x: sum(map(int, x[:3])))

# NOTE(review): hash() of a str is salted per interpreter process, so this id
# is only stable within one session; do not persist a model that uses it.
df['user_agent_id'] = df['user_agent'].apply(lambda x: hash(x) % (10 ** 8))
df['lat_long'] = df['latitude'] + df['longitude']

# Flags as 0/1. Values outside the mappings become NaN and show up in the
# null check that follows.
df['breached'] = df['breached'].astype(int)
df['proxy'] = df['proxy'].map({'Yes': 1, 'No': 0})
df['vpn'] = df['vpn'].map({'Yes': 1, 'No': 0})
df['malicious'] = df['malicious'].map({'Malicious': 1, 'Safe': 0})

# Parse every user agent once and derive all four text features from the
# parsed result.
parsed_agents = [parse(agent) for agent in df['user_agent']]
user_agent_features = {
    'user_agent_browser': lambda agent: agent.browser.family,
    'user_agent_browser_version': lambda agent: agent.browser.version_string,
    'user_agent_os': lambda agent: agent.os.family,
    'user_agent_os_version': lambda agent: agent.os.version_string,
}
for feature_name, extract in user_agent_features.items():
    string_column = f'{feature_name}_string'
    # Empty values fall back to "unknown". The previous
    # `... if not "" else "unknown"` test was always true, so the fallback
    # never applied.
    df[string_column] = [extract(agent) or "unknown" for agent in parsed_agents]
    # Label-encode the text so the model receives integers.
    df[feature_name] = encoder.fit_transform(df[string_column])

feature_columns = ['hour_of_day', 'day_of_week', 'ip_octets_sum', 'user_agent_id', 'lat_long', 'latitude', 'longitude', 'breached', 'proxy', 'vpn', 'malicious', 'user_agent_browser', 'user_agent_browser_version', 'user_agent_os', 'user_agent_os_version', 'asn']

print (f"sample df {df[feature_columns].sample(1)}")


Check for nulls - these don't work in ML models

In [None]:
# Flag every row that has at least one missing value in any column, then
# show those rows so they can be fixed or dropped before modelling.
has_missing_value = df.isna().any(axis='columns')
nan_rows = df.loc[has_missing_value]
print(nan_rows)


Build an Isolation Forest machine learning model.

The contamination parameter can be tuned to match your data. For instance, 0.05 assumes that 5% of your data is bad/contaminated.

We build this model on data older than 1 day.

In [None]:
from sklearn.ensemble import IsolationForest

# Fit only on logins older than one day; newer logins are the ones scored
# for anomalies afterwards.
model = IsolationForest(
    n_estimators=1000,
    contamination=0.01,
    random_state=0,
    max_samples='auto',
)
training_cutoff = datetime.now(pytz.utc) - timedelta(days=1)
training_rows = df[df['login_date'] < training_cutoff]
model.fit(training_rows[feature_columns])

Run outlier detection on logins from the last day (the records newer than the one-day cutoff, which the model was not fitted on).
The shap module helps explain why a particular record was flagged as an outlier. If no outliers are found in your data, experiment with the contamination parameter above.

In [None]:
import shap

explainer = shap.Explainer(model)

# Score only the logins from the last day.
detection_cutoff = datetime.now(pytz.utc) - timedelta(days=1)
recent_logins = df[df['login_date'] > detection_cutoff]

# Predict once on the typed frame. Rebuilding each row through iterrows()
# turns every column into object dtype before it reaches the model and SHAP.
# The guard avoids calling predict() on an empty frame.
if not recent_logins.empty:
    predictions = model.predict(recent_logins[feature_columns])
    for position, prediction in enumerate(predictions):
        # IsolationForest labels outliers as -1 and inliers as 1.
        if prediction != -1:
            continue
        # Keep the flagged login as a one-row frame for SHAP and printing.
        this_df = recent_logins.iloc[[position]]
        print("Anomaly detected !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        shap_values = explainer(this_df[feature_columns])
        shap.plots.waterfall(shap_values[0])
        print(f"source data {this_df.transpose()}")
