In [None]:
import ast
import numpy as np
from datetime import datetime, timedelta
import glob
import gzip
import ipaddress
import os
from os import listdir
from os.path import isdir, isfile, join
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
from pandarallel import pandarallel
from joblib import Parallel, delayed
from OpenSSL import crypto
from local_utils import *
import sys
import logging

# so = open("shodan.log", 'w', 10)
# sys.stdout.echo = so
# sys.stderr.echo = so

# get_ipython().log.handlers[0].stream = so
# get_ipython().log.setLevel(logging.INFO)


# Initialise pandarallel with 60 workers and a progress bar; this is what the
# `parallel_apply` calls further down rely on.
pandarallel.initialize(progress_bar=True, nb_workers=60)

In [None]:
# org_ranges_file = 'csv/all_ranges_over_time_gaps_filled.csv'

# def process_cols(row):
#     row['date'] = pd.to_datetime(row['date'], format='%Y-%m-%d')
#     row['first'] = pd.to_datetime(row['first'], format='%Y-%m-%d')
#     row['last'] = pd.to_datetime(row['last'], format='%Y-%m-%d')
#     row['ripe_names'] = ast.literal_eval(row['ripe_names'])
#     row['Ranges'] = [ipaddress.IPv4Network(x) for x in ast.literal_eval(row['Ranges'])]
# #     row['ripe_ranges'] = [ipaddress.IPv4Network(x) for x in ast.literal_eval(row['ripe_ranges'])]
#     return row
    

# df = pd.read_csv(org_ranges_file)
# df = df.parallel_apply(process_cols, axis=1)
# df

In [None]:
# Load the exploded ranges table; `ripe_names` is read with pandas' string dtype.
exploded = pd.read_csv('csv/all_ranges_exploded.csv', dtype={'ripe_names': 'string'})

In [None]:
def process_cols(row):
    """Convert the raw CSV text of one row into typed values.

    `date` is parsed as a `%Y-%m-%d` timestamp and `Ranges` becomes an
    `ipaddress.IPv4Network`. The row is modified in place and returned.
    """
    converters = (
        ('date', lambda value: pd.to_datetime(value, format='%Y-%m-%d')),
        ('Ranges', ipaddress.IPv4Network),
    )
    # Columns are converted in this order, one at a time.
    for column, convert in converters:
        row[column] = convert(row[column])
    return row

# Run `process_cols` over every row in parallel (pandarallel), then display the result.
exploded = exploded.parallel_apply(process_cols, axis=1)
exploded

In [None]:
# Sanity check: show rows whose `ripe_names` is the empty string.
exploded[exploded['ripe_names'] == '']

In [None]:
# CVE identifiers counted as a TLS vulnerability: the first three are FREAK,
# the last one is Heartbleed.
_TLS_VULNS = {
    'CVE-2015-0204',
    'CVE-2015-1637',
    'CVE-2015-1067',
    'CVE-2014-0160',
}

# Signature algorithms considered weak (taken from the Risky Business paper).
_WEAK_SIGS = {'md2', 'md5', 'sha1'}

# Common DH primes from https://testssl.sh/etc/common-primes.txt;
# every whitespace-separated token of the file becomes one set member.
with open('common_primes.txt', 'r') as f:
    _DH_COMMON_PRIMES = set(f.read().split())

def _parse_cert_time(value):
    """Parse a certificate time string into a naive `datetime`.

    Accepts the `YYYYMMDDHHMMSSZ` form and the form that carries a numeric
    UTC offset; in the latter case the offset is dropped, not applied.
    """
    try:
        return datetime.strptime(value, '%Y%m%d%H%M%SZ')
    except ValueError:
        return datetime.strptime(value, '%Y%m%d%H%M%S%z').replace(tzinfo=None)


# features from Risky Business paper from Edwards et al. and Cloudy with a Chance of Breach
def extract_features_from_json_object(json_obj, ip_ranges=None):
    """Turn one Shodan banner (a parsed JSON object) into a flat feature dict.

    Parameters
    ----------
    json_obj : dict
        A single Shodan banner. `timestamp`, `port`, `ip` and `ip_str` are
        read unconditionally; everything else is optional.
    ip_ranges : container of str, optional
        When given and non-empty, banners whose `ip_str` is not a member are
        skipped.

    Returns
    -------
    dict
        `{}` when the banner is filtered out by `ip_ranges`. Otherwise a dict
        holding the identifying fields plus 0/1 flags for TLS and certificate
        problems, exposed services, recursive DNS and NTLM.

    Notes
    -----
    Compared with the previous revision this version
    * matches weak signature algorithms by prefix (the old 4-character slice
      could never equal `'md2'` or `'md5'`),
    * reads issuer/subject from `ssl['cert']` for the self-signed fallback
      (the old code looked them up in the output dict and always failed),
    * keeps a self-signed verdict produced by chain verification instead of
      overwriting it,
    * reads the verification error code from `e.errors` when the exception
      provides it, falling back to parsing `str(e)` as before.
    """
    if ip_ranges and json_obj['ip_str'] not in ip_ranges:
        return {}

    data = {}
    data['timestamp'] = json_obj['timestamp'][0:10]
    data['port'] = json_obj['port']
    data['ip'] = json_obj['ip']
    data['ip_str'] = json_obj['ip_str']

    # TLS errors
    data['has_tls'] = 0
    # silly heuristic to check if it REALLY has TLS services
    # (e.g. not a regular http server on port 443 without https)
    if 'ssl' in json_obj and 'versions' in json_obj['ssl']:
        ssl = json_obj['ssl']
        # ssl config errors
        data['has_tls'] = 1

        data['tls_old_version'] = 1 if ('SSLv2' in ssl['versions']) or ('SSLv3' in ssl['versions']) else 0

        try:
            data['tls_heartbleed_or_freak_vuln'] = 1 if not _TLS_VULNS.isdisjoint(json_obj['opts']['vulns']) else 0
        except Exception:
            # no `opts.vulns` (or not iterable): treat as not vulnerable
            data['tls_heartbleed_or_freak_vuln'] = 0

        # weak DH keys and primes
        dh_weak_bits = False
        dh_common_prime = False
        if 'dhparams' in ssl and ssl['dhparams']:
            dh_bits = ssl['dhparams']['bits']
            dh_weak_bits = dh_bits < 2048
            dh_prime = ssl['dhparams']['prime']
            dh_common_prime = dh_prime in _DH_COMMON_PRIMES

        data['dh_common_prime'] = 1 if dh_common_prime else 0
        data['dh_weak_bits'] = 1 if dh_weak_bits else 0

        # certificate errors
        # scan time, with or without fractional seconds
        try:
            timestamp = datetime.strptime(json_obj['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')
        except ValueError:
            timestamp = datetime.strptime(json_obj['timestamp'], '%Y-%m-%dT%H:%M:%S')

        cert_parse_failed = False
        if 'cert' in ssl:
            # if parsed certificate data is included in shodan scan data
            parsed_cert = ssl['cert']
            data['cert_expired'] = 1 if parsed_cert['expired'] else 0

            issued_time = _parse_cert_time(parsed_cert['issued'])

            pubkey = parsed_cert['pubkey']
            weak_key = (pubkey['type'] == 'rsa' and pubkey['bits'] <= 1024) or \
                       ('ec' in pubkey['type'] and pubkey['bits'] < 224)
            # keep the full algorithm name; weak algorithms are matched by prefix below
            sig_alg = parsed_cert['sig_alg']

            data['cert_unparseable'] = 0
        else:
            # old shodan data doesn't parse certificate data, have to do this myself
            try:
                issued_times = []
                expired_times = []
                for c in ssl['chain']:
                    curr_cert = crypto.load_certificate(crypto.FILETYPE_PEM, c)
                    cert_notbefore = str(curr_cert.get_notBefore(), 'utf-8')
                    cert_notafter = str(curr_cert.get_notAfter(), 'utf-8')
                    issued_times.append(_parse_cert_time(cert_notbefore))
                    expired_times.append(_parse_cert_time(cert_notafter))

                # latest start and earliest end over the whole chain
                issued_time = max(issued_times)
                expired_time = min(expired_times)

                cert = crypto.load_certificate(crypto.FILETYPE_PEM, ssl['chain'][0])
                data['cert_expired'] = 1 if timestamp > expired_time else 0

                pubkey_type = cert.get_pubkey().type()
                pubkey_length = cert.get_pubkey().bits()
                weak_key = (pubkey_type == crypto.TYPE_RSA and pubkey_length <= 1024) or \
                           (pubkey_type == crypto.TYPE_EC and pubkey_length < 224)
                sig_alg = str(cert.get_signature_algorithm(), 'utf-8')

                data['cert_unparseable'] = 0
            except Exception:
                # this means that the certificate could not be parsed
                # perhaps invalid, corrupted, no idea
                cert_parse_failed = True

        if not cert_parse_failed:
            data['cert_issued_in_future'] = 1 if issued_time > timestamp else 0

            chain = ssl['chain']
            cert = crypto.load_certificate(crypto.FILETYPE_PEM, chain[0])
            root_cert = crypto.load_certificate(crypto.FILETYPE_PEM, chain[-1])

            # non-standard root
            # look for the basicConstraints extension on the last chain element
            # and check whether it marks the certificate as a CA
            data['cert_nonstandard_root'] = 1
            for i in range(root_cert.get_extension_count()):
                extension = root_cert.get_extension(i)
                if extension.get_short_name() == b'basicConstraints':
                    # 0x0 means CA:FALSE, non-zero byte at end of byte string means CA:TRUE
                    ca_true = extension.get_data()[-1]
                    data['cert_nonstandard_root'] = 0 if ca_true else 1
                    break

            # broken chain of trust
            # make sure the cert is considered not expired for this test,
            # so validate one day after the latest notBefore in the chain
            not_before_dates = []
            for c in chain:
                curr_cert = crypto.load_certificate(crypto.FILETYPE_PEM, c)
                not_before_dates.append(_parse_cert_time(str(curr_cert.get_notBefore(), 'utf-8')))
            validation_date = max(not_before_dates) + timedelta(days=1)

            store = crypto.X509Store()
            store.add_cert(root_cert)
            store.set_time(validation_date)
            if len(chain) > 2:
                intermediate_certs = [crypto.load_certificate(crypto.FILETYPE_PEM, x) for x in chain[1:-1]]
                store_ctx = crypto.X509StoreContext(store, cert, intermediate_certs)
            else:
                store_ctx = crypto.X509StoreContext(store, cert)

            verified_self_signed = False
            try:
                store_ctx.verify_certificate()
                data['cert_broken_chain_of_trust'] = 0
            except crypto.X509StoreContextError as e:
                # Prefer the structured `errors` attribute when the exception has one;
                # otherwise parse the "[code, depth, message]" text as before.
                errors = getattr(e, 'errors', None)
                if errors is None:
                    errors = ast.literal_eval(str(e))
                error_code = errors[0]

                if error_code == 2:  # cannot find issuer cert
                    data['cert_nonstandard_root'] = 1
                    data['cert_broken_chain_of_trust'] = 1
                elif error_code == 10:  # one of the certs is expired
                    data['cert_expired'] = 1
                    data['cert_broken_chain_of_trust'] = 0
                elif error_code == 18:  # self-signed cert
                    verified_self_signed = True
                    data['cert_broken_chain_of_trust'] = 0
                elif error_code in (13, 14):
                    # format error in notBefore (13) / notAfter (14),
                    # probably more strict OpenSSL version: do nothing
                    data['cert_broken_chain_of_trust'] = 0
                else:
                    # 20 unable to get local issuer certificate, 79/24 invalid CA
                    # certificate, 19 self-signed cert in chain, 7 certificate
                    # signature failure, 31 authority and issuer serial number
                    # mismatch, and any other code
                    data['cert_broken_chain_of_trust'] = 1

            # certificates with weak keys
            data['cert_weak_key'] = 1 if weak_key else 0

            # certificates with weak signatures
            data['cert_weak_sig'] = 1 if sig_alg.startswith(tuple(_WEAK_SIGS)) else 0

            # self-signed certificates
            try:
                # maybe shodan checked this themselves already
                data['cert_self_signed'] = 1 if 'self-signed' in json_obj['tags'] else 0
            except Exception:
                try:
                    # if not, do simple subject == issuer check from parsed data in shodan scan
                    shodan_cert = ssl['cert']
                    data['cert_self_signed'] = 1 if shodan_cert['issuer']['CN'] == shodan_cert['subject']['CN'] else 0
                except Exception:
                    try:
                        # if not parsed by shodan, use our own parsed certificate
                        data['cert_self_signed'] = 1 if cert.get_subject() == cert.get_issuer() else 0
                    except Exception:
                        # if all else fails, assume it's not self-signed
                        data['cert_self_signed'] = 0
            if verified_self_signed:
                # chain verification already reported a self-signed leaf; keep that verdict
                data['cert_self_signed'] = 1

        else:
            # if the cert was invalid/corrupted/unparseable, fill these fields with 0
            # or maybe add a different field 'cert_corrupted' to cover this issue in the data?
            # NOTE(review): this also resets the TLS-version, vulnerability and DH flags
            # computed above - confirm that is intended.
            data['tls_old_version'] = 0
            data['tls_heartbleed_or_freak_vuln'] = 0
            data['dh_common_prime'] = 0
            data['dh_weak_bits'] = 0
            data['cert_self_signed'] = 0
            data['cert_expired'] = 0
            data['cert_issued_in_future'] = 0
            data['cert_nonstandard_root'] = 0
            data['cert_broken_chain_of_trust'] = 0
            data['cert_weak_key'] = 0
            data['cert_weak_sig'] = 0
            data['cert_unparseable'] = 1

    else:
        data['tls_old_version'] = 0
        data['tls_heartbleed_or_freak_vuln'] = 0
        data['dh_common_prime'] = 0
        data['dh_weak_bits'] = 0
        data['cert_self_signed'] = 0
        data['cert_expired'] = 0
        data['cert_issued_in_future'] = 0
        data['cert_nonstandard_root'] = 0
        data['cert_broken_chain_of_trust'] = 0
        data['cert_weak_key'] = 0
        data['cert_weak_sig'] = 0
        data['cert_unparseable'] = 0

    # 21 frequently used services
    data['ftp'] = 1 if 'ftp' in json_obj else 0
    data['telnet'] = 1 if 'telnet' in json_obj else 0
    data['sunrpc'] = 1 if json_obj['port'] == 111 else 0  # are we sure about this?
    data['netbios'] = 1 if 'netbios' in json_obj else 0
    data['snmp'] = 1 if 'snmp' in json_obj else 0
    data['smb'] = 1 if 'smb' in json_obj else 0
    data['mysql'] = 1 if 'mysql' in json_obj else 0
    data['mssql'] = 1 if 'mssql' in json_obj else 0
    data['dns'] = 1 if 'dns' in json_obj else 0
    data['http'] = 1 if 'http' in json_obj else 0
    data['ntp'] = 1 if 'ntp' in json_obj else 0
    data['ssh'] = 1 if 'ssh' in json_obj else 0

    try:
        module = json_obj['_shodan']['module']
        data['smtp'] = 1 if module == 'smtp' else 0
        data['pop3'] = 1 if module == 'pop3' else 0
        data['imap'] = 1 if module == 'imap' else 0
        data['rdp'] = 1 if module == 'rdp' else 0
        data['postgres'] = 1 if module == 'postgresql' else 0
        data['https'] = 1 if module == 'https' else 0
        data['smtps'] = 1 if module == 'smtps' else 0
        data['imaps'] = 1 if module == 'imap-ssl' else 0
        data['pop3s'] = 1 if module == 'pop3-ssl' else 0
    except Exception:
        # no module name recorded: fall back to well-known port numbers
        data['smtp'] = 1 if json_obj['port'] == 25 else 0  # other ports?
        data['pop3'] = 1 if json_obj['port'] == 110 else 0
        data['imap'] = 1 if json_obj['port'] == 143 else 0
        data['rdp'] = 1 if json_obj['port'] == 3389 else 0
        data['postgres'] = 1 if json_obj['port'] == 5432 else 0
        data['https'] = 1 if json_obj['port'] == 443 else 0
        data['smtps'] = 1 if json_obj['port'] == 465 or json_obj['port'] == 587 else 0
        data['imaps'] = 1 if json_obj['port'] == 993 else 0
        data['pop3s'] = 1 if json_obj['port'] == 995 else 0

    # cloudy with a chance of breach features
    if 'dns' in json_obj:
        dns = json_obj['dns']
        try:
            data['dns_recursive'] = 1 if dns['recursive'] else 0
        except Exception:
            data['dns_recursive'] = 0
    else:
        data['dns_recursive'] = 0

    # other stuff, necessary?
    data['ntlm'] = 1 if 'ntlm' in json_obj else 0

    return data

def process_shodan_file(ip_ranges, file_path):
    """Extract features from every banner of one line-delimited Shodan JSON file.

    Parameters
    ----------
    ip_ranges : container of str
        Passed through to `extract_features_from_json_object`, which drops
        banners whose `ip_str` is not a member.
    file_path : str
        Path of a file holding one JSON banner per line.

    Returns
    -------
    list of dict
        One feature dict per retained banner, each extended with a `vulns`
        list holding the vulnerability identifiers that contain `'CVE'`.
    """
    data_list = []
    with open(file_path, 'r') as file:
        # Stream the file line by line instead of loading it with readlines().
        for line in file:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline) instead of
                # failing inside json.loads
                continue
            parsed = json.loads(line)

            data = extract_features_from_json_object(json_obj=parsed, ip_ranges=ip_ranges)
            if len(data) == 0:
                continue

            # Iterating works for a dict keyed by identifier as well as a plain list.
            data['vulns'] = [k for k in (parsed.get('vulns') or ()) if 'CVE' in k]

            data_list.append(data)

    return data_list

def process_shodan_files(ripe_name, ip_ranges, paths_list):
    """Process all Shodan files of one organisation with 40 joblib workers.

    The per-file feature lists are flattened into one DataFrame, missing
    values are filled with 0, and one CSV per `timestamp` value is written as
    `<timestamp>-<ripe_name>.csv` in the processed directory. The name is
    finally appended to the `_done` file, also when nothing was written.
    """
    processed_path = '/data/data-sh/data/data.external-scans/shodan/ripe-nl/processed'
    done_file_path = f'{processed_path}/_done'

    jobs = (delayed(process_shodan_file)(ip_ranges, file_path) for file_path in paths_list)
    per_file_results = Parallel(n_jobs=40, verbose=1, backend='multiprocessing')(jobs)

    # flatten the list of per-file lists
    data_list = []
    for sublist in per_file_results:
        data_list.extend(sublist)

    data_df = pd.DataFrame(data_list).fillna(int(0))
    if len(data_df.index) == 0:
        print('\tempty dataframe, skipping')
    else:
        print('\tsaving files...')
        for name, group in data_df.groupby('timestamp'):
            group.to_csv(f'{processed_path}/{name}-{ripe_name}.csv', index=False)

    with open(done_file_path, 'a') as f:
        f.write(f'{ripe_name}\n')

def process_shodan_files_non_parallel(ripe_name, ip_ranges, paths_list):
    """Sequential counterpart of `process_shodan_files` with progress output.

    Each file in `paths_list` is handed to `process_shodan_file`. Progress is
    printed every 50 files for the first 500 and every 500 files afterwards.
    Results are written as `<timestamp>-<ripe_name>.csv` in the processed
    directory and the name is appended to the `_done` file.

    Elapsed time uses `timedelta.total_seconds()`: the `.seconds` attribute
    used previously wraps around every 24 hours.
    """
    processed_path = '/data/data-sh/data/data.external-scans/shodan/ripe-nl/processed'
    done_file_path = f'{processed_path}/_done'

    def elapsed_seconds():
        # whole seconds since `start`, including full days
        return int((datetime.now() - start).total_seconds())

    data_list = []
    size = len(paths_list)
    start = datetime.now()
    print(f'total files to process: {size}')
    for i, file_path in enumerate(paths_list):
        data_list.extend(process_shodan_file(ip_ranges, file_path))

        # report every 50 files early on, every 500 once past the first 500
        report_every = 50 if i < 500 else 500
        if i % report_every == 0:
            print(f'Done with {i+1} out of {size}\t in {elapsed_seconds()} seconds')

    print(f'Done\t in {elapsed_seconds()} seconds')

    data_df = pd.DataFrame(data_list).fillna(int(0))
    if len(data_df.index) > 0:
        print('\tsaving files...')
        for name, group in data_df.groupby('timestamp'):
            group.to_csv(f'{processed_path}/{name}-{ripe_name}.csv', index=False)
    else:
        print('\tempty dataframe, skipping')

    with open(done_file_path, 'a') as f:
        f.write(f'{ripe_name}\n')

def get_set_of_ips(ip_ranges, ip_type='str', parallel=False):
    """Return the set of all addresses contained in `ip_ranges`.

    `ip_type` selects the element type: `'str'` gives compressed address
    strings, `'int'` gives integers, anything else gives the address objects
    themselves. `parallel` only matters for `'int'`, where the per-range
    conversion is spread over 40 joblib workers.
    """
    if ip_type == 'str':
        return {host.compressed for ip_range in ip_ranges for host in ip_range}

    if ip_type != 'int':
        return {host for ip_range in ip_ranges for host in ip_range}

    if not parallel:
        return {int(host) for ip_range in ip_ranges for host in ip_range}

    per_range_hosts = Parallel(n_jobs=40, verbose=1, backend='multiprocessing')(
        delayed(get_hosts_in_range)(ip_range)
        for ip_range in ip_ranges
    )
    return {address for hosts in per_range_hosts for address in hosts}


def get_hosts_in_range(r):
    """Return every element of `r` converted with `int`, in iteration order."""
    return [int(host) for host in r]

In [None]:
###### SHODAN JSON FILES ON SERVER (1234567890-2143658709.json)
###### ARE NOT IN ORDER, SO ALL FILES NEED TO BE PROCESSED
###### :'(

shodan_files_path = '/data/data-ig/personal/carlos/cidr_NL/results_shodan'
shodan_dirs_to_process = sorted([ x for x in exploded['ripe_names'].unique() if x.startswith('nl.') ])

processed_path = '/data/data-sh/data/data.external-scans/shodan/ripe-nl/processed'
done_file_path = f'{processed_path}/_done'
# make sure the bookkeeping file exists before reading it
if not os.path.isfile(done_file_path):
    with open(done_file_path, 'w') as f:
        pass

with open(done_file_path, 'r') as f:
    done_files = [ l.strip() for l in f ]

# These two organisations are handled after all the others, in this order.
deferred_names = ['nl.kpn-bbt', 'nl.wapi']
processing_order = [ x for x in shodan_dirs_to_process if x not in deferred_names ] + deferred_names

for ripe_name in processing_order:
    print(f'processing {ripe_name}')
    if ripe_name in done_files:
        print('\talready done, skipping.')
        continue

    # every address of the organisation's ranges, as strings
    ip_ranges = exploded.loc[exploded['ripe_names'] == ripe_name, 'Ranges'].unique()
    ip_set = get_set_of_ips(ip_ranges)
    current_path = f'{shodan_files_path}/{ripe_name}'
    shodan_files = [f'{current_path}/{f}' for f in listdir(current_path) if isfile(join(current_path, f))]
    process_shodan_files_non_parallel(ripe_name, ip_set, shodan_files)

    

In [None]:
def process_nonripe_shodan_file(file_path, ip_ranges):
    """Extract features from one line-delimited Shodan JSON file.

    Same result as `process_shodan_file`, but with the arguments in the
    opposite order. The body used to be a copy of that function plus three
    unused path variables; it now delegates so the two cannot drift apart.

    Parameters
    ----------
    file_path : str
        Path of a file holding one JSON banner per line.
    ip_ranges : container of str
        Banners whose `ip_str` is not a member are dropped.

    Returns
    -------
    list of dict
        One feature dict (including `vulns`) per retained banner.
    """
    return process_shodan_file(ip_ranges, file_path)
    
    
nonripe_shodan_dir = '/data/data-sh/data/data.external-scans/shodan/non-ripe-nl/'

# All '*.csv.gz' files of the directory, ordered by the integer before the first '-'.
nonripe_shodan_files = sorted(
    (
        f for f in listdir(nonripe_shodan_dir)
        if isfile(join(nonripe_shodan_dir, f)) and f.endswith('.csv.gz')
    ),
    key=lambda x: int(x.split('-')[0]),
)

# Rows whose organisation name does not start with 'nl.'.
is_ripe_nl = exploded['ripe_names'].str.startswith('nl.')
nonripe = exploded[~is_ripe_nl]

In [None]:
processed_path = f'{nonripe_shodan_dir}/processed'

done_file_path = f'{processed_path}/_done'
# make sure the bookkeeping file exists before reading it
if not os.path.isfile(done_file_path):
    with open(done_file_path, 'w') as f:
        pass

with open(done_file_path, 'r') as f:
    done_files = [ l.strip() for l in f.readlines() ]


def _shodan_file_bounds(shodan_filename):
    """Return the two integers encoded in a '<first>-<last>.csv.gz' file name."""
    ips = shodan_filename.replace('.csv.gz', '').split('-')
    return int(ips[0]), int(ips[1])


# Distinct loop variable names are used throughout: the inner groupby below used to
# rebind the outer loop's `group` and `i`.
for name, range_rows in nonripe.groupby('ripe_names'):
    print(f'processing {name}')

    if name in done_files:
        continue

    print('\tpicking which shodan files to process')
    ip_ranges = range_rows['Ranges'].unique()
    # The previous revision enumerated every host of every range here (twice); the
    # resulting sets were never used, so that work has been dropped.

    # IMPORTANT THAT nonripe_shodan_files IS SORTED NUMERICALLY
    selected_files = set()
    for iprange in ip_ranges:
        first_ip = int(iprange[0])
        last_ip = int(iprange[-1])
        for position, shodan_file in enumerate(nonripe_shodan_files):
            file_first_ip, file_last_ip = _shodan_file_bounds(shodan_file)
            if first_ip > file_last_ip:
                # this file ends before the range starts
                continue

            # first file whose upper bound reaches the start of the range
            selected_files.add(shodan_file)
            # keep adding the following files while they start at or before the
            # last IP of the range
            for following_file in nonripe_shodan_files[position + 1:]:
                following_first_ip, following_last_ip = _shodan_file_bounds(following_file)
                if last_ip < following_first_ip:
                    print(f'breaking because {last_ip} less than {following_first_ip}')
                    break
                selected_files.add(following_file)
            # this range is done, continue with the next one
            break

    # de-duplicated, in the numeric order of nonripe_shodan_files so the output
    # row order is reproducible
    nonripe_shodan_files_to_process = [ x for x in nonripe_shodan_files if x in selected_files ]

    print(nonripe_shodan_files_to_process)
    print('\tprocessing files')

    # NOTE(review): the rows of the selected files are written out unfiltered; the
    # "keep only rows whose IP is in the organisation's ranges" step described in the
    # original comment was never implemented - confirm whether it is still wanted.
    frames = []
    for filename in nonripe_shodan_files_to_process:
        try:
            tmp = pd.read_csv(f'{nonripe_shodan_dir}/{filename}', compression='gzip')
        except Exception:
            print('error in %s' % filename)
            raise
        frames.append(tmp)

    if len(frames) > 0:
        data_df = pd.concat(frames)
        if len(data_df.index) > 0:
            print('\tsaving files...')
            for date, date_rows in data_df.groupby('timestamp'):
                date_rows.to_csv(f'{processed_path}/{date}-{name}.csv', index=False)
        else:
            print('\tempty dataframe, skipping')

    else:
        print('\tempty shodan files, skipping')

    with open(done_file_path, 'a') as f:
        f.write(f'{name}\n')

    

In [None]:
# Progress marker for the end of the preceding processing cell.
# NOTE(review): the message says "non-shodan" although the cell above handles the
# non-RIPE Shodan files - confirm the intended wording.
print('done processing non-shodan files')

In [None]:
def parse_shodan_df(row):
    """Replace `row['ip']` with an `IPv4Network` built from `row['ip_str']`."""
    host_network = ipaddress.IPv4Network(row['ip_str'])
    row['ip'] = host_network
    return row

# Names of the per-range features.
_FEATURES = [
    'port', 'ip', 'has_tls',
    'tls_old_version', 'tls_heartbleed_or_freak_vuln',
    'dh_common_prime', 'dh_weak_bits',
    'cert_expired', 'cert_issued_in_future', 'cert_weak_key', 'cert_weak_sig',
    'cert_nonstandard_root', 'cert_broken_chain_of_trust', 'cert_self_signed',
    'ftp', 'telnet', 'sunrpc', 'netbios', 'snmp', 'smb', 'mysql', 'mssql',
    'dns', 'http', 'ntp', 'ssh', 'smtp', 'pop3', 'imap', 'rdp', 'postgres',
    'https', 'smtps', 'imaps', 'pop3s',
    'vulns',
]
# All-zero feature dict used when an organisation has no processed files at all.
_EMPTY_FEATURES = {f: 0 for f in _FEATURES}


def extract_features_group(group, super_dict):
    """Compute the `shodan_features` dict for every row of one (date, organisation) group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one `date` and one `ripe_names` value; only the first
        row's values are consulted for the lookup.
    super_dict : dict
        Maps an organisation name to a `{date: processed CSV path}` dict.

    Returns
    -------
    pandas.Series
        Named `shodan_features`, indexed like `group`. Each element is
        * a fresh all-zero feature dict when the organisation is unknown,
        * an empty dict when `date` lies outside the available dates or the
          selected file is empty,
        * otherwise the aggregate produced by `extract_features`.

    The file for `date` itself is used when present, otherwise the file of the
    first later date.
    """
    def filled_with(template):
        # A separate dict per row: the rows used to share one module-level dict,
        # so changing one row's features changed all of them.
        return pd.Series(
            [dict(template) for _ in group.index],
            index=group.index,
            name='shodan_features',
        )

    date = group['date'].iloc[0]
    ripe_name = group['ripe_names'].iloc[0]

    if ripe_name not in super_dict:
        return filled_with(_EMPTY_FEATURES)

    date_files_dict = super_dict[ripe_name]
    sorted_dates = sorted(date_files_dict)
    if date < sorted_dates[0] or date > sorted_dates[-1]:
        return filled_with({})

    closest = date_files_dict.get(date)
    if closest is None:
        # no file for this exact date: take the first later one (one exists because
        # `date` is not after the last available date)
        closest = date_files_dict[next(d for d in sorted_dates if d > date)]

    if os.path.getsize(closest) == 0:
        return filled_with({})

    shodan_df = pd.read_csv(closest)
    shodan_df = shodan_df.apply(parse_shodan_df, axis=1)
    # `vulns` is stored in the CSV as the text form of a Python list
    shodan_df['vulns'] = shodan_df['vulns'].apply(ast.literal_eval)
    result = group.apply(extract_features, args=(shodan_df,), axis=1)
    return result['shodan_features']
    
def extract_features(row, shodan_df):
    """Aggregate the Shodan rows that fall inside `row['Ranges']`.

    Parameters
    ----------
    row : mapping
        Must provide `Ranges`, an object with a `supernet_of` method
        (an `ipaddress.IPv4Network` in this notebook).
    shodan_df : pandas.DataFrame
        Processed Shodan rows; `ip` holds network objects and `vulns` holds
        lists. `shodan_df` itself is not modified.

    Returns
    -------
    The same `row`, with `shodan_features` set to a dict holding, for the
    matching rows: the number of distinct `ip` values, the count of `port`
    values, the total number of `vulns` entries, and the sum of every other
    column except `timestamp` and `ip_str`.
    """
    # get single IP range from Ranges column
    r = row['Ranges']

    # Boolean Series rather than a plain list, so an empty frame is still filtered
    # by row instead of being read as an empty column selection.
    in_range = pd.Series(
        [r.supernet_of(ip) for ip in shodan_df['ip']],
        index=shodan_df.index,
        dtype=bool,
    )
    # `.assign` returns a new frame, which avoids writing into a filtered slice
    # (SettingWithCopyWarning) as the previous revision did.
    subnet_ips = shodan_df[in_range].assign(vulns=lambda frame: frame['vulns'].apply(len))

    operations = {
        col: 'sum'
        for col in shodan_df.columns
        if col not in ('timestamp', 'ip_str')
    }
    operations['ip'] = 'nunique'
    operations['port'] = 'count'
    operations['vulns'] = 'sum'

    row['shodan_features'] = subnet_ips.agg(operations).to_dict()
    return row

In [None]:
# %%time

shodan_output_dir = '/data/data-sh/data/data.external-scans/shodan'
ripe_shodan_output_dir = '/data/data-sh/data/data.external-scans/shodan/ripe-nl'
ripe_shodan_processed_path = f'{ripe_shodan_output_dir}/processed'
nonripe_shodan_output_dir = '/data/data-sh/data/data.external-scans/shodan/non-ripe-nl'
nonripe_shodan_processed_path = f'{nonripe_shodan_output_dir}/processed'

if not os.path.isfile(f'{shodan_output_dir}/exploded.csv'):

    # HOW TO PARALLEL_APPLY FOR BATCH OF ROWS, WITHOUT COMBINING INTO GROUPS
    # https://stackoverflow.com/questions/72845864/call-parallel-apply-for-batch-of-rows

    ripe_names_list = sorted(exploded['ripe_names'].unique().tolist())

    # collect all processed shodan files, group them by RIPE name + date
    super_dict = {}
    for name in ripe_names_list:

        if name.startswith('nl.'):
            ripe_or_nonripe_path = ripe_shodan_processed_path
        else:
            ripe_or_nonripe_path = nonripe_shodan_processed_path

        # Processed files are named '<date>-<name>.csv'. Anchor the pattern on that
        # suffix: the previous '*<name>*' also picked up files of organisations whose
        # name merely contains this one (e.g. 'nl.kpn' matching 'nl.kpn-bbt').
        files = glob.glob(f'{ripe_or_nonripe_path}/*-{glob.escape(name)}.csv')
        if len(files) == 0:
            continue

        # the first ten characters of the file name are the date
        date_files_dict = {}
        for file_path in files:
            date_text = os.path.basename(file_path)[:len('2010-10-10')]
            date_files_dict[datetime.strptime(date_text, '%Y-%m-%d')] = file_path
        super_dict[name] = date_files_dict

    pandarallel.initialize(progress_bar=True, nb_workers=5)
    # the two index levels added by the groupby are dropped so the result lines up
    # with `exploded` again
    exploded['shodan_features'] = (exploded.groupby(['date', 'ripe_names']).parallel_apply(lambda x: extract_features_group(x, super_dict)).droplevel(0).droplevel(0))

    exploded.to_csv(f'{shodan_output_dir}/exploded.csv', index=False)

else:
    exploded = pd.read_csv(f'{shodan_output_dir}/exploded.csv')
#     exploded['date'] = pd.to_datetime(exploded['date'], format='%Y-%m-%d')
#     exploded['first'] = pd.to_datetime(exploded['first'], format='%Y-%m-%d')
#     exploded['last'] = pd.to_datetime(exploded['last'], format='%Y-%m-%d')
    exploded['Ranges'] = exploded['Ranges'].apply(ipaddress.IPv4Network)
#     exploded['ripe_ranges'] = exploded['ripe_ranges'].apply(ipaddress.IPv4Network)
    # the feature dicts are stored in the CSV as their text form
    exploded['shodan_features'] = exploded['shodan_features'].apply(ast.literal_eval)
    print('loaded exploded.csv from disk')

# Grouping columns for the implode step further down. Defined after both branches:
# previously `cols` only existed when the features had just been computed, so the
# implode cell raised NameError whenever exploded.csv was loaded from disk.
cols = [ c for c in exploded.columns.values if c not in ('Ranges', 'ripe_names', 'shodan_features') ]

    

In [None]:
# Progress marker: reached once the cell above has finished.
print('done')

In [None]:
# Order the rows by date, customer and range, then display the rows whose
# `shodan_features` value has a length greater than 1.
exploded = exploded.sort_values(['date', 'customer', 'Ranges'])
exploded[exploded['shodan_features'].str.len() > 1]

In [None]:
%%time
exploded = exploded.sort_values(['date', 'customer', 'Ranges'])
exploded['Ranges'] = exploded['Ranges'].apply(str)

def aggregate_to_list(group):
    return group.tolist()

imploded = exploded.groupby(cols).agg({'Ranges': aggregate_to_list,
                                       'ripe_names': aggregate_to_list,
                                       'shodan_features': aggregate_to_list}).reset_index()

imploded.to_csv(f'{shodan_output_dir}/imploded.csv', index=False)

In [None]:
################
###### i think i'll probably need to use the exploded versions when processing further and ML stuff
################