In [2]:
import os

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
from numpy import array
import tqdm
import random
from tensorflow.keras.layers import Dense, LSTM,GRU
from tensorflow.keras.optimizers import SGD
from keras import Sequential



import datetime as dt
import pandas as pd

from imblearn.over_sampling import RandomOverSampler


In [3]:
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence) - 1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]

        seq_x = np.pad(seq_x, ((0, 0), (0, 30 - seq_x.shape[1])), 'constant')
        X.append(seq_x)
        y.append(seq_y[-1])
    return array(X), array(y)

In [4]:
from matplotlib import pyplot as plt
def draw_timeline(name,vulns,first_date, last_date):

    dates = vulns
    dates += [first_date]
    dates += [last_date]

    values = [1]*len(dates)
    values[-1] = 2
    values[-2] = 2

    X = pd.to_datetime(dates)
    fig, ax = plt.subplots(figsize=(6,1))
    ax.scatter(X, [1]*len(X), c=values,
               marker='s', s=100)
    fig.autofmt_xdate()

    # everything after this is turning off stuff that's plotted by default
    ax.set_title(name)
    ax.yaxis.set_visible(True)
    ax.spines['right'].set_visible(True)
    ax.spines['left'].set_visible(True)
    ax.spines['top'].set_visible(True)
    ax.xaxis.set_ticks_position('bottom')
    ax.set_facecolor('white')
    
    ax.get_yaxis().set_ticklabels([])
    # day = pd.to_timedelta("1", unit='D')
    # plt.xlim(X[0] - day, X[-1] + day)
    plt.show()
    #plt.subplots_adjust(bottom=0.15)
    #plt.savefig(f"D:/cve/images/timeline/{name}.jpg", transparent=False)
    
    
def find_benign_events(cur_repo_data,gap_days, num_of_events):
    benign_events = []
    for _ in range(num_of_events):
        found_event = False
        while not found_event:
            try:
                cur_event = random.randint(2*gap_days+1,cur_repo_data.shape[0]-gap_days*2-1)
            except ValueError:
                continue
            event = cur_repo_data.index[cur_event]

            before_vuln = event - gap_days
            after_vuln = event + gap_days
            res_event = cur_repo_data[before_vuln:event-1]
            if not res_event[res_event["VulnEvent"]>0].empty:
                continue
            benign_events.append(res_event.iloc[:,:-1].values)
            found_event = True
            
            
    return benign_events

In [11]:
repo_dirs = 'D:/repo_gharchive_processed4'
benign_all, vuln_all = [], []
n_features = 0
gap_days = 10

nice_list = ['facebook_hhvm.csv',
'ffmpeg_ffmpeg.csv',
'flatpak_flatpak.csv',
'freerdp_freerdp.csv',
'git_git.csv',
'gpac_gpac.csv',
'imagemagick_imagemagick.csv',
'kde_kdeconnect-kde.csv',
'krb5_krb5.csv',
'mantisbt_mantisbt.csv',
'op-tee_optee_os.csv',
'owncloud_core.csv',
'php_php-src.csv',
'revive-adserver_revive-adserver.csv',
'rubygems_rubygems.csv',
'the-tcpdump-group_tcpdump.csv']

for file in tqdm.tqdm(os.listdir(repo_dirs)[:]):
    try:
        try:
            
            cur_repo_data = pd.read_csv(repo_dirs + "/" + file,parse_dates=['created_at'],index_col='created_at')

            if cur_repo_data[cur_repo_data["VulnEvent"] > 0].values.shape[0]<20: 
                continue
                
            cur_repo_data = cur_repo_data[cur_repo_data.index.notnull()]
            cur_repo_data["additions"]=(cur_repo_data["additions"]-cur_repo_data["additions"].mean())/cur_repo_data["additions"].std()
            cur_repo_data["deletions"]=(cur_repo_data["deletions"]-cur_repo_data["deletions"].mean())/cur_repo_data["deletions"].std()
            
            y = pd.get_dummies(cur_repo_data.index.day_of_week, prefix='day_of_week')
            y.index = cur_repo_data.index
            cur_repo_data = pd.concat([cur_repo_data,y],axis=1)
            y = pd.get_dummies(cur_repo_data.index.hour, prefix='hour')
            y.index = cur_repo_data.index
            cur_repo_data = pd.concat([cur_repo_data,y],axis=1)
            cur_repo_data = cur_repo_data.resample('W-MON').sum()
            cur_repo_data = cur_repo_data.reset_index(drop=True)

        except pd.errors.EmptyDataError:
            #print(file)
            continue
        if cur_repo_data.shape[0]<100:
            continue
        print(file)
        cols_at_end = ['VulnEvent']
        cur_repo_data = cur_repo_data[[c for c in cur_repo_data if c not in cols_at_end]
                                + [c for c in cols_at_end if c in cur_repo_data]]
        # vulns = cur_repo_data[cur_repo_data["VulnEvent"] > 0]
        vulns = cur_repo_data.index[cur_repo_data['VulnEvent'] > 0].tolist()

        # vulns = vulns.sort_index()
        
        for vuln in vulns:
            # before,after = vuln - timedelta(days=gap_days),vuln + timedelta(days=gap_days)
            before, after = vuln-gap_days,vuln+gap_days
            res = cur_repo_data[before:vuln-1].iloc[:,:-1].values
            vuln_all.append(res)

        num_of_events = len(vulns)
        benigns=find_benign_events(cur_repo_data,gap_days,num_of_events)

        for res in benigns:
            benign_all.append(res)

        """for vuln in vulns.index:
            before_vuln = vuln - timedelta(days=gap_days)
            after_vuln = vuln + timedelta(days=gap_days)
            # draw_timeline(file.split(".csv")[0],list(vulns.index),cur_data.index[0],cur_data.index[-1])
            cur_data = cur_repo_data[before_vuln:after_vuln]


            #cur_train_x, cur_train_y = split_sequence(cur_data.values, n_steps)

            # if len(cur_train_x) == 0:
            #     continue

            all_train_x.append(cur_data.values)
            all_train_y.append([1])

        for benign in find_benign_events(cur_repo_data,gap_days,num_of_events):
            print(benign.shape)
            all_train_x.append(benign)
            all_train_y.append([0])"""
        #print(len(all_train_x),len(all_train_y))
    except KeyError as e:
        #print(file)
        continue
        # import traceback
        # traceback.print_exc()
        

# all_train_x = np.array(all_train_x)
# all_train_y = np.array(all_train_y)
# print(all_train_x.shape,all_train_y.shape)
# np.save('x.npy', all_train_x, allow_pickle=True)
# np.save('y.npy', all_train_y, allow_pickle=True)


  1%|▌                                                                               | 13/1784 [00:00<01:08, 25.74it/s]

abrt_abrt.csv


  2%|█▍                                                                              | 32/1784 [00:02<02:32, 11.50it/s]

agentejo_cockpit.csv


  4%|███▏                                                                            | 70/1784 [00:06<02:04, 13.75it/s]

anuko_timetracker.csv


  5%|████▎                                                                           | 95/1784 [00:11<05:36,  5.03it/s]

armmbed_mbedtls.csv


  8%|█████▉                                                                         | 134/1784 [00:15<02:57,  9.30it/s]

axiomatic-systems_bento4.csv


  8%|██████▏                                                                        | 139/1784 [00:17<05:48,  4.72it/s]

b2evolution_b2evolution.csv


  8%|██████▌                                                                        | 147/1784 [00:19<04:58,  5.48it/s]

baserproject_basercms.csv


  8%|██████▌                                                                        | 149/1784 [00:19<04:51,  5.62it/s]

bcgit_bc-java.csv


  9%|███████                                                                        | 160/1784 [00:21<07:17,  3.71it/s]

bigbluebutton_bigbluebutton.csv
bigtreecms_bigtree-cms.csv


 10%|███████▊                                                                       | 177/1784 [00:27<08:49,  3.04it/s]

bonzini_qemu.csv


 10%|████████                                                                       | 182/1784 [00:28<05:11,  5.14it/s]

bottelet_daybydaycrm.csv


 11%|████████▉                                                                      | 202/1784 [00:30<04:32,  5.81it/s]

cacti_cacti.csv


 12%|█████████▌                                                                     | 216/1784 [00:33<05:40,  4.60it/s]

cendioossman_tigervnc.csv


 12%|█████████▋                                                                     | 220/1784 [00:40<31:18,  1.20s/it]

ceph_ceph.csv


 12%|█████████▉                                                                     | 223/1784 [00:40<15:38,  1.66it/s]

cesnet_libyang.csv


 13%|██████████                                                                     | 226/1784 [00:41<10:55,  2.38it/s]

chamilo_chamilo-lms.csv


 14%|██████████▉                                                                    | 247/1784 [00:44<03:20,  7.67it/s]

clusterlabs_pcs.csv


 16%|████████████▉                                                                  | 293/1784 [00:59<07:32,  3.30it/s]

curl_curl.csv


 17%|█████████████▋                                                                 | 310/1784 [01:03<10:47,  2.28it/s]

dart-lang_sdk.csv


 18%|██████████████▎                                                                | 322/1784 [01:05<05:32,  4.39it/s]

dbry_wavpack.csv
debiki_talkyard.csv


 20%|███████████████▍                                                               | 348/1784 [01:11<08:30,  2.81it/s]

discourse_discourse.csv


 20%|███████████████▌                                                               | 352/1784 [01:14<14:11,  1.68it/s]

django_django.csv


 24%|███████████████████                                                            | 431/1784 [01:40<04:00,  5.63it/s]

enalean_tuleap.csv


 25%|███████████████████▍                                                           | 439/1784 [01:43<05:31,  4.06it/s]

erikd_libsndfile.csv


 25%|███████████████████▉                                                           | 449/1784 [01:47<09:05,  2.45it/s]

ether_etherpad-lite.csv


 26%|████████████████████▍                                                          | 461/1784 [01:48<02:28,  8.90it/s]

exponentcms_exponent-cms.csv


 26%|████████████████████▋                                                          | 467/1784 [01:50<04:31,  4.86it/s]

facebook_fbthrift.csv


 26%|████████████████████▊                                                          | 470/1784 [01:51<06:40,  3.28it/s]

facebook_hermes.csv


 26%|████████████████████▊                                                          | 471/1784 [01:53<12:09,  1.80it/s]

facebook_hhvm.csv


 28%|█████████████████████▉                                                         | 495/1784 [02:01<02:17,  9.36it/s]

fedora-selinux_setroubleshoot.csv


 28%|██████████████████████                                                         | 499/1784 [02:02<06:19,  3.39it/s]

ffmpeg_ffmpeg.csv


 28%|██████████████████████▏                                                        | 502/1784 [02:03<05:26,  3.93it/s]

file_file.csv


 28%|██████████████████████▍                                                        | 507/1784 [02:04<04:25,  4.81it/s]

firefly-iii_firefly-iii.csv


 29%|██████████████████████▋                                                        | 511/1784 [02:05<04:35,  4.61it/s]

flatpak_flatpak.csv


 29%|███████████████████████▎                                                       | 526/1784 [02:08<05:16,  3.97it/s]

forkcms_forkcms.csv


 30%|███████████████████████▊                                                       | 537/1784 [02:09<02:52,  7.25it/s]

freerdp_freerdp.csv


 31%|████████████████████████▏                                                      | 547/1784 [02:11<03:57,  5.21it/s]

fusionpbx_fusionpbx.csv


 31%|████████████████████████▎                                                      | 549/1784 [02:11<03:42,  5.56it/s]

galette_galette.csv


 33%|█████████████████████████▋                                                     | 580/1784 [02:20<12:55,  1.55it/s]

git_git.csv


 33%|█████████████████████████▊                                                     | 582/1784 [02:21<11:45,  1.70it/s]

glpi-project_glpi.csv


 35%|███████████████████████████▊                                                   | 627/1784 [02:40<03:22,  5.71it/s]

gpac_gpac.csv


 35%|███████████████████████████▉                                                   | 631/1784 [02:48<28:03,  1.46s/it]

grafana_grafana.csv


 38%|██████████████████████████████                                                 | 678/1784 [02:59<04:49,  3.82it/s]

horde_horde.csv


 38%|██████████████████████████████▍                                                | 686/1784 [03:01<03:40,  4.98it/s]

http4s_http4s.csv


 39%|███████████████████████████████                                                | 702/1784 [03:04<02:31,  7.15it/s]

ifmeorg_ifme.csv


 40%|███████████████████████████████▎                                               | 707/1784 [03:04<01:58,  9.11it/s]

imagemagick_imagemagick.csv


 40%|███████████████████████████████▍                                               | 709/1784 [03:05<03:12,  5.58it/s]

imagemagick_imagemagick6.csv


 40%|███████████████████████████████▉                                               | 721/1784 [03:09<04:38,  3.82it/s]

inspircd_inspircd.csv


 41%|████████████████████████████████▏                                              | 726/1784 [03:10<03:08,  5.60it/s]

inverse-inc_sogo.csv


 42%|█████████████████████████████████▍                                             | 754/1784 [03:15<02:04,  8.28it/s]

janeczku_calibre-web.csv


 44%|██████████████████████████████████▍                                            | 778/1784 [03:19<05:41,  2.95it/s]

jenkinsci_jenkins.csv


 46%|████████████████████████████████████                                           | 813/1784 [03:27<03:55,  4.12it/s]

jquery_jquery-ui.csv


 46%|████████████████████████████████████▎                                          | 821/1784 [03:29<02:13,  7.21it/s]

jsummers_imageworsener.csv


 47%|████████████████████████████████████▉                                          | 834/1784 [03:32<06:36,  2.40it/s]

jupyter_notebook.csv


 47%|█████████████████████████████████████▎                                         | 844/1784 [03:35<05:13,  2.99it/s]

kanboard_kanboard.csv


 48%|█████████████████████████████████████▊                                         | 853/1784 [03:37<02:54,  5.35it/s]

kde_kdeconnect-kde.csv


 48%|██████████████████████████████████████                                         | 859/1784 [03:40<04:37,  3.34it/s]

kevinpapst_kimai2.csv


 49%|██████████████████████████████████████▌                                        | 870/1784 [03:42<02:10,  7.02it/s]

kkos_oniguruma.csv


 49%|███████████████████████████████████████                                        | 882/1784 [03:43<01:10, 12.82it/s]

koral--_android-gif-drawable.csv


 50%|███████████████████████████████████████▎                                       | 888/1784 [03:45<02:40,  5.57it/s]

krb5_krb5.csv


 50%|███████████████████████████████████████▋                                       | 895/1784 [03:46<01:30,  9.86it/s]

labd_wagtail-2fa.csv


 51%|████████████████████████████████████████▍                                      | 912/1784 [03:51<01:57,  7.43it/s]

libarchive_libarchive.csv


 52%|████████████████████████████████████████▋                                      | 919/1784 [03:52<02:07,  6.76it/s]

libgd_libgd.csv


 52%|████████████████████████████████████████▋                                      | 920/1784 [03:53<04:11,  3.44it/s]

libgit2_libgit2.csv


 52%|█████████████████████████████████████████▏                                     | 930/1784 [03:53<01:30,  9.44it/s]

libraw_libraw.csv


 52%|█████████████████████████████████████████▎                                     | 932/1784 [03:55<04:05,  3.47it/s]

libredwg_libredwg.csv


 53%|█████████████████████████████████████████▋                                     | 940/1784 [03:57<03:05,  4.55it/s]

libtom_libtomcrypt.csv


 54%|██████████████████████████████████████████▍                                    | 959/1784 [04:02<01:48,  7.60it/s]

livehelperchat_livehelperchat.csv


 56%|███████████████████████████████████████████▉                                   | 991/1784 [04:09<01:47,  7.38it/s]

mantisbt_mantisbt.csv


 56%|███████████████████████████████████████████▉                                  | 1005/1784 [04:12<05:05,  2.55it/s]

matrix-org_sydent.csv


 57%|████████████████████████████████████████████                                  | 1009/1784 [04:14<04:45,  2.71it/s]

matrix-org_synapse.csv
matroska-org_libebml.csv


 57%|████████████████████████████████████████████▍                                 | 1016/1784 [04:17<03:31,  3.63it/s]

mdadams_jasper.csv


 58%|█████████████████████████████████████████████▏                                | 1033/1784 [04:19<01:27,  8.54it/s]

microsoft_chakracore.csv


 58%|█████████████████████████████████████████████▍                                | 1039/1784 [04:20<01:36,  7.69it/s]

microweber_microweber.csv


 59%|█████████████████████████████████████████████▋                                | 1046/1784 [04:22<03:14,  3.79it/s]

miniupnp_miniupnp.csv


 60%|███████████████████████████████████████████████▏                              | 1079/1784 [04:33<10:54,  1.08it/s]

mono_mono.csv


 61%|███████████████████████████████████████████████▉                              | 1096/1784 [04:41<07:59,  1.44it/s]

mruby_mruby.csv


 62%|████████████████████████████████████████████████▌                             | 1111/1784 [04:43<01:20,  8.37it/s]

nanopb_nanopb.csv


 64%|██████████████████████████████████████████████████▏                           | 1147/1784 [04:56<02:29,  4.25it/s]

nilsteampassnet_teampass.csv


 65%|██████████████████████████████████████████████████▊                           | 1163/1784 [05:12<04:40,  2.21it/s]

nothings_stb.csv


 67%|███████████████████████████████████████████████████▉                          | 1188/1784 [05:21<04:18,  2.30it/s]

oisf_suricata.csv


 67%|████████████████████████████████████████████████████▎                         | 1197/1784 [05:22<01:54,  5.14it/s]

op-tee_optee_os.csv


 68%|████████████████████████████████████████████████████▋                         | 1206/1784 [05:26<05:30,  1.75it/s]

openbsd_src.csv


 68%|█████████████████████████████████████████████████████                         | 1214/1784 [05:28<02:30,  3.79it/s]

opencontainers_runc.csv
openemr_openemr.csv


 69%|█████████████████████████████████████████████████████▋                        | 1227/1784 [05:32<02:28,  3.76it/s]

opennms_opennms.csv


 69%|█████████████████████████████████████████████████████▋                        | 1228/1784 [05:32<02:30,  3.68it/s]

openolat_openolat.csv


 69%|█████████████████████████████████████████████████████▊                        | 1232/1784 [05:33<02:26,  3.76it/s]

opensc_opensc.csv


 69%|██████████████████████████████████████████████████████▏                       | 1239/1784 [05:37<03:41,  2.47it/s]

openssh_openssh-portable.csv


 70%|██████████████████████████████████████████████████████▎                       | 1242/1784 [05:40<05:41,  1.59it/s]

openssl_openssl.csv


 70%|██████████████████████████████████████████████████████▍                       | 1245/1784 [05:41<04:25,  2.03it/s]

openstack_keystone.csv


 70%|██████████████████████████████████████████████████████▌                       | 1247/1784 [05:42<04:17,  2.09it/s]

openstack_nova.csv


 70%|██████████████████████████████████████████████████████▊                       | 1253/1784 [05:43<02:42,  3.28it/s]

opensuse_open-build-service.csv


 72%|████████████████████████████████████████████████████████▎                     | 1287/1784 [06:00<07:06,  1.16it/s]

owncloud_core.csv


 74%|██████████████████████████████████████████████████████████                    | 1329/1784 [06:15<07:02,  1.08it/s]

php_php-src.csv


 75%|██████████████████████████████████████████████████████████▋                   | 1342/1784 [06:17<01:17,  5.70it/s]

piwigo_piwigo.csv


 75%|██████████████████████████████████████████████████████████▊                   | 1346/1784 [06:18<01:15,  5.81it/s]

pjsip_pjproject.csv


 76%|███████████████████████████████████████████████████████████▌                  | 1361/1784 [06:21<02:17,  3.07it/s]

postgres_postgres.csv


 77%|████████████████████████████████████████████████████████████▍                 | 1381/1784 [06:25<01:35,  4.24it/s]

projectacrn_acrn-hypervisor.csv


 79%|█████████████████████████████████████████████████████████████▎                | 1401/1784 [06:31<02:02,  3.14it/s]

puppetlabs_puppet.csv


 79%|█████████████████████████████████████████████████████████████▌                | 1408/1784 [06:32<01:23,  4.51it/s]

pylons_waitress.csv


 79%|█████████████████████████████████████████████████████████████▉                | 1416/1784 [06:35<01:30,  4.07it/s]

python-pillow_pillow.csv


 79%|█████████████████████████████████████████████████████████████▉                | 1417/1784 [06:40<09:04,  1.48s/it]

python_cpython.csv


 80%|██████████████████████████████████████████████████████████████▎               | 1425/1784 [06:42<02:58,  2.01it/s]

qemu_qemu.csv


 80%|██████████████████████████████████████████████████████████████▎               | 1426/1784 [06:42<02:32,  2.35it/s]

qpdf_qpdf.csv


 80%|██████████████████████████████████████████████████████████████▌               | 1430/1784 [06:44<02:36,  2.27it/s]

qutebrowser_qutebrowser.csv


 81%|██████████████████████████████████████████████████████████████▉               | 1439/1784 [06:45<00:56,  6.16it/s]

radareorg_radare2.csv


 81%|███████████████████████████████████████████████████████████████               | 1442/1784 [06:48<02:31,  2.26it/s]

radare_radare2.csv


 82%|███████████████████████████████████████████████████████████████▊              | 1460/1784 [06:54<00:54,  5.99it/s]

rdesktop_rdesktop.csv


 82%|████████████████████████████████████████████████████████████████▎             | 1471/1784 [06:56<00:41,  7.63it/s]

redmine_redmine.csv


 83%|████████████████████████████████████████████████████████████████▋             | 1480/1784 [06:57<00:39,  7.63it/s]

requarks_wiki.csv


 83%|████████████████████████████████████████████████████████████████▉             | 1486/1784 [06:57<00:43,  6.87it/s]

revive-adserver_revive-adserver.csv


 84%|█████████████████████████████████████████████████████████████████▊            | 1506/1784 [07:02<01:58,  2.35it/s]

roundcube_roundcubemail.csv


 85%|██████████████████████████████████████████████████████████████████▎           | 1518/1784 [07:05<01:40,  2.64it/s]

rubygems_rubygems.csv


 87%|███████████████████████████████████████████████████████████████████▋          | 1549/1784 [07:21<00:45,  5.16it/s]

sandstorm-io_sandstorm.csv


 88%|████████████████████████████████████████████████████████████████████▋         | 1570/1784 [07:28<00:50,  4.25it/s]

sebhildebrandt_systeminformation.csv


 90%|██████████████████████████████████████████████████████████████████████        | 1602/1784 [07:38<00:49,  3.68it/s]

silverstripe_sapphire.csv


 91%|███████████████████████████████████████████████████████████████████████▎      | 1630/1784 [07:46<00:55,  2.76it/s]

snipe_snipe-it.csv


 93%|████████████████████████████████████████████████████████████████████████▌     | 1661/1784 [07:59<00:42,  2.87it/s]

sqlite_sqlite.csv


 94%|█████████████████████████████████████████████████████████████████████████     | 1672/1784 [08:03<00:26,  4.18it/s]

star7th_showdoc.csv


 96%|██████████████████████████████████████████████████████████████████████████▋   | 1707/1784 [08:13<00:48,  1.58it/s]

systemd_systemd.csv


 96%|███████████████████████████████████████████████████████████████████████████▏  | 1721/1784 [08:19<00:32,  1.94it/s]

testlinkopensourcetrms_testlink-code.csv


 97%|███████████████████████████████████████████████████████████████████████████▍  | 1726/1784 [08:20<00:20,  2.85it/s]

the-tcpdump-group_tcpdump.csv


 97%|███████████████████████████████████████████████████████████████████████████▊  | 1734/1784 [08:22<00:10,  4.83it/s]

thorsten_phpmyfaq.csv


 98%|████████████████████████████████████████████████████████████████████████████▎ | 1744/1784 [08:25<00:09,  4.16it/s]

tine20_tine-2.0-open-source-groupware-and-crm.csv


100%|██████████████████████████████████████████████████████████████████████████████| 1784/1784 [08:34<00:00,  3.47it/s]


In [15]:
benign_all

AttributeError: 'list' object has no attribute 'shape'

In [17]:
print("HI")
np.save('vuln_all_week.npy', vuln_all2)    # .npy extension is added if not given
np.save('benign_all_week.npy', benign_all2)    # .npy extension is added if not given

HI


In [6]:

vuln_all = np.load("vuln_all_week.npy")
benign_all = np.load("benign_all_week.npy")
def normalize(time_series_feature):
    if time_series_feature.max()-time_series_feature.min() == 0:
        return time_series_feature
    return (time_series_feature-time_series_feature.min())/(time_series_feature.max()-time_series_feature.min())

In [7]:
from collections import Counter

max_vals = max(Counter([v.shape for v in vuln_all]))
vuln_all = [v for v in vuln_all if v.shape == max_vals]
max_vals = max(Counter([v.shape for v in benign_all]))
benign_all = [v for v in benign_all if v.shape == max_vals]

vuln_all2 =np.nan_to_num(np.array(vuln_all))
benign_all2 = np.nan_to_num(np.array(benign_all))

vuln_all2.shape,benign_all2.shape

((1021, 9, 54), (1027, 9, 54))

In [8]:
all_train_x = np.concatenate([vuln_all2,benign_all2])
all_train_y = np.concatenate([np.ones(vuln_all2.shape[0]),np.zeros(benign_all2.shape[0])])
all_train_x.shape,all_train_y.shape

((2048, 9, 54), (2048,))

In [9]:
RESAMPLE = True
NORMALIZE = True

if NORMALIZE:
    all_train_x= normalize(all_train_x)
    vuln_all2 = normalize(vuln_all2)
    benign_all2 = normalize(benign_all2)

In [13]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
print(all_train_x.shape, all_train_x[0].shape)
print(all_train_y.shape, all_train_y[0].shape)
from numpy import mean
from numpy import std
from numpy import dstack
from pandas import read_csv
from matplotlib import pyplot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
 

X_train, X_test, y_train, y_test = train_test_split(all_train_x,all_train_y,shuffle=True)

"""model = Sequential()
model.add(Dense(50, input_shape=[all_train_x.shape[1]], activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=SGD(learning_rate=0.1), metrics=['accuracy'])

"""

"""model = Sequential()
model.add(Dense(150,input_shape=[benign_all2.shape[1]], activation='linear')) 
model.add(Dense(75, activation="linear"))
model.add(Dense(10, activation="linear"))
model.add(Dense(75, activation="linear"))
model.add(Dense(benign_all2.shape[1], activation="linear"))
model.compile(optimizer='adam',loss='mse')
"""


RESAMPLE = False
if RESAMPLE:
    #all_train_x = np.load('x.npy', allow_pickle=True)
    #all_train_y = np.load('y.npy', allow_pickle=True)
    #collections.Counter(all_train_y)
    

    ros = RandomOverSampler(random_state=42)

    ros.fit_resample(X_train[:,:,0], y_train)
    X_train = X_train[ros.sample_indices_]
    y_train = y_train[ros.sample_indices_]

"""
model = Sequential()

model.add(LSTM(100, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1, activation='sigmoid'))

from keras.layers import RepeatVector
from keras.layers import TimeDistributed

model = Sequential()
model.add(LSTM(100, activation='relu', return_sequences=True,input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(100, activation='relu', return_sequences=True))
model.add(LSTM(100, activation='relu', return_sequences=True))
model.add(LSTM(100, activation='relu', return_sequences=True))
model.add(LSTM(100, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(1)))
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100,validation_data=(X_test,y_test),verbose=1)
"""

"""model = Sequential()

model.add(LSTM(100, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5,validation_data=(X_test,y_test),verbose=2)


"""

for i in [0.1,0.01,0.001,0.0001,0.00001]:

    model = Sequential()
    #model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    #model.add(MaxPooling1D(pool_size=2))
    #model.add(Flatten())
    #model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(25, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=i), metrics=['accuracy'])
    model.fit(X_train.reshape(X_train.shape[0],-1), y_train, verbose=0,epochs=50, batch_size=32,validation_data=(X_test.reshape(X_test.shape[0],-1),y_test))

    print(i,model.evaluate(X_test.reshape(X_test.shape[0],-1), y_test, verbose=0))


# model.fit(benign_all2, benign_all2, epochs=100)
          
"""
threshold = find_threshold(model, benign_all2)
print(f"Threshold: {threshold}")
# Threshold: 0.01001314025746261
predictions = get_predictions(model, vuln_all2, threshold)
accuracy_score(predictions, [0]*len(vuln_all2))"""

(2048, 9, 54) (9, 54)
(2048,) ()
0.1 [0.6936587691307068, 0.51171875]
0.01 [0.6930619478225708, 0.51171875]
0.001 [0.6779234409332275, 0.58203125]
0.0001 [0.6905045509338379, 0.52734375]
1e-05 [0.6930927038192749, 0.494140625]


'\nthreshold = find_threshold(model, benign_all2)\nprint(f"Threshold: {threshold}")\n# Threshold: 0.01001314025746261\npredictions = get_predictions(model, vuln_all2, threshold)\naccuracy_score(predictions, [0]*len(vuln_all2))'

In [14]:
!conda install -c conda-forge jupyterlab-git

^C


In [None]:
a = model.predict(normalize(vuln_all2))
from pandas import DataFrame
DataFrame(a).plot()

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

def find_best_f1(X_test,y_test,model):
    max_f1 = 0
    thresh = 0
    best_y = 0
    for i in range(100):
        y_predict = (model.predict(X_test).reshape(-1)>i/100).astype(int)
        precision, recall, fscore, support = score(y_test,y_predict ,zero_division=0)
        cur_f1 = fscore[1]
        print(i,cur_f1)
        if cur_f1 > max_f1:
            max_f1 = cur_f1
            best_y = y_predict
            thresh = i / 100
    return max_f1,thresh, best_y


f1,thresh,best_y = find_best_f1(X_test,y_test,model)
print(f1)



In [None]:
import tensorflow as tf
nb_epoch = 10
batch_size = 64
input_dim = benign_all2.shape[1] #num of columns, 30
encoding_dim = 14
hidden_dim_1 = int(encoding_dim / 2) #
hidden_dim_2=4  
learning_rate = 1e-7

#input Layer
input_layer = tf.keras.layers.Input(shape=(input_dim, ))
#Encoder
encoder = tf.keras.layers.Dense(encoding_dim, activation="tanh",                                activity_regularizer=tf.keras.regularizers.l2(learning_rate))(input_layer)
encoder=tf.keras.layers.Dropout(0.2)(encoder)
encoder = tf.keras.layers.Dense(hidden_dim_1, activation='relu')(encoder)
encoder = tf.keras.layers.Dense(hidden_dim_2, activation=tf.nn.leaky_relu)(encoder)
# Decoder
decoder = tf.keras.layers.Dense(hidden_dim_1, activation='relu')(encoder)
decoder=tf.keras.layers.Dropout(0.2)(decoder)
decoder = tf.keras.layers.Dense(encoding_dim, activation='relu')(decoder)
decoder = tf.keras.layers.Dense(input_dim, activation='tanh')(decoder)
#Autoencoder
autoencoder = tf.keras.Model(inputs=input_layer, outputs=decoder)
#autoencoder.summary()

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='adam')

history = autoencoder.fit(benign_all2, benign_all2,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(vuln_all2, vuln_all2),
                    verbose=1,
                    ).history

In [10]:
vuln_all2.shape

(39213, 1840)

In [11]:
mall = np.concatenate([benign_all2,vuln_all2]) # .shape ,benign_all2.shape,vuln_all2.shape

In [12]:

threshold_fixed = 0.1
check = vuln_all2
#mall = check
test_x_predictions1 = autoencoder.predict(vuln_all2)
# test_x_predictions2 = autoencoder.predict(mall[int(mall.shape[0]/2):])


In [13]:
mse = np.mean(np.power(mall - test_x_predictions, 2), axis=1)
error_df = pd.DataFrame({'Reconstruction_error': mse,
                        'True_class': len(benign_all2)*[0]+len(vuln_all2)*[1]})

NameError: name 'test_x_predictions' is not defined

In [None]:
groups = error_df.groupby('True_class')
fig, ax = plt.subplots()
for name, group in groups:
    ax.plot(group.index, group.Reconstruction_error, marker='o', ms=3.5, linestyle='',
            label= "Fraud" if name == 1 else "Normal")
ax.hlines(0.4, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
ax.legend()
# plt.ylim([0,1])
plt.title("Reconstruction error for normal and fraud data")
plt.ylabel("Reconstruction error")
plt.xlabel("Data point index")
plt.show()

In [539]:
accuracy_score(predictions, [1]*len(vuln_all2))

0.9918881118881119

In [486]:
test_x_predictions = autoencoder.predict(test_data)
mse = np.mean(np.power(test_data - test_x_predictions, 2), axis=1)
error_df = pd.DataFrame({'Reconstruction_error': mse,
                        'True_class': test_labels})
Plotting the test data points and their respective reconstruction error sets a threshold value to visualize if the threshold value needs to be adjusted.
threshold_fixed = 50
groups = error_df.groupby('True_class')
fig, ax = plt.subplots()
for name, group in groups:
    ax.plot(group.index, group.Reconstruction_error, marker='o', ms=3.5, linestyle='',
            label= "Fraud" if name == 1 else "Normal")
ax.hlines(threshold_fixed, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
ax.legend()
plt.title("Reconstruction error for normal and fraud data")
plt.ylabel("Reconstruction error")
plt.xlabel("Data point index")
plt.show();


from sklearn.metrics import accuracy_score
import tensorflow as tf
def find_threshold(model, x_train_scaled):
    reconstructions = model.predict(x_train_scaled)
    # provides losses of individual instances
    reconstruction_errors = tf.keras.losses.msle(reconstructions, x_train_scaled)
    # threshold for anomaly scores
    threshold = np.mean(reconstruction_errors.numpy()) \
      + np.std(reconstruction_errors.numpy())
    return threshold

def get_predictions(model, x_test_scaled, threshold):
    predictions = model.predict(x_test_scaled)
    # provides losses of individual instances
    errors = tf.keras.losses.msle(predictions, x_test_scaled)
    # 0 = anomaly, 1 = normal
    anomaly_mask = pd.Series(errors) > threshold
    preds = anomaly_mask.map(lambda x: 0.0 if x == True else 1.0)
    return preds



Threshold: 1.2196545370888118e-05


0.017708333333333333

In [505]:
predictions = get_predictions(model, vuln_all2, threshold-0.000011)
accuracy_score(predictions, [0]*len(vuln_all2))

0.04895833333333333

0 0.6587436332767402
1 0.6587436332767402
2 0.6587436332767402
3 0.6587436332767402
4 0.6587436332767402
5 0.6587436332767402
6 0.6587436332767402
7 0.6587436332767402
8 0.6587436332767402
9 0.6587436332767402
10 0.6587436332767402
11 0.6587436332767402
12 0.6587436332767402
13 0.6587436332767402
14 0.6587436332767402
15 0.6587436332767402
16 0.6587436332767402
17 0.6587436332767402
18 0.6587436332767402


KeyboardInterrupt: 

  return (time_series_feature-time_series_feature.min())/(time_series_feature.max()-time_series_feature.min())


In [119]:
for i in range(30):
    print(i,np.isnan(all_train_x[:,:,i]).any())
    

0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 False
18 False
19 False
20 False


IndexError: index 21 is out of bounds for axis 2 with size 21

In [118]:
all_train_x = all_train_x[:,:,:21]