# Include needed files. 
## all of them should be available

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf

import csv
import os
import sys
import glob
import operator
import time

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical, normalize

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

from tensorflow import keras
from tensorflow import feature_column
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import TensorBoard

# Include Dataset


In [0]:
%%bash
URL=https://iscxdownloads.cs.unb.ca/iscxdownloads/ISCX-URL-2016/
FILES=(ISCXURL2016.zip) 
for FILE in ${FILES[*]}; do
    if [ ! -f "$FILE" ]; then
        printf "downloading %s\n" $FILE
        curl -O $URL$FILE
        # unzip files
        echo 'unzipping ' $FILE
        unzip -o $FILE #overwrite exiting files/folders if exists
    fi
done

### Check Dataset
> need to ensure that the dataset was properly downloaded.


In [3]:
! ls FinalDataset

All_BestFirst.csv	      Malware_Infogain.csv
All_BestFirst_test.csv	      Malware_Infogain_test.csv
All.csv			      Phishing_BestFirst.csv
All.csv.pickle		      Phishing.csv
All_Infogain.csv	      Phishing_Infogain.csv
All_Infogain_test.csv	      Phishing_Infogain_test.csv
Defacement_BestFirst.csv      Spam_BestFirst.csv
Defacement.csv		      Spam_BestFirst_test.csv
Defacement_Infogain.csv       Spam.csv
Defacement_Infogain_test.csv  Spam_Infogain.csv
Malware_BestFirst.csv	      Spam_Infogain_test.csv
Malware.csv		      URL


# Set some data
> Some data needs to be set, we need to ensure that constants are set properly. These are important but will not be used until much later.

In [0]:
resultPath = 'results_keras_tensorflow'
if not os.path.exists(resultPath):
   print('result path {} created.'.format(resultPath))
   os.mkdir(resultPath)

In [0]:
dep_var = 'Label'
model_name="init"

In [0]:
cat_names = []
cont_names = []

## Analyze FinalDataset/All.csv file
> lets make sure that the files are properly added, this should look similar to the FASTAI experiments.

In [0]:
df = pd.read_csv('FinalDataset/All.csv', low_memory=False)

In [8]:
df.shape

(36707, 80)

In [9]:
df['argPathRatio'].astype('float')

0        0.076923
1        0.058824
2        0.060606
3        0.025974
4        0.040816
5        0.033898
6        0.046512
7        0.040000
8        0.045455
9        0.090909
10       0.043478
11       0.039216
12       0.095238
13       0.105263
14       0.080000
15       0.086957
16       0.038462
17       0.083333
18       0.017241
19       0.016949
20       0.020408
21       0.012579
22       0.014815
23       0.014085
24       0.012500
25       0.018182
26       0.050000
27       0.037037
28       0.039216
29       0.036364
           ...   
36677    0.792593
36678    0.763636
36679    0.794118
36680    0.015625
36681    0.057143
36682    0.085106
36683    0.662651
36684    0.100000
36685    0.407407
36686    0.036364
36687    0.578125
36688    0.129412
36689    0.712644
36690    0.785047
36691    0.086957
36692    0.040000
36693    0.068966
36694    0.742574
36695    0.015267
36696    0.071429
36697    0.985386
36698    0.817308
36699    0.275000
36700    0.052632
36701    0

In [10]:
df.columns

Index(['Querylength', 'domain_token_count', 'path_token_count',
       'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
       'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
       'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
       'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
       'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
       'ISIpAddressInDomainName', 'CharacterContinuityRate',
       'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
       'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
       'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
       'Directory_LetterCount', 'Filename_LetterCount',
       'Extension_LetterCount', 'Query_LetterCount', 'LongestPathToken

In [11]:
df.head()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,ldl_domain,ldl_path,ldl_filename,ldl_getArg,dld_url,dld_domain,dld_path,dld_filename,dld_getArg,urlLen,domainlength,pathLength,subDirLen,fileNameLen,this.fileExtLen,ArgLen,pathurlRatio,ArgUrlRatio,argDomanRatio,domainUrlRatio,pathDomainRatio,argPathRatio,executable,isPortEighty,NumberofDotsinURL,ISIpAddressInDomainName,CharacterContinuityRate,LongestVariableValue,URL_DigitCount,host_DigitCount,Directory_DigitCount,File_name_DigitCount,Extension_DigitCount,Query_DigitCount,URL_Letter_Count,host_letter_count,Directory_LetterCount,Filename_LetterCount,Extension_LetterCount,Query_LetterCount,LongestPathTokenLength,Domain_LongestWordLength,Path_LongestWordLength,sub-Directory_LongestWordLength,Arguments_LongestWordLength,URL_sensitiveWord,URLQueries_variable,spcharUrl,delimeter_Domain,delimeter_path,delimeter_Count,NumberRate_URL,NumberRate_Domain,NumberRate_DirectoryName,NumberRate_FileName,NumberRate_Extension,NumberRate_AfterPath,SymbolCount_URL,SymbolCount_Domain,SymbolCount_Directoryname,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,4,5,5.5,14,4.4,4,8,3,0,0,0,0,0,0,0,0,0,0,58,25,26,26,13,1,2,0.448276,0.034483,0.08,0.431034,1.04,0.07692308,0,-1,5,-1,0.6,-1,1,0,0,0,1,-1,47,22,8,13,0,-1,13,14,13,5,-1,0,0,3,0,2,-1,0.017241,0.0,0.0,0.066667,1.0,-1.0,8,3,2,1,0,-1,0.726298,0.784493,0.894886,0.850608,,-1.0,Defacement
1,0,4,5,5.5,14,6.0,4,12,4,0,0,0,0,0,0,0,0,0,0,66,25,34,34,2,2,2,0.515151,0.030303,0.08,0.378788,1.36,0.05882353,0,-1,4,-1,0.6,-1,0,0,0,0,0,-1,56,22,8,13,9,-1,13,14,13,5,-1,0,0,4,0,1,-1,0.0,0.0,0.0,0.0,,-1.0,8,3,3,0,0,-1,0.688635,0.784493,0.814725,0.859793,0.0,-1.0,Defacement
2,0,4,5,5.5,14,5.8,4,12,5,0,0,0,0,0,0,0,0,0,0,65,25,33,33,2,2,2,0.507692,0.030769,0.08,0.384615,1.32,0.060606062,0,-1,4,-1,0.6,-1,0,0,0,0,0,-1,55,22,8,13,8,-1,13,14,13,5,-1,0,0,4,0,1,-1,0.0,0.0,0.0,0.0,,-1.0,8,3,3,0,0,-1,0.695049,0.784493,0.814725,0.80188,0.0,-1.0,Defacement
3,0,4,12,5.5,14,5.5,4,32,16,0,0,0,0,0,0,0,0,0,0,109,25,77,77,2,2,2,0.706422,0.018349,0.08,0.229358,3.08,0.025974026,0,-1,4,-1,0.6,-1,0,0,0,0,0,-1,92,22,8,13,45,-1,52,14,13,13,-1,0,0,4,0,8,-1,0.0,0.0,0.0,0.0,,-1.0,8,3,3,0,0,-1,0.64013,0.784493,0.814725,0.66321,0.0,-1.0,Defacement
4,0,4,6,5.5,14,7.333334,4,18,11,0,0,0,0,0,0,0,0,0,0,81,25,49,49,2,2,2,0.604938,0.024691,0.08,0.308642,1.96,0.040816326,0,-1,4,-1,0.6,-1,0,0,0,0,0,-1,70,22,8,13,23,-1,24,14,13,13,-1,0,0,4,0,2,-1,0.0,0.0,0.0,0.0,,-1.0,8,3,3,0,0,-1,0.681307,0.784493,0.814725,0.804526,0.0,-1.0,Defacement


In [12]:
df.tail()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,ldl_domain,ldl_path,ldl_filename,ldl_getArg,dld_url,dld_domain,dld_path,dld_filename,dld_getArg,urlLen,domainlength,pathLength,subDirLen,fileNameLen,this.fileExtLen,ArgLen,pathurlRatio,ArgUrlRatio,argDomanRatio,domainUrlRatio,pathDomainRatio,argPathRatio,executable,isPortEighty,NumberofDotsinURL,ISIpAddressInDomainName,CharacterContinuityRate,LongestVariableValue,URL_DigitCount,host_DigitCount,Directory_DigitCount,File_name_DigitCount,Extension_DigitCount,Query_DigitCount,URL_Letter_Count,host_letter_count,Directory_LetterCount,Filename_LetterCount,Extension_LetterCount,Query_LetterCount,LongestPathTokenLength,Domain_LongestWordLength,Path_LongestWordLength,sub-Directory_LongestWordLength,Arguments_LongestWordLength,URL_sensitiveWord,URLQueries_variable,spcharUrl,delimeter_Domain,delimeter_path,delimeter_Count,NumberRate_URL,NumberRate_Domain,NumberRate_DirectoryName,NumberRate_FileName,NumberRate_Extension,NumberRate_AfterPath,SymbolCount_URL,SymbolCount_Domain,SymbolCount_Directoryname,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
36702,29,4,14,5.75,12,3.666667,4,20,24,3,0,3,0,2,0,0,0,0,0,146,26,113,113,2,2,85,0.773973,0.582192,3.269231,0.178082,4.346154,0.7522124,0,-1,5,-1,0.5,23,31,0,4,0,27,3,94,23,46,7,14,24,43,12,11,11,23,0,3,6,0,2,5,0.212329,0.0,0.064516,0.529412,0.627907,0.066667,19,3,11,3,2,7,0.690555,0.791265,0.777498,0.690227,0.656684,0.796205,spam
36703,0,4,13,3.75,8,8.461538,4,24,23,0,0,0,0,0,0,0,0,0,0,147,18,122,122,2,2,2,0.829932,0.013605,0.111111,0.122449,6.777778,0.016393442,0,-1,5,-1,0.5,-1,21,0,0,0,21,-1,101,15,7,6,69,-1,105,8,9,9,-1,0,0,3,0,2,-1,0.142857,0.0,0.0,0.1875,0.2,-1.0,23,3,2,16,15,-1,0.665492,0.82001,0.879588,0.6744,0.674671,-1.0,spam
36704,58,3,27,6.666666,16,3.375,3,41,34,20,0,20,0,18,12,0,12,0,12,246,22,217,217,2,2,182,0.882114,0.739837,8.272727,0.089431,9.863636,0.83870965,0,-1,7,-1,0.772727,58,57,0,6,0,51,1,156,20,71,3,58,48,118,16,12,12,0,0,1,12,0,9,1,0.231707,0.0,0.073171,0.377778,0.418033,0.029412,26,2,14,8,7,9,0.656807,0.801139,0.684777,0.713622,0.717187,0.705245,spam
36705,35,3,13,4.333334,9,3.6,3,15,13,7,0,7,0,7,4,0,4,0,4,116,15,94,94,2,2,71,0.810345,0.612069,4.733333,0.12931,6.266667,0.7553192,0,-1,3,-1,0.666667,32,25,0,0,0,25,23,73,13,4,11,41,12,75,9,8,8,0,0,2,3,0,3,3,0.215517,0.0,0.0,0.284091,0.333333,0.418182,14,2,1,9,8,3,0.725963,0.897617,0.871049,0.745932,0.758824,0.790772,spam
36706,40,3,25,6.666666,16,3.25,3,35,31,19,0,19,0,17,6,0,6,0,6,227,22,198,198,2,2,164,0.872247,0.722467,7.454546,0.096916,9.0,0.82828283,0,-1,6,-1,0.772727,40,52,0,6,1,45,2,144,20,50,6,64,31,118,16,10,10,0,0,1,11,0,8,1,0.229075,0.0,0.083333,0.365079,0.381356,0.06,24,2,13,7,6,7,0.674351,0.801139,0.697282,0.730563,0.731481,0.769238,spam


# Functions for Testing
> Now that our data has been collected it is time to create functions that will be used in later tests.

In [0]:
def loadData(csvFile):
    pickleDump = '{}.pickle'.format(csvFile)
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
    else:
        df = pd.read_csv(csvFile, low_memory=False)
        # clean data
        # strip the whitspaces from column names
        df = df.rename(str.strip, axis='columns')
        #df.drop(columns=[], inplace=True)
        # drop missing values/NaN etc.
        #df.dropna(inplace=True)
        # drop Infinity rows and NaN string from each column
        for col in df.columns:
            indexNames = df[df[col]=='Infinity'].index
            if not indexNames.empty:
                print('deleting {} rows with Infinity in column {}'.format(len(indexNames), col))
                df.drop(indexNames, inplace=True)
            indexNames = df[df[col]=='NaN'].index
            if not indexNames.empty:
                print('deleting {} rows with NaN in column {}'.format(len(indexNames), col))
                df.drop(indexNames, inplace=True)
        
        df.to_pickle(pickleDump)
    
    return df


In [0]:
def baseline_model(feature_layer,inputDim=-1):
    global model_name
    model = tf.keras.Sequential([
        feature_layer,
        Dense(128, activation='relu', input_shape=(inputDim,)),
    #print(f"out_shape[1]:{out_shape[1]}")
        Dense(32, activation='relu'),
        Dense(5, activation='softmax')
    ]) #This is the output layer

    print('Categorical Cross-Entropy Loss Function')
    model_name += "_categorical"
    model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])
#         else:
#             model_name += "_binary"
#             print('Binary Cross-Entropy Loss Function')
#             model.compile(optimizer='adam',
#                     loss='binary_crossentropy',
#                     metrics=['accuracy'])
    return model

In [0]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe=dataframe.copy()
    data_y=dataframe.pop(dep_var)
    encoder = LabelEncoder()
    encoder.fit(data_y)
    data_y = encoder.transform(data_y)
    dummy_y = to_categorical(data_y)
    dataframe = dataframe.copy()
    labels = dummy_y
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

# Test LoadData Function
> This will look just like the FastAI test, but we are using Tensor, so lets make sure it works.

In [0]:
df1 = loadData('FinalDataset/All.csv')

In [17]:
df1.columns


Index(['Querylength', 'domain_token_count', 'path_token_count',
       'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
       'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
       'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
       'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
       'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
       'ISIpAddressInDomainName', 'CharacterContinuityRate',
       'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
       'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
       'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
       'Directory_LetterCount', 'Filename_LetterCount',
       'Extension_LetterCount', 'Query_LetterCount', 'LongestPathToken

In [18]:
df1.shape

(36697, 80)

In [19]:
df1['NumberRate_Extension'][:10]

0    1.0
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    1.0
Name: NumberRate_Extension, dtype: float64

In [20]:
df1.shape

(36697, 80)

In [21]:
df1.head()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,ldl_domain,ldl_path,ldl_filename,ldl_getArg,dld_url,dld_domain,dld_path,dld_filename,dld_getArg,urlLen,domainlength,pathLength,subDirLen,fileNameLen,this.fileExtLen,ArgLen,pathurlRatio,ArgUrlRatio,argDomanRatio,domainUrlRatio,pathDomainRatio,argPathRatio,executable,isPortEighty,NumberofDotsinURL,ISIpAddressInDomainName,CharacterContinuityRate,LongestVariableValue,URL_DigitCount,host_DigitCount,Directory_DigitCount,File_name_DigitCount,Extension_DigitCount,Query_DigitCount,URL_Letter_Count,host_letter_count,Directory_LetterCount,Filename_LetterCount,Extension_LetterCount,Query_LetterCount,LongestPathTokenLength,Domain_LongestWordLength,Path_LongestWordLength,sub-Directory_LongestWordLength,Arguments_LongestWordLength,URL_sensitiveWord,URLQueries_variable,spcharUrl,delimeter_Domain,delimeter_path,delimeter_Count,NumberRate_URL,NumberRate_Domain,NumberRate_DirectoryName,NumberRate_FileName,NumberRate_Extension,NumberRate_AfterPath,SymbolCount_URL,SymbolCount_Domain,SymbolCount_Directoryname,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,4,5,5.5,14,4.4,4,8,3,0,0,0,0,0,0,0,0,0,0,58,25,26,26,13,1,2,0.448276,0.034483,0.08,0.431034,1.04,0.07692308,0,-1,5,-1,0.6,-1,1,0,0,0,1,-1,47,22,8,13,0,-1,13,14,13,5,-1,0,0,3,0,2,-1,0.017241,0.0,0.0,0.066667,1.0,-1.0,8,3,2,1,0,-1,0.726298,0.784493,0.894886,0.850608,,-1.0,Defacement
1,0,4,5,5.5,14,6.0,4,12,4,0,0,0,0,0,0,0,0,0,0,66,25,34,34,2,2,2,0.515151,0.030303,0.08,0.378788,1.36,0.05882353,0,-1,4,-1,0.6,-1,0,0,0,0,0,-1,56,22,8,13,9,-1,13,14,13,5,-1,0,0,4,0,1,-1,0.0,0.0,0.0,0.0,,-1.0,8,3,3,0,0,-1,0.688635,0.784493,0.814725,0.859793,0.0,-1.0,Defacement
2,0,4,5,5.5,14,5.8,4,12,5,0,0,0,0,0,0,0,0,0,0,65,25,33,33,2,2,2,0.507692,0.030769,0.08,0.384615,1.32,0.060606062,0,-1,4,-1,0.6,-1,0,0,0,0,0,-1,55,22,8,13,8,-1,13,14,13,5,-1,0,0,4,0,1,-1,0.0,0.0,0.0,0.0,,-1.0,8,3,3,0,0,-1,0.695049,0.784493,0.814725,0.80188,0.0,-1.0,Defacement
3,0,4,12,5.5,14,5.5,4,32,16,0,0,0,0,0,0,0,0,0,0,109,25,77,77,2,2,2,0.706422,0.018349,0.08,0.229358,3.08,0.025974026,0,-1,4,-1,0.6,-1,0,0,0,0,0,-1,92,22,8,13,45,-1,52,14,13,13,-1,0,0,4,0,8,-1,0.0,0.0,0.0,0.0,,-1.0,8,3,3,0,0,-1,0.64013,0.784493,0.814725,0.66321,0.0,-1.0,Defacement
4,0,4,6,5.5,14,7.333334,4,18,11,0,0,0,0,0,0,0,0,0,0,81,25,49,49,2,2,2,0.604938,0.024691,0.08,0.308642,1.96,0.040816326,0,-1,4,-1,0.6,-1,0,0,0,0,0,-1,70,22,8,13,23,-1,24,14,13,13,-1,0,0,4,0,2,-1,0.0,0.0,0.0,0.0,,-1.0,8,3,3,0,0,-1,0.681307,0.784493,0.814725,0.804526,0.0,-1.0,Defacement


  # Experimenting with Final Dataset/All.csv
  
  #### Total Samples for each Type

In [22]:
label = 'URL_Type_obf_Type'
lblTypes=set(df[label])
for lbl in lblTypes:
    print('| {} | {} |'.format(lbl, len(df[df[label] == lbl].index)))

| benign | 7781 |
| phishing | 7586 |
| spam | 6698 |
| Defacement | 7930 |
| malware | 6712 |


In [0]:
dataPath = 'FinalDataset'
dep_var = label
cont_names = list(set(df.columns) - set(cat_names) - set([dep_var]))

In [24]:
cont_names

['Query_DigitCount',
 'URL_sensitiveWord',
 'Entropy_DirectoryName',
 'NumberRate_FileName',
 'SymbolCount_Directoryname',
 'domainlength',
 'Directory_DigitCount',
 'URLQueries_variable',
 'delimeter_Count',
 'Entropy_Domain',
 'pathDomainRatio',
 'this.fileExtLen',
 'SymbolCount_Extension',
 'ArgUrlRatio',
 'CharacterContinuityRate',
 'argPathRatio',
 'dld_getArg',
 'fileNameLen',
 'NumberRate_AfterPath',
 'dld_domain',
 'Extension_DigitCount',
 'argDomanRatio',
 'NumberRate_DirectoryName',
 'longdomaintokenlen',
 'urlLen',
 'isPortEighty',
 'Directory_LetterCount',
 'ldl_path',
 'charcompvowels',
 'Entropy_Extension',
 'avgpathtokenlen',
 'ArgLen',
 'avgdomaintokenlen',
 'NumberRate_URL',
 'ldl_url',
 'dld_path',
 'Path_LongestWordLength',
 'sub-Directory_LongestWordLength',
 'Query_LetterCount',
 'executable',
 'pathLength',
 'LongestPathTokenLength',
 'Entropy_Filename',
 'Domain_LongestWordLength',
 'delimeter_path',
 'ISIpAddressInDomainName',
 'URL_DigitCount',
 'Filename_Lette

In [25]:
df1['argPathRatio']

0         0.07692308
1         0.05882353
2        0.060606062
3        0.025974026
4        0.040816326
5        0.033898305
6        0.046511628
7               0.04
8        0.045454547
9         0.09090909
10        0.04347826
11       0.039215688
12         0.0952381
13        0.10526316
14              0.08
15        0.08695652
16        0.03846154
17       0.083333336
18        0.01724138
19       0.016949153
20       0.020408163
21       0.012578616
22       0.014814815
23       0.014084507
24            0.0125
25       0.018181818
26              0.05
27       0.037037037
28       0.039215688
29       0.036363635
            ...     
36677      0.7925926
36678     0.76363635
36679      0.7941176
36680       0.015625
36681    0.057142857
36682     0.08510638
36683      0.6626506
36684            0.1
36685      0.4074074
36686    0.036363635
36687       0.578125
36688     0.12941177
36689      0.7126437
36690     0.78504676
36691     0.08695652
36692           0.04
36693     0.0

In [0]:
df1.argPathRatio = df1['argPathRatio'].astype('float')

In [27]:
for col in df1.columns:
  print(df[col])

0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24         0
25         0
26         0
27         0
28         0
29         0
        ... 
36677     30
36678     41
36679     30
36680      0
36681      0
36682      0
36683     42
36684      0
36685      4
36686      0
36687     14
36688      2
36689     17
36690     41
36691      0
36692      0
36693      0
36694     40
36695      0
36696      0
36697    941
36698     51
36699      6
36700      0
36701      0
36702     29
36703      0
36704     58
36705     35
36706     40
Name: Querylength, Length: 36707, dtype: int64
0        4
1        4
2        4
3        4
4        4
5        4
6        4
7        4
8        4
9        4
10       4
11       4
12       4
13       4
14    

# Experimenting with Tensorflow Keras

#### Globals for Testing


In [0]:
dataFile = 'All.csv'
optimizer='adam'
epochs=10
batch_size=10
feature_columns = []

#### Numeric Columns setup

In [0]:
#numeric columns
for header in ['dld_getArg',
 'URLQueries_variable',
 'Directory_DigitCount',
 'URL_sensitiveWord',
 'spcharUrl',
 'URL_Letter_Count',
 'SymbolCount_Directoryname',
 'host_DigitCount']:
  feature_columns.append(feature_column.numeric_column(header))

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

#### Training Setup


In [0]:
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

batch_size = 32 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

#### Optimizer setup

In [37]:
seed = 7
np.random.seed(seed)
cvscores = []
print('optimizer: {} epochs: {} batch_size: {}'.format(optimizer, epochs, batch_size))

optimizer: adam epochs: 10 batch_size: 32


#### Create Data for Analysis

In [0]:
time_gen = int(time.time())
global model_name

model_name = f"{dataFile}_{time_gen}"
# we need to invoke a dataFile, but im not sure what it will be.
tensorboard = TensorBoard(log_dir='logs/{}'.format(model_name))

#### Define 5-fold cross validation test harness


In [38]:
inputDim = 79
print('inputdim = ', inputDim)

inputdim =  79


#### Model Creation

In [41]:
model = baseline_model(feature_layer, inputDim)
model.fit(train_ds, validation_data=val_ds, epochs=epochs)
model.save('{}.model'.format(os.path.basename(dataPath)))

W0712 21:37:09.619459 140142850721664 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Categorical Cross-Entropy Loss Function
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Setup Final Results

In [43]:
scores = model.evaluate(test_ds, verbose=1)
print(model.metrics_names)
acc, loss=scores[1]*100, scores[0]*100
print('Baseline: accuracy: {:.2f}%: loss: {:.2f}'.format(acc, loss))

resultFile = os.path.join(resultPath, dataFile)
with open('{}.result'.format(resultFile), 'a') as fout:
  fout.write('{} results...'.format(model_name))
  fout.write('\taccuracy: {:.2f} loss: {:.2f}\n'.format(acc, loss))

['loss', 'acc']
Baseline: accuracy: 65.30%: loss: 90.95
