In [1]:
%run Imports\&functions.ipynb

In [2]:
# list of binetflow files
#READ ME: This is only a fraction of the entire dataset
#Github does not allow file size above 100mb
#This is only 78mb of an 2.73 Gb total
binetflow_list =[ "binetflow/capture20110816-2.binetflow",
                 "binetflow/capture20110818-2.binetflow",
                 "binetflow/capture20110819.binetflow"]

# read binetflows into df1-df13 variables
i = 1
for binetflow in binetflow_list:
    globals()["df" + str(i)] = pd.read_csv(binetflow)
    i+=1

#place newly read pandas into list for easier data processing
# df_list = [df1,df2,df3,df4,df5,
#            df6,df7,df8,df9,df10,
#            df11,df12,df13]

#changed for smaller dataset
df_list = [df1,df2,df3]

In [3]:
#Drop null values
#Most from one column but Dropping column lost more information than dropping nulls
for df in df_list:
    df.dropna(inplace=True)

In [4]:
#Change Label column from string to boolean for modeling
for df in df_list:
    df.Label = df.Label.str.contains("Botnet")

In [5]:
for df in df_list:
    df.drop('StartTime',axis=1, inplace=True)
    df.drop('SrcAddr',axis=1, inplace=True)
    

In [6]:
#adding dataframes together for use in cloud instance
total_df = df1.append(df2)
total_df = total_df.append(df3)

# total_df = total_df.append(df4)
# total_df = total_df.append(df5)
# total_df = total_df.append(df6)
# total_df = total_df.append(df7)
# total_df = total_df.append(df8)
# total_df = total_df.append(df9)
# total_df = total_df.append(df10)
# total_df = total_df.append(df11)
# total_df = total_df.append(df12)
# total_df = total_df.append(df13)

# #Split up append functions individually rather than loop
#Because of checking shape and errors throughout 

In [7]:
#Noticed identical rows when all dataframes added together
total_df.drop_duplicates(inplace=True)

In [8]:
#regex pattern for grabbing first 6 digits of IP address
pattern ="^\d{1,3}\.\d{1,3}\."
#replace column values with pattern grab
total_df['DstAddr'] = total_df['DstAddr'].map(lambda x: re.findall(pattern,x)[0])

In [9]:
#get dummies of categorical values that don't have value counts over 30
total_df = pd.get_dummies(total_df, columns=['State','Proto','Dir'], drop_first=True)

In [10]:
total_df.shape

(490375, 177)

In [11]:
total_df.Label.value_counts(normalize=True)

False    0.996941
True     0.003059
Name: Label, dtype: float64

In [12]:
#Random sampling of minority and majority values
#READ THIS: n_samples value significantly lowered for notebooks to be able to run on laptops 
#Label value is what we are trying to predict

df_majority = total_df[total_df.Label==False]
df_minority = total_df[total_df.Label==True]

df_majority_downsampled = resample(df_majority, 
                                 replace=False,
                                 n_samples=50000,# to match majority class
                                 random_state=42) # reproducible results

# upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=20000,# to match minority class
                                 random_state=42) # reproducible results
 

In [13]:
df_minority_upsampled[df_minority_upsampled.Label==True].shape

(20000, 177)

In [14]:
df_majority_downsampled[df_majority_downsampled.Label==False].shape

(50000, 177)

In [15]:
# Combine upsampled minority class with majority class
sampled_df = pd.concat([df_minority_upsampled, df_majority_downsampled])

In [16]:
sampled_df[sampled_df.Label==True].shape

(20000, 177)

In [17]:
#Grab top 100 value counts of the three remaining categorical columns

top_dstaddr = sampled_df.DstAddr.value_counts().index[:100]
top_dport = sampled_df.Dport.value_counts().index[:100]
top_sport = sampled_df.Sport.value_counts().index[:100]

# func that returns a dummified DataFrame of significant dummies in a given column
def dum_sign(dummy_col, threshold=1):

    # removes the bind
    dummy_col = dummy_col.copy()

    # what is the ratio of a dummy in whole column
    count = pd.value_counts(dummy_col) / len(dummy_col)

    # cond whether the ratios is higher than the threshold
    mask = dummy_col.isin(count[count > threshold].index)

    # replace the ones which ratio is lower than the threshold by a special name
    #dummy_col[~mask] = "others"

    return pd.get_dummies(dummy_col, prefix=dummy_col.name)

# assign values to pandas dataframe
dstaddr_dummies= dum_sign(top_dstaddr)
dport_dummies= dum_sign(top_dport)
sport_dummies= dum_sign(top_sport)

In [18]:
sampled_df= sampled_df.join(dstaddr_dummies)

In [19]:
sampled_df= sampled_df.join(dport_dummies)

In [20]:
# sampled_df= sampled_df.join(sport_dummies)

In [21]:
# sampled_df = pd.get_dummies(df, columns=['DstAddr'], drop_first=True)
# sampled_df.isnull().sum()
sampled_df[sampled_df.Label==True].shape

(20000, 377)

In [22]:
#remove dummied columns to save memory
sampled_df = sampled_df.drop(axis=1,labels=['DstAddr','Dport','Sport'])

In [23]:
sampled_df= sampled_df.fillna(0)

In [24]:
from sklearn.model_selection import train_test_split

y = sampled_df['Label']
X = sampled_df.drop(axis=1,columns=['Label'])

# Import and do Train/Test Split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25,random_state = 42)

In [25]:
from sklearn.linear_model import LogisticRegression

logmod = LogisticRegression(max_iter=1000,verbose=1)

# Fit on training data.
logmod.fit(X_train, y_train)

# Print coefficients.
print('Intercept:', logmod.intercept_)
print('Coef(s):', logmod.coef_)

logmod.score(X_test,y_test)



[LibLinear]Intercept: [-0.08371199]
Coef(s): [[ 8.44032488e-05 -2.13031546e-04 -1.39316744e-05  9.85061019e-04
   1.50612436e-06 -9.70961474e-05 -4.31201809e-06  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -7.02412864e-02 -1.48529698e-04 -1.83503566e-05 -2.60897525e-04
  -2.17658126e-05 -2.28023891e-05  0.00000000e+00 -3.20433755e-05
  -1.38018339e-05  0.00000000e+00 -4.06866961e-05 -1.68223116e-04
  -4.44623496e-06 -1.37658244e-05  0.00000000e+00 -8.82562966e-06
  -4.38727689e-06 -4.59224251e-06 -4.44817306e-06 -4.37102760e-06
   0.00000000e+00 -4.58494650e-06 -4.55380717e-06  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -4.39643892e-06 -1.50729883e-05 -7.69578283e-06
   0.00000000e+00 -4.56847473e-06  3.40770075e-05 -2.29082395e-03
  -6.84438165e-05 -7.73251989e-05  0.00000000e+00  0.00000000e+00
  -4.51456714e-06 -7.77717155e-05 -4.53548640e-06 -4.38018849e-06
   0.00000000e+00  0.00000000e+



0.7114285714285714

In [26]:
# Import libraries and modules

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

Using TensorFlow backend.


In [36]:
model = Sequential()

n_input = X_train.shape[1]
n_hidden = n_input

model.add(Dense(n_hidden, input_dim=n_input, activation='relu'))
# model.add(Dense(, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [37]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [38]:
#Epoch smaller for the notebook by 2 orders of magnitude
model.fit(X_train,
          y_train,
          batch_size = 589,
          epochs = 50,
          verbose = 1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x12b262240>

In [39]:
score = model.evaluate(X_test, y_test, verbose=0)
labels = model.metrics_names
print(f'{labels[0]}: {score[0]}')
print(f'{labels[1]}: {score[1]}')

loss: 4.650300817108154
acc: 0.7114857142857143


In [40]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 373)               139502    
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 374       
Total params: 139,876
Trainable params: 139,876
Non-trainable params: 0
_________________________________________________________________
