In [2]:
#!pip install --upgrade tensorflow
!pip install Keras
!pip install pandas
#!pip install pandas
!pip install matplotlib


Collecting Keras
  Using cached https://files.pythonhosted.org/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl
Collecting keras-applications>=1.0.6 (from Keras)
Collecting numpy>=1.9.1 (from Keras)
  Using cached https://files.pythonhosted.org/packages/d7/b1/3367ea1f372957f97a6752ec725b87886e12af1415216feec9067e31df70/numpy-1.16.5-cp27-cp27mu-manylinux1_x86_64.whl
Collecting keras-preprocessing>=1.0.5 (from Keras)
  Using cached https://files.pythonhosted.org/packages/28/6a/8c1f62c37212d9fc441a7e26736df51ce6f0e38455816445471f10da4f0a/Keras_Preprocessing-1.1.0-py2.py3-none-any.whl
Collecting h5py (from Keras)
  Using cached https://files.pythonhosted.org/packages/12/90/3216b8f6d69905a320352a9ca6802a8e39fdb1cd93133c3d4163db8d5f19/h5py-2.10.0-cp27-cp27mu-manylinux1_x86_64.whl
Collecting scipy>=0.14 (from Keras)
  Using cached https://files.pythonhosted.org/packages/1d/f6/7c16d60aeb3694e5611976cb4f1eaf1c6b7f1e7c55771d691013405a02

## ENCODE FUNCTION

In [1]:
import base64
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.model_selection import train_test_split
import itertools


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#{name}-{tv}
    
# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = f"{name}-{tv}"
        df[name2] = l
        
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"


# Regression chart.
def chart_regression(pred, y, sort=True):
    t = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        t.sort_values(by=['y'], inplace=True)
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean())
                          >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low


# This function submits an assignment.  You can submit an assignment as much as you like, only the final
# submission counts.  The paramaters are as follows:
# data - Pandas dataframe output.
# key - Your student key that was emailed to you.
# no - The assignment class number, should be 1 through 1.
# source_file - The full path to your Python or IPYNB file.  This must have "_class1" as part of its name.  
# .             The number must match your assignment number.  For example "_class2" for class assignment #2.
def submit(data,key,no,source_file=None):
    if source_file is None and '__file__' not in globals(): raise Exception('Must specify a filename when a Jupyter notebook.')
    if source_file is None: source_file = __file__
    suffix = '_class{}'.format(no)
    if suffix not in source_file: raise Exception('{} must be part of the filename.'.format(suffix))
    with open(source_file, "rb") as image_file:
        encoded_python = base64.b64encode(image_file.read()).decode('ascii')
    ext = os.path.splitext(source_file)[-1].lower()
    if ext not in ['.ipynb','.py']: raise Exception("Source file is {} must be .py or .ipynb".format(ext))
    r = requests.post("https://api.heatonresearch.com/assignment-submit",
        headers={'x-api-key':key}, json={'csv':base64.b64encode(data.to_csv(index=False).encode('ascii')).decode("ascii"),
        'assignment': no, 'ext':ext, 'py':encoded_python})
    if r.status_code == 200:
        print("Success: {}".format(r.text))
    else: print("Failure: {}".format(r.text))

## READ DATASET

In [11]:
from keras.utils.data_utils import get_file
#ver como llamar con hadoop.
try:
    path = './julioredux3.csv'
    #ugr_file = open(path,'r')
    #path = get_file('kddcup.data.gz', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz')
except:
    print('Error loading')
    raise
    
print(path) 

# This file is a CSV, just no CSV extension or headers
# Download from: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
df = pd.read_csv(path, header=None)

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
df.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'te',
    'td',
    'sa',
    'da',
    'sp',
    'dp',
    'pr',
    'flg',
    'fwd',
    'stos',
    'pkt',
    'byt',
    'norm'
]

# display 5 rows
#df[0:6]

./julioredux3.csv
Read 2493050 rows.


## ANALYSING DATASET

In [16]:
ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(df):
    print()
    print("Analyzing: {}".format(df))
    #df = pd.read_csv(path,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])
           
     

In [17]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore

analyze(df)


Analyzing:                           te       td               sa               da  \
0        2016-07-27 13:43:21   48.380   187.96.221.207     42.219.153.7   
1        2016-07-27 13:43:21   48.380     42.219.153.7   187.96.221.207   
2        2016-07-27 13:43:25   50.632   42.219.153.191   62.205.150.146   
3        2016-07-27 13:43:25   51.052   62.205.150.146   42.219.153.191   
4        2016-07-27 13:43:27   46.996    92.225.28.133   42.219.155.111   
5        2016-07-27 13:43:27   48.852     143.72.8.137   42.219.154.107   
6        2016-07-27 13:43:27   48.852   42.219.154.107     143.72.8.137   
7        2016-07-27 13:43:28    0.000     143.72.8.137   42.219.154.121   
8        2016-07-27 13:43:28    0.000   42.219.152.249     62.83.121.37   
9        2016-07-27 13:43:28    0.000   42.219.154.121     143.72.8.137   
10       2016-07-27 13:43:28    0.000   42.219.158.242      57.41.5.186   
11       2016-07-27 13:43:28    0.000     62.83.121.37   42.219.152.249   
12       2016

** te:1416 (0%)
** td:30875 (1%)
** sa:80466 (3%)
** da:220607 (8%)
** sp:64718 (2%)
** dp:64286 (2%)
** pr:[TCP:65.4%,UDP:33.74%,ICMP:0.79%,GRE:0.05%,ESP:0.02%,IPIP:0.0%,IPv6:0.0%]
** flg:[.A....:38.07%,.AP.SF:29.14%,....S.:9.0%,.AP.S.:4.68%,.AP...:4.23%,.APRSF:4.17%,.A...F:2.63%,.A..SF:2.18%,.APRS.:1.57%,.AP..F:1.19%,...R..:1.1%,.A..S.:0.76%,.A.R..:0.68%,.APR..:0.15%,.A.R.F:0.15%,.A.RS.:0.12%,.APR.F:0.12%,.A.RSF:0.03%,...RS.:0.02%,......:0.0%,..P.S.:0.0%,UA...F:0.0%]
** fwd:[0:100.0%]
** stos:[0:83.34%,40:6.57%,72:6.46%,2:1.2%,8:0.76%,64:0.52%,4:0.29%,24:0.23%,26:0.18%,192:0.16%,16:0.05%,200:0.05%,104:0.04%,75:0.03%,74:0.02%,42:0.01%,184:0.01%,80:0.01%,20:0.01%,56:0.01%,10:0.01%,28:0.0%,96:0.0%,196:0.0%,43:0.0%,6:0.0%,12:0.0%,32:0.0%,48:0.0%,66:0.0%,44:0.0%,208:0.0%,73:0.0%,76:0.0%,3:0.0%,23:0.0%,202:0.0%,224:0.0%,216:0.0%,9:0.0%,88:0.0%,194:0.0%,1:0.0%]
** pkt:3865 (0%)
** byt:86592 (3%)
** norm:[background:99.73%,blacklist:0.27%]


In [4]:
df.info()
df.head()
df.drop(['te'], axis=1, inplace=True)

#encode_text_index(df, 'te')
#print ("1")
encode_text_index(df, 'td')
print ("2")
encode_text_index(df, 'sa')
print ("3")
encode_text_index(df, 'da')
print ("4")
encode_numeric_zscore(df, 'sp')
print ("5")
encode_numeric_zscore(df, 'dp')
print ("6")
encode_text_index(df, 'pr')
print ("7")
encode_text_index(df, 'flg')
print ("8")
encode_text_index(df, 'fwd')
print ("9")
encode_numeric_zscore(df, 'stos')
print ("10")
encode_numeric_zscore(df, 'pkt')
print ("11")
encode_numeric_zscore(df, 'byt')
print ("12")
#encode_text_dummy(df, 'norm')
outcomes = encode_text_index(df, 'norm')
num_classes = len(outcomes)

#df.dropna(inplace=True,axis=0)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14964378 entries, 0 to 14964377
Data columns (total 13 columns):
te      object
td      float64
sa      object
da      object
sp      int64
dp      int64
pr      object
flg     object
fwd     int64
stos    int64
pkt     int64
byt     int64
norm    object
dtypes: float64(1), int64(6), object(6)
memory usage: 1.4+ GB
2
3
4
5
6
7
8
9
10
11
12


Unnamed: 0,td,sa,da,sp,dp,pr,flg,fwd,stos,pkt,byt,norm
0,0,21524,2550,-1.161711,-0.677528,6,5,0,-0.656043,-0.01964,-0.01009,1
1,0,138534,2573,-1.164591,1.089758,6,5,0,-0.656043,-0.01964,-0.010038,1
2,0,138534,2573,-1.164591,1.327161,6,5,0,-0.656043,-0.01964,-0.010037,1
3,0,138534,3656,-1.164591,0.859621,6,5,0,-0.656043,-0.01964,-0.01006,1
4,0,138534,3659,-1.164591,1.113931,6,5,0,-0.656043,-0.01964,-0.010068,1


In [5]:
import pandas as pd
import io
import requests
import numpy as np
import os
from sklearn.model_selection import train_test_split
#from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard



# Break into X (predictors) & y (prediction)
x, y = to_xy(df,'norm')

# Create a test/train split.  25% test
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

#Create neural net
model = Sequential()
model.add(Dense(25, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(50, input_dim=x.shape[1], kernel_initializer='random_normal', activation='relu'))
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
#Cambia 1 por 2
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
tensorboard = TensorBoard(log_dir="tb/{}", histogram_freq=0,  
          write_graph=True, write_images=True)
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor, tensorboard],verbose=2,epochs=1000)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 11223283 samples, validate on 3741095 samples
Epoch 1/1000
 - 562s - loss: 0.0522 - val_loss: 0.0527
Epoch 2/1000
 - 592s - loss: 0.0520 - val_loss: 0.0527
Epoch 3/1000
 - 569s - loss: 0.0520 - val_loss: 0.0527
Epoch 4/1000
 - 648s - loss: 0.0520 - val_loss: 0.0527
Epoch 5/1000
 - 670s - loss: 0.0520 - val_loss: 0.0527
Epoch 6/1000
 - 786s - loss: 0.0520 - val_loss: 0.0527
Epoch 00006: early stopping


<keras.callbacks.History at 0x1e243918dd8>

In [6]:
# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

Validation score: 0.9967290325426111
