In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import sys, os
import os
from sklearn.linear_model import LogisticRegression
import re

import math
from collections import Counter


def getTokens(input):


	tokens = re.split("[, \-?:./]+",input)
	tokens = list(set(tokens))

	if 'com' in tokens:
		# Delete com as most url have it
		tokens.remove('com')

	return tokens


allurls = 'https://github.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/raw/master/data/data.csv'	#path to our all urls file
allurlscsv = pd.read_csv(allurls,',',error_bad_lines=False)	#reading file
allurlsdata = pd.DataFrame(allurlscsv)	#converting to a dataframe

allurlsdata = np.array(allurlsdata)	#converting it into an array
random.shuffle(allurlsdata)	#shuffling

y = [d[1] for d in allurlsdata]	#all labels 
corpus = [d[0] for d in allurlsdata]	#all urls corresponding to a label (either good or bad)
vectorizer = TfidfVectorizer(tokenizer=getTokens)	#get a vector for each url but use our customized tokenizer
X = vectorizer.fit_transform(corpus)	#get the X vector

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)	#split into training and testing set 80/20 ratio

lgs = LogisticRegression()	#using logistic regression
lgs.fit(X_train, y_train)
print(lgs.score(X_test, y_test))	#pring the score. It comes out to be 98%




0.9832566325377855


In [0]:
X_predict = ['wikipedia.com','google.com/search=faizanahad','pakistanifacebookforever.com/getpassword.php/','www.radsport-voggel.de/wp-admin/includes/log.exe','ahrenhei.without-transfer.ru/nethost.exe','www.itidea.it/centroesteticosothys/img/_notes/gum.exe']

X_predict_transformed = vectorizer.transform(X_predict)

y_Predict = lgs.predict(X_predict_transformed)

for i in range(len(X_predict)):
    print(X_predict[i], ':', y_Predict[i])

wikipedia.com : good
google.com/search=faizanahad : bad
pakistanifacebookforever.com/getpassword.php/ : bad
www.radsport-voggel.de/wp-admin/includes/log.exe : bad
ahrenhei.without-transfer.ru/nethost.exe : bad
www.itidea.it/centroesteticosothys/img/_notes/gum.exe : bad


In [0]:
# %tensorflow_version 1.x
# trainign neural network
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
import json
import numpy as np
from keras.models import load_model 

Using TensorFlow backend.


In [0]:
y_converted = np.array([[1, 0] if label=='good' else [0, 1] for label in y])
X_train, X_test, y_train, y_test = train_test_split(X, y_converted, test_size=0.95, random_state=42)

In [0]:
token_length = len(vectorizer.get_feature_names())
token_length

240794

In [44]:
X_train.shape

(21023, 240794)

In [0]:
# Logistic regression model with only one layera and two output point.

model = Sequential()
model.add(Dense(2, activation='softmax', input_shape=(token_length,)))
adam = keras.optimizers.Adam(lr=0.01)
model.compile(loss='binary_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])
model.fit(X_train, y_train,
          epochs=1,
          batch_size=128)


Epoch 1/1
 3712/21023 [====>.........................] - ETA: 23s - loss: 0.6217 - acc: 0.6840

In [0]:
model.save('my_model.h5') 

In [41]:
my_model = load_model('my_model.h5')
score = my_model.evaluate(X_test[:10000], y_test[:10000], batch_size=128)
print("Baseline Error: %.2f%%" % (100-score[1]*100))
# my_model.predict(X_test)

Baseline Error: 10.37%


In [32]:
X_predict = ['wikipedia.com','google.com/search=faizanahad','pakistanifacebookforever.com/getpassword.php/','www.radsport-voggel.de/wp-admin/includes/log.exe','ahrenhei.without-transfer.ru/nethost.exe','www.itidea.it/centroesteticosothys/img/_notes/gum.exe']

X_predict_transformed = vectorizer.transform(X_predict)

y_Predict = my_model.predict(X_predict_transformed)

for i in range(len(X_predict)):
    # (1,0) = good and (0, 1) = bad
    if y_Predict[i][0] > y_Predict[i][1]:
        result = 'good'
    else:
        result = 'bad'
    print(X_predict[i], ':', result)

wikipedia.com : good
google.com/search=faizanahad : bad
pakistanifacebookforever.com/getpassword.php/ : bad
www.radsport-voggel.de/wp-admin/includes/log.exe : bad
ahrenhei.without-transfer.ru/nethost.exe : bad
www.itidea.it/centroesteticosothys/img/_notes/gum.exe : bad


In [0]:
# Converting keras model for tensorflow js
!tensorflowjs_converter --input_format keras my_model.h5 model

In [0]:
arr_tokens_count_in_docs = np.zeros(token_length)

for i in range(X.shape[0]):
    arr = X[i].toarray().flatten()
    idx_arr = np.where(arr>0)[0]
    for idx in idx_arr:
        arr_tokens_count_in_docs[idx] += 1

    if i % 20000:
        print(i, ' rows compiled!')

In [0]:
tokens_doc_count_dict = {}
tokens_list = vectorizer.get_feature_names()
tokens_index = {}
for i in range(len(tokens_list)):
    tokens_doc_count_dict[tokens_list[i]] = arr_tokens_count_in_docs[i]
    tokens_index[tokens_list[i]] = i

In [26]:
import json

tokens_info = {
    'tokens': tokens_list,
    'counts': tokens_doc_count_dict,
    'indexes': tokens_index
}
with open('model/tokenInfo.json', 'w') as outfile:
    json.dump(tokens_info, outfile)

print('Json file compiled!')

Json file compiled!


In [0]:
!zip model.zip model

In [27]:
!zip -r /content/model.zip /content/model/

  adding: content/model/ (stored 0%)
  adding: content/model/tokenInfo.json (deflated 66%)
  adding: content/model/group1-shard1of1.bin (deflated 7%)
  adding: content/model/model.json (deflated 56%)


In [0]:
model.save('logistic-regression')

In [0]:
import tensorflowjs as tfjs
tfjs.converters.save_keras_model(my_model, "tf_model")

ValueError: ignored

In [0]:
len_tokens = len(vectorizer.get_feature_names())

In [0]:
tokens_doc_count = np.zeros(len(vectorizer.get_feature_names()))
# tokens_doc_count = {}
# for name in vectorizer.get_feature_names():
#     tokens_doc_count[name] = 0


In [0]:
for i in range(5):
    

In [0]:
for i in range(X.shape[0]):
    arr = X[i].toarray().flatten()
    idx_arr = np.where(arr>0)[0]
    for idx in idx_arr:
        tokens_doc_count[idx] += 1

In [0]:
tokens_doc_count_dict = {}
tokens_list = vectorizer.get_feature_names()
tokens_index = {}
for i in range(len(tokens_list)):
    tokens_doc_count_dict[tokens_list[i]] = tokens_doc_count[i]
    tokens_index[tokens_list[i]] = i

In [0]:
tokens_doc_count_dict['biz']

1075.0

In [0]:
with open('tokenCount.json', 'w') as outfile:
    json.dump(tokens_doc_count_dict, outfile)

In [0]:
import json

tokens_info = {
    'tokens': tokens_list,
    'counts': tokens_doc_count_dict,
    'indexes': tokens_index
}
with open('tokenInfo.json', 'w') as outfile:
    json.dump(tokens_info, outfile)

In [0]:
token_list_js = f'const tokens = {tokens_list};' 
token_counts_js = f'const tokenCounts = {tokens_doc_count_dict};'

In [0]:
with open('token_list.js', 'w') as outfile:
    outfile.write(token_list_js)

with open('token_counts.js', 'w') as outfile:
    outfile.write(token_counts_js)
print('File write complete')

File write complete


In [0]:
with open('tokens.js', 'w') as outfile:
    outfile.write(js_script)

print('File write complete')

File write complete


In [0]:
f = open("demofile3.txt", "w")
f.write(js_script)
f.close()

In [0]:
X_test[0].toarray().shape

(1, 394817)

In [0]:
my_model = load_model('model.h5')











Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [0]:
my_model.predict(X_test)

ValueError: ignored