In [1]:
# Install required dependencies
! pip install tensorflow
! pip install tldextract
! pip install swifter



In [None]:
# Import required dependencies
from collections import Counter
import numpy as np
import pandas as pd
from urllib.parse import urlparse
import tldextract
import os, json, re, string, unicodedata

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curveimport matplotlib.pyplot as plt

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import swifter

In [4]:
# Constants
MAX_LEN= 500
THRESHOLD = 0.976 # Precision = 0.9997 and Recall = 0.605

In [5]:
# Functions for reading URL jsons
def convert_json_to_df(f,file,plugin_type):
    data = json.load(f)
    features_plugin = data["titles"]
    if plugin_type=='vadinfo':
        features_plugin.remove('SHA256')
    plugin_df = pd.DataFrame(columns=features_plugin, data=data["data"])
    plugin_df['snapshot'] = int(file.split("/")[-2].split('-')[1])
    return plugin_df
def get_plugin_files(files):
    urls_df = pd.DataFrame()
    timestamp = ""
    for file in files:
        file_ext = file.split('/')[-1]
        with open(file,encoding='latin1') as f:
            try:
                urls_df_i = convert_json_to_df(f, file, "urls")
                urls_df = urls_df.append(urls_df_i)
                file_ext_dot = file_ext.split('.')
                file_ext_dot_dash = file_ext_dot[0].split("_")
                timestamp = file_ext_dot_dash[1]+"_"+file_ext_dot_dash[2]
            except:
                    print("error: "+file)
    return urls_df, timestamp

In [6]:
# Functions for cleanind the URL for NLP processing
def remove_prefix(text):
    try:
        if text.startswith('ftp://'):
            text = text[len('https://'):]
        if text.startswith('https://'):
            text = text[len('https://'):]
        if text.startswith('http://'):
            text = text[len('http://'):]
        if text.startswith('www.'):
            text = text[len('www.'):]
    except:
        text = ''
    return text 

def clean(text):   
    # strip '
    text = text.strip("'")
    # convert to lower letters
    text = text.lower()  
    # remove punctuation marks
    text = re.sub('[^0-9a-zA-Z]+', ' ', text) 
    # remove extra spaces
    text = re.sub(' +', ' ', text)   
    # strip spaces
    text = text.strip(" ")  
    return text
# Clean url with remove short and long words
def clean_nlp(text):
    text = clean(text)
    text = ' '.join([x for x in text.split(' ') if x.isnumeric()==False and len(x)>1 and len(x)<21])
    return text


In [7]:
# Functions for disassemble the URL
def strip_se(url):
    return url.strip("'").strip('\n')

def add_http(url):
    if url.startswith('http'):
        return url
    return 'http://'+url

def get_domain(url):
    domain = tldextract.extract(url).domain
    if domain:
        return domain
    return ''
def get_domain(url):
    domain = tldextract.extract(url).domain
    if domain in ['ddns','bazar','onion']:
        url = url.strip('https://').strip('http://')
        urls = url.split('.')
        urls_i = urls.index(domain)
        if urls_i == 0:
            return domain
        return urls[urls_i-1]
    return domain

def get_subdomain(url):
    subdomain = tldextract.extract(url).subdomain
    domain = tldextract.extract(url).domain
    if domain in ['ddns','bazar','onion']:
        url = url.strip('https://').strip('http://')
        urls = url.split('.')
        urls_i = urls.index(domain)
        if urls_i == 0:
            return subdomain
        return ".".join(urls[:urls_i-1])
    return subdomain

def get_tld(url):
    tld = tldextract.extract(url).suffix
    domain = tldextract.extract(url).domain
    if domain in ['ddns','bazar','onion']:
        url = url.strip('https://').strip('http://')
        urls = url.split('.')
        urls_i = urls.index(domain)
        if urls_i == 0:
            return tld
        return ".".join(urls[urls_i:])
    return tld

def get_url_parsed(url):
    url_parsed = urlparse(url)
    return url_parsed

def get_path(url):
    url_parsed = urlparse(url)
    return url_parsed.path

In [8]:
# Functions for model features
def get_len(s):
    return len(s)
def get_count_numbers(s):
    return sum(c.isdigit() for c in s)
def get_not_alphanumeric(s):
    if s.isalnum() == True:
        return 1
    return 0
def get_count_parts(s):
    return len(s.split('.'))
def get_count_queries(s):
    url_parsed_query = urlparse(s).query
    if url_parsed_query == '':
        return 0
    print(url_parsed_query.split('&'))
    return len(url_parsed_query.split('&'))
def get_count_fragments(s):
    url_parsed_fragment = urlparse(s).fragment
    if url_parsed_fragment == '':
        return 0
    return 1
def get_count_slash(s):
    return s.count('/')
def get_double_slash(s):
    return s.count('//')
def get_count_upper(s):
    return sum(1 for c in s if c.isupper())
def get_brand_in_subdomain(s):
    for brand in ['citibank','whatsapp','netflix','dropbox','wetransfer','rakuten','itau','outlook','ebay','facebook','hsbc','linkedin','instagram','google','paypal','dhl','alibaba','bankofamerica','apple','microsoft','skype','amazon','yahoo','wellsfargo','americanexpress']:
        if brand in s:
            return 1
    return 0
def get_brand_in_path(s):
    for brand in ['citibank','whatsapp','netflix','dropbox','wetransfer','rakuten','itau','outlook','ebay','facebook','hsbc','linkedin','instagram','google','paypal','dhl','alibaba','bankofamerica','apple','microsoft','skype','amazon','yahoo','wellsfargo','americanexpress']:
        if brand in s:
            return 1
    return 0
def get_domain_alexa(s):
    if s in alexa_rank_1k_domain_unique:
        return 2
    elif s in alexa_rank_100k_domain_unique:
        return 1
    return 0
def get_max_len_path(path_clean):
    if path_clean == '':
        return 0
    path_split = [len(f) for f in path_clean.split()]
    return np.max(path_split,0)

In [9]:
# Calculating the features
def create_features(df):
    df['domain_in_alexa'] = df['Domain'].swifter.apply(get_domain_alexa)
    df['domain_len'] = df['Domain'].swifter.apply(get_len)
    df['domain_numbers'] = df['Domain'].swifter.apply(get_count_numbers)
    df['domain_isalnum'] = df['Domain'].swifter.apply(get_not_alphanumeric)
    df['subdomain_len'] = df['Subdomain'].swifter.apply(get_len)
    df['subdomain_numbers_count'] = df['Subdomain'].swifter.apply(get_count_numbers)
    df['subdomain_parts_count'] = df['Subdomain'].swifter.apply(get_count_parts)
    df['tld_len'] = df['Tld'].swifter.apply(get_len)
    df['tld_parts_count'] = df['Tld'].swifter.apply(get_count_parts)
    df['url_len'] = df['URL'].swifter.apply(get_len)
    df['queries_amount'] = df['URL'].swifter.apply(get_count_queries)
    df['fragments_amount'] = df['URL'].swifter.apply(get_count_fragments)
    df['path_len'] = df['Path'].swifter.apply(get_len)
    df['path_slash_counts'] = df['Path'].swifter.apply(get_count_slash)
    df['path_double_slash_counts'] = df['Path'].swifter.apply(get_double_slash)
    df['upper_amount'] = df['URL'].swifter.apply(get_count_upper)
    df['brand_in_subdomain'] = df['Subdomain'].swifter.apply(get_brand_in_subdomain)
    df['brand_in_path'] = df['Path'].swifter.apply(get_brand_in_path)  
    df['Path_clean'] = df['Path'].swifter.apply(lambda x: clean(x))
    df['path_max_len'] = df['Path_clean'].swifter.apply(get_max_len_path)
    return df
# Processing the url - domain, subdomain, tld, path and get URL's features
def processing(df):
    # strip url
    df['URL'] = df['URL'].apply(strip_se)
    # add http
    df['URL'] = df['URL'].apply(add_http)
    #df['url'].apply(get_url_parsed)
    # get domain
    df['Domain'] = df['URL'].apply(get_domain)
    # get sub domain
    df['Subdomain'] = df['URL'].apply(get_subdomain)
    # get tld
    df['Tld'] = df['URL'].apply(get_tld)
    # get path
    df['Path'] = df['URL'].apply(get_path)
    # Create features
    df = create_features(df)
    return df

In [10]:
# Alexa rank dict
alexa_rank = pd.read_csv('/raid0/haim/haim/data/alexa-top-500k.csv',header=None)
alexa_rank.columns = ['index','url']
alexa_rank_domain = alexa_rank['url'].apply(get_domain)
alexa_rank_1k = alexa_rank_domain.iloc[0:1000]
alexa_rank_100k = alexa_rank_domain.iloc[1000:100000]
alexa_rank_1k_domain_unique = pd.unique(alexa_rank_1k)
alexa_rank_100k_domain_unique = pd.unique(alexa_rank_100k)


In [11]:
# Read the URL plugins
path = "/raid0/haim/haim/URL_Snapshots/"
snapshots = os.listdir(path)
snapshots = [int(x.split('-')[1]) for x in snapshots if 'snap' in x]
snapshots.sort()
snapshots = ['snapshot-'+str(x) for x in snapshots]
for snap_file in snapshots:
    print(path+snap_file+"/")
    snap_file = path+snap_file+"/"
    files = []
    for r, d, f in os.walk(snap_file):
        for file in f:
            if '.json' in file:
                files.append(os.path.join(r, file))
    #try:
        data, timestamp = get_plugin_files(files)

/raid0/haim/haim/URL_Snapshots/snapshot-1/
error: /raid0/haim/haim/URL_Snapshots/snapshot-1/.ipynb_checkpoints/urls_2022-01-30_10-25-58.438453-checkpoint.json


In [12]:
data

Unnamed: 0,PID,Process,Heap,Virtual-address,URL,snapshot
0,9900,notepad.exe,0,0x15ceb795bd0,https://nwbqonnspiiwjrpo.cn,1
1,9900,notepad.exe,0,0x15ceb7d2fe8,https://ipajpheumymwwhlg.onion,1
2,9900,notepad.exe,0,0x15ceb7d3006,https://uifpby48sxnu97yn.ru,1
3,9900,notepad.exe,0,0x15ceb7d3024,https://m57mz3ocfrrx55ky.bazar,1
4,9900,notepad.exe,0,0x15ceb7d3042,https://bb7now8hgnwj1o42aoe1e6qzpc.com,1
5,9900,notepad.exe,0,0x15ceb7d308a,https://3vjnac1xj87dl1vauzom1bfuqzn.net,1
6,9900,notepad.exe,0,0x15ceb7d30c2,https://google.com,1
7,9900,notepad.exe,0,0x15ceb7d30fa,https://faceebook-com.bugs3.com/login/Secured_...,1
8,9900,notepad.exe,0,0x15ceb7d3132,https://nvidia.com,1
9,9900,notepad.exe,0,0x15ceb883bea,https://microsoft.com,1


In [13]:
url_df = processing(data)

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [14]:
url_df

Unnamed: 0,PID,Process,Heap,Virtual-address,URL,snapshot,Domain,Subdomain,Tld,Path,...,queries_amount,fragments_amount,path_len,path_slash_counts,path_double_slash_counts,upper_amount,brand_in_subdomain,brand_in_path,Path_clean,path_max_len
0,9900,notepad.exe,0,0x15ceb795bd0,https://nwbqonnspiiwjrpo.cn,1,nwbqonnspiiwjrpo,,cn,,...,0,0,0,0,0,0,0,0,,0
1,9900,notepad.exe,0,0x15ceb7d2fe8,https://ipajpheumymwwhlg.onion,1,ipajpheumymwwhlg,,onion,,...,0,0,0,0,0,0,0,0,,0
2,9900,notepad.exe,0,0x15ceb7d3006,https://uifpby48sxnu97yn.ru,1,uifpby48sxnu97yn,,ru,,...,0,0,0,0,0,0,0,0,,0
3,9900,notepad.exe,0,0x15ceb7d3024,https://m57mz3ocfrrx55ky.bazar,1,m57mz3ocfrrx55ky,,bazar,,...,0,0,0,0,0,0,0,0,,0
4,9900,notepad.exe,0,0x15ceb7d3042,https://bb7now8hgnwj1o42aoe1e6qzpc.com,1,bb7now8hgnwj1o42aoe1e6qzpc,,com,,...,0,0,0,0,0,0,0,0,,0
5,9900,notepad.exe,0,0x15ceb7d308a,https://3vjnac1xj87dl1vauzom1bfuqzn.net,1,3vjnac1xj87dl1vauzom1bfuqzn,,net,,...,0,0,0,0,0,0,0,0,,0
6,9900,notepad.exe,0,0x15ceb7d30c2,https://google.com,1,google,,com,,...,0,0,0,0,0,0,0,0,,0
7,9900,notepad.exe,0,0x15ceb7d30fa,https://faceebook-com.bugs3.com/login/Secured_...,1,bugs3,faceebook-com,com,/login/Secured_Re-login/index1.html,...,0,0,35,3,0,2,0,0,login secured re login index1 html,7
8,9900,notepad.exe,0,0x15ceb7d3132,https://nvidia.com,1,nvidia,,com,,...,0,0,0,0,0,0,0,0,,0
9,9900,notepad.exe,0,0x15ceb883bea,https://microsoft.com,1,microsoft,,com,,...,0,0,0,0,0,0,0,0,,0


In [15]:
url_df['URL_clean'] = url_df['URL'].copy().apply(remove_prefix)

In [16]:
url_df['URL_clean'] = url_df['URL_clean'].apply(lambda x: clean_nlp(x))

In [17]:
stractural_features = ['domain_in_alexa','domain_len','domain_numbers','domain_isalnum','subdomain_len','subdomain_numbers_count',
            'subdomain_parts_count','tld_len','tld_parts_count','queries_amount','fragments_amount',
            'path_len','path_slash_counts','path_double_slash_counts','brand_in_subdomain','brand_in_path','path_max_len'] 

In [18]:
df_max_min = pd.read_csv('max_min_urls.csv')
df_max_min

Unnamed: 0,domain_in_alexa,domain_len,domain_numbers,domain_isalnum,subdomain_len,subdomain_numbers_count,subdomain_parts_count,tld_len,tld_parts_count,queries_amount,fragments_amount,path_len,path_slash_counts,path_double_slash_counts,brand_in_subdomain,brand_in_path,path_max_len
0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
1,2,133,47,1,237,134,33,24,3,38,1,748,40,3,1,1,507


In [19]:
url_stractural_features = url_df[stractural_features]

In [20]:
for feature in stractural_features:
    max_feature = df_max_min[feature][1]
    min_feature = df_max_min[feature][0]
    url_stractural_features[feature] = (url_stractural_features[feature] - min_feature) / (max_feature - min_feature)    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  url_stractural_features[feature] = (url_stractural_features[feature] - min_feature) / (max_feature - min_feature)


In [21]:
# Read tokenizer
tokenizer = Tokenizer()
tokenizer.word_index = pd.read_csv('tokenizer_urls.csv').set_index('keys')['values'].to_dict()

In [22]:
url_df_clean = url_df['URL_clean']

In [23]:
url_clean_tokens = tokenizer.texts_to_sequences(url_df_clean)
url_clean_tokens = pad_sequences(url_clean_tokens, maxlen=MAX_LEN, padding='post')

In [24]:
df_output = pd.concat([url_stractural_features, pd.DataFrame(columns = ['word_'+str(i) for i in range(MAX_LEN)] , data = url_clean_tokens)], axis=1)

In [25]:
df_output

Unnamed: 0,domain_in_alexa,domain_len,domain_numbers,domain_isalnum,subdomain_len,subdomain_numbers_count,subdomain_parts_count,tld_len,tld_parts_count,queries_amount,...,word_490,word_491,word_492,word_493,word_494,word_495,word_496,word_497,word_498,word_499
0,0.0,0.113636,0.0,1.0,0.0,0.0,0.0,0.083333,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.113636,0.0,1.0,0.0,0.0,0.0,0.208333,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.113636,0.085106,1.0,0.0,0.0,0.0,0.083333,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.113636,0.106383,1.0,0.0,0.0,0.0,0.208333,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.189394,0.148936,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,0.0,0.19697,0.12766,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,1.0,0.037879,0.0,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,0.0,0.030303,0.021277,1.0,0.054852,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,0.5,0.037879,0.0,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,1.0,0.060606,0.0,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
model = tf.keras.models.load_model('/raid0/haim/haim/url_model_keras_final')

In [27]:
url_stractural_features = np.array(url_stractural_features)

In [28]:
Y_pred = model.predict([url_clean_tokens, url_stractural_features])

In [29]:
Y_pred

array([[9.1328323e-03],
       [4.0894747e-04],
       [1.2049407e-02],
       [6.0154498e-03],
       [5.1167607e-03],
       [1.8387139e-03],
       [8.9423601e-08],
       [9.9589813e-01],
       [7.9622951e-06],
       [1.6411995e-07]], dtype=float32)

In [30]:
df_output['pred'] = Y_pred

In [31]:
df_output

Unnamed: 0,domain_in_alexa,domain_len,domain_numbers,domain_isalnum,subdomain_len,subdomain_numbers_count,subdomain_parts_count,tld_len,tld_parts_count,queries_amount,...,word_491,word_492,word_493,word_494,word_495,word_496,word_497,word_498,word_499,pred
0,0.0,0.113636,0.0,1.0,0.0,0.0,0.0,0.083333,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.009132832
1,0.0,0.113636,0.0,1.0,0.0,0.0,0.0,0.208333,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.0004089475
2,0.0,0.113636,0.085106,1.0,0.0,0.0,0.0,0.083333,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.01204941
3,0.0,0.113636,0.106383,1.0,0.0,0.0,0.0,0.208333,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.00601545
4,0.0,0.189394,0.148936,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.005116761
5,0.0,0.19697,0.12766,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.001838714
6,1.0,0.037879,0.0,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,8.94236e-08
7,0.0,0.030303,0.021277,1.0,0.054852,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.9958981
8,0.5,0.037879,0.0,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,7.962295e-06
9,1.0,0.060606,0.0,1.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1.6412e-07


In [None]:
# Create onnx model

In [32]:
! pip install onnxruntime



In [33]:
! pip install git+https://github.com/onnx/tensorflow-onnx

Collecting git+https://github.com/onnx/tensorflow-onnx
  Cloning https://github.com/onnx/tensorflow-onnx to /tmp/pip-req-build-m71moxzu
Building wheels for collected packages: tf2onnx
  Building wheel for tf2onnx (setup.py) ... [?25ldone
[?25h  Created wheel for tf2onnx: filename=tf2onnx-1.10.0-py3-none-any.whl size=439728 sha256=99ee57f0cc0e6e8680ffac1e19b44a426cd12c3c3141549ce1e8ec7174badb39
  Stored in directory: /tmp/pip-ephem-wheel-cache-d_0g63uh/wheels/8d/c7/fe/7dc73855334ce8bd94435f3c357aeca17c69252864204cb133
Successfully built tf2onnx


In [34]:
!python -m tf2onnx.convert --saved-model /raid0/haim/haim/url_model_keras_final --output url_model_tensorflow.onnx

2022-03-21 10:17:14.931740: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-21 10:17:14.931791: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-03-21 10:17:16.330698: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-21 10:17:16.330740: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-21 10:17:16.330772: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist
2022-03-21 10:17:16.331047: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized wi

In [None]:
# Inference onnx model

In [35]:
import onnx
import onnxruntime

In [36]:
ONNX_URL_FILE_PATH = "/raid0/haim/haim/url_model_tensorflow.onnx"

In [37]:
session = onnxruntime.InferenceSession(ONNX_URL_FILE_PATH, None)
input_name_0 = session.get_inputs()[0].name
input_name_1 = session.get_inputs()[1].name
output_name = session.get_outputs()[0].name

input_1
input_2
dense_1


In [38]:
result = session.run([output_name], {input_name_0: url_clean_tokens.astype(np.float32), input_name_1: url_stractural_features.astype(np.float32)})
result

[array([[9.1327727e-03],
        [4.0906668e-04],
        [1.2049228e-02],
        [6.0154796e-03],
        [5.1168203e-03],
        [1.8385947e-03],
        [8.9406967e-08],
        [9.9589813e-01],
        [7.9572201e-06],
        [1.4901161e-07]], dtype=float32)]

In [39]:
#from zipfile import ZipFile
#import os
#file = "url_model_keras.zip"  # zip file name
#directory = "raid0/haim/haim/url_model_keras"
#with ZipFile(file, 'w') as zip:
#    for path, directories, files in os.walk(directory):
#        for file in files:
#            file_name = os.path.join(path, file)
#            zip.write(file_name) # zipping the file
#print("Contents of the zip file:")
#with ZipFile(file, 'r') as zip:
#    zip.printdir()