In [0]:
import tensorflow as tf 
import pandas as pd
import numpy as np 

from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle

In [77]:
# Download the dataset from GitHub repo
!curl -O https://media.githubusercontent.com/media/pietromoretto/lanGuesser/master/dataset.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  109M  100  109M    0     0  40.0M      0  0:00:02  0:00:02 --:--:-- 39.9M


In [80]:
!ls

dataset.csv  keras_saved_model.h5  __pycache__	tokenizer_state.pkl
gdrive	     predictor.py	   sample_data


In [0]:
convert = {
    ".c": "C",
    ".java": "JAVA",
    ".py": "PYTHON",
    ".rb": "RUBY",
    ".html": "HTML",
    ".css": "CSS",
    ".php": "PHP",
    ".sql": "SQL",
    ".cs": "C#",
    ".js": "JAVASCRIPT",
    ".go": "GO",
}

In [0]:
# Create the dataframe

data = pd.read_csv("dataset.csv")

data = shuffle(data, random_state=22)
data = data.dropna()

data.head()

Unnamed: 0,content,language
7967,// Copyright 2014 The go-ethereum Authors\n// ...,GO
7845,"// Copyright 2013, Google Inc. All rights rese...",GO
2812,# Copyright (c) 2003-2014 CORE Security Techno...,PYTHON
11944,/*\n * Clover - 4chan browser https://github.c...,JAVA
15611,$LOAD_PATH.unshift File.expand_path('../../lib...,RUBY


In [0]:
# Print how many languages are in the dataset

for k, v in convert.items():
    cust = data["language"].apply(lambda lan: lan == v)
    print(v, len(cust[cust == True]))

C 427
JAVA 1931
PYTHON 1630
RUBY 3935
HTML 574
CSS 715
PHP 2033
SQL 1229
C# 1958
JAVASCRIPT 435
GO 1487


In [0]:
data.describe()

Unnamed: 0,content,language
count,16354,16354
unique,16354,11
top,UPDATE `spell_proc_event` SET `procFlags`=0x14...,RUBY
freq,1,3935


In [0]:
# One Hot Encoding on programming languages (labels)

encoder = LabelBinarizer()
lang_encoded = encoder.fit_transform(data["language"].values)
num_lang = len(lang_encoded[0])

print(data['language'].values[0])
print(encoder.classes_)
print(lang_encoded[0])

GO
['C' 'C#' 'CSS' 'GO' 'HTML' 'JAVA' 'JAVASCRIPT' 'PHP' 'PYTHON' 'RUBY'
 'SQL']
[0 0 0 1 0 0 0 0 0 0 0]


In [0]:
# Split data into train and test sets (80 / 20)

train_size = int(len(data) * .8)
test_size = (len(data) - train_size)

print(f"Train size: {train_size}")
print (f"Test size: {test_size}")

Train size: 13083
Test size: 3271


In [0]:
# Split labels into train and test sets

train_lang = lang_encoded[:train_size]
test_lang = lang_encoded[train_size:]

In [0]:
# Tokenize the features

from tensorflow.keras.preprocessing import text

VOCAB_SIZE=400 

train_content = data['content'].values[:train_size]
test_content = data['content'].values[train_size:]

tokenizer = text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_content)

body_train = tokenizer.texts_to_matrix(train_content)
body_test = tokenizer.texts_to_matrix(test_content)

In [0]:
# Preview the first input from our training data

print(len(body_train[0]))
print(body_train[0])

400
[0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1.
 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [0]:
# Create the model

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(50, input_shape=(VOCAB_SIZE,), activation='relu'))
model.add(tf.keras.layers.Dense(25, activation='relu'))
model.add(tf.keras.layers.Dense(num_lang, activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                20050     
_________________________________________________________________
dense_1 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_2 (Dense)              (None, 11)                286       
Total params: 21,611
Trainable params: 21,611
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train and evaluate the model

model.fit(body_train, train_lang, epochs=10, batch_size=128, validation_split=0.1)

print('Eval loss/accuracy: {}".format(model.evaluate(body_test, test_lang, batch_size=128)))

Train on 11774 samples, validate on 1309 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Eval loss/accuracy: [0.033974358766419153, 0.9947752]


In [0]:
# Export the model to a file

model.save('keras_saved_model.h5')

In [0]:
# Save the processor state of the tokenizer

import pickle

with open('./tokenizer_state.pkl', 'wb') as f:
  pickle.dump(tokenizer, f)

In [71]:
%%writefile predictor.py
import pickle
import os
import numpy as np


class Predictor:
  def __init__(self, model, processor):
    self._model = model
    self._processor = processor
  
  def predict(self, instances, **kwargs):
    preprocessed_data = self._processor.texts_to_matrix(instances)
    predictions = self._model.predict(preprocessed_data)
    return predictions.tolist()

  @classmethod
  def from_path(cls, model_dir):
    import tensorflow.keras as keras
    model = keras.models.load_model(
      os.path.join(model_dir,'keras_saved_model.h5'))
    with open(os.path.join(model_dir, 'tokenizer_state.pkl'), 'rb') as f:
      processor = pickle.load(f)

    return cls(model, processor)

Overwriting predictor.py


In [73]:
# Test model on custom data

from predictor import Predictor


test = [
    """public class OracleJdbcTest
{
	String driverClass = "oracle.jdbc.driver.OracleDriver";

	Connection con;
	
	public void init(FileInputStream fs) throws ClassNotFoundException, SQLException, FileNotFoundException, IOException
	{
		Properties props = new Properties();
		props.load(fs);
		String url = props.getProperty("db.url");
		String userName = props.getProperty("db.user");
		String password = props.getProperty("db.password");
		Class.forName(driverClass);

		con=DriverManager.getConnection(url, userName, password);
	}
	
	public void fetch() throws SQLException, IOException
	{
		PreparedStatement ps = con.prepareStatement("select SYSDATE from dual");
		ResultSet rs = ps.executeQuery();
		
		while (rs.next())
		{
			// do the thing you do
		}
		rs.close();
		ps.close();
	}

	public static void main(String[] args) 
	{
		OracleJdbcTest test = new OracleJdbcTest();
		test.init();
		test.fetch();
	}
}""",
    """java.util.Date utilDate = new java.util.Date();
java.sql.Date sqlDate = new java.sql.Date(utilDate.getTime());""",
    """import socket
import subprocess
import sys
from datetime import datetime

# Clear the screen
subprocess.call('clear', shell=True)

# Ask for input
remoteServer    = raw_input("Enter a remote host to scan: ")
remoteServerIP  = socket.gethostbyname(remoteServer)

# Print a nice banner with information on which host we are about to scan
print "-" * 60
print "Please wait, scanning remote host", remoteServerIP
print "-" * 60

# Check what time the scan started
t1 = datetime.now()

# Using the range function to specify ports (here it will scans all ports between 1 and 1024)

# We also put in some error handling for catching errors

try:
    for port in range(1,1025):  
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        result = sock.connect_ex((remoteServerIP, port))
        if result == 0:
            print "Port {}: 	 Open".format(port)
        sock.close()

except KeyboardInterrupt:
    print "You pressed Ctrl+C"
    sys.exit()

except socket.gaierror:
    print 'Hostname could not be resolved. Exiting'
    sys.exit()

except socket.error:
    print "Couldn't connect to server"
    sys.exit()

# Checking the time again
t2 = datetime.now()

# Calculates the difference of time, to see how long it took to run the script
total =  t2 - t1

# Printing the information to screen
print 'Scanning Completed in: ', total""",
    """def a():
                print(12)""",
]


predictor = Predictor.from_path(".")
results = predictor.predict(test)

for result in results:
  print("Predicted labels:")
  max_pred = -10000
  max_i = -1
  for i, pred in enumerate(result):
    if pred > max_pred:
      max_i = i
      max_pred = pred
    
    
  print(encoder.classes_[max_i], max_pred)
  print()


Predicted labels:
JAVA 0.9799880981445312

Predicted labels:
JAVA 0.9334894418716431

Predicted labels:
PYTHON 1.0

Predicted labels:
PYTHON 0.9999996423721313

