In [0]:
import tensorflow as tf 
import pandas as pd
import numpy as np 

from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle

In [3]:
# Download the dataset from GitHub repo

!curl -O https://media.githubusercontent.com/media/pietromoretto/lanGuesser/master/dataset.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  109M  100  109M    0     0  40.4M      0  0:00:02  0:00:02 --:--:-- 40.3M


In [2]:
!ls

sample_data


In [0]:
convert = {
    ".c": "C",
    ".java": "JAVA",
    ".py": "PYTHON",
    ".rb": "RUBY",
    ".html": "HTML",
    ".css": "CSS",
    ".php": "PHP",
    ".sql": "SQL",
    ".cs": "C#",
    ".js": "JAVASCRIPT",
    ".go": "GO",
}

In [5]:
# Create the dataframe

data = pd.read_csv("dataset.csv")

data = shuffle(data, random_state=22)
data = data.dropna()

data.head()

Unnamed: 0,content,language
7967,// Copyright 2014 The go-ethereum Authors\n// ...,GO
7845,"// Copyright 2013, Google Inc. All rights rese...",GO
2812,# Copyright (c) 2003-2014 CORE Security Techno...,PYTHON
11944,/*\n * Clover - 4chan browser https://github.c...,JAVA
15611,$LOAD_PATH.unshift File.expand_path('../../lib...,RUBY


In [6]:
# Print how many languages are in the dataset

for k, v in convert.items():
    cust = data["language"].apply(lambda lan: lan == v)
    print(v, len(cust[cust == True]))

C 427
JAVA 1931
PYTHON 1630
RUBY 3935
HTML 574
CSS 715
PHP 2033
SQL 1229
C# 1958
JAVASCRIPT 435
GO 1487


In [7]:
data.describe()

Unnamed: 0,content,language
count,16354,16354
unique,16354,11
top,<?php\nnamespace rz_shop_cart;\n\n\nuse \Cart\...,RUBY
freq,1,3935


In [8]:
# One Hot Encoding on programming languages (labels)

encoder = LabelBinarizer()
lang_encoded = encoder.fit_transform(data["language"].values)
num_lang = len(lang_encoded[0])

print(data['language'].values[0])
print(encoder.classes_)
print(lang_encoded[0])

GO
['C' 'C#' 'CSS' 'GO' 'HTML' 'JAVA' 'JAVASCRIPT' 'PHP' 'PYTHON' 'RUBY'
 'SQL']
[0 0 0 1 0 0 0 0 0 0 0]


In [9]:
# Split data into train and test sets (80 / 20)

train_size = int(len(data) * .8)
test_size = (len(data) - train_size)

print(f"Train size: {train_size}")
print (f"Test size: {test_size}")

Train size: 13083
Test size: 3271


In [0]:
# Split labels into train and test sets

train_lang = lang_encoded[:train_size]
test_lang = lang_encoded[train_size:]

In [0]:
# Tokenize the features

from tensorflow.keras.preprocessing import text

VOCAB_SIZE = 10000 

train_content = data['content'].values[:train_size]
test_content = data['content'].values[train_size:]

tokenizer = text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_content)

body_train = tokenizer.texts_to_matrix(train_content)
body_test = tokenizer.texts_to_matrix(test_content)

In [12]:
# Preview the first input from our training data

print(len(body_train[0]))
print(body_train[0])

10000
[0. 0. 1. ... 0. 0. 0.]


In [13]:
# Create the model

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(50, input_shape=(VOCAB_SIZE,), activation='relu'))
model.add(tf.keras.layers.Dense(25, activation='relu'))
model.add(tf.keras.layers.Dense(num_lang, activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                500050    
_________________________________________________________________
dense_1 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_2 (Dense)              (None, 11)                286       
Total params: 501,611
Trainable params: 501,611
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train and evaluate the model

model.fit(body_train, train_lang, epochs=10, batch_size=128, validation_split=0.1)

print("Eval loss/accuracy: {}".format(model.evaluate(body_test, test_lang, batch_size=128)))

Train on 11774 samples, validate on 1309 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Eval loss/accuracy: [0.012580081310410204, 0.9966093]


In [0]:
# Export the model to a file

model.save('keras_saved_model.h5')

In [0]:
# Save the processor state of the tokenizer

import pickle

with open('./tokenizer_state.pkl', 'wb') as f:
  pickle.dump(tokenizer, f)

In [18]:
%%writefile predictor.py
import pickle
import os
import numpy as np


class Predictor:
  def __init__(self, model, processor):
    self._model = model
    self._processor = processor
  
  def predict(self, instances, **kwargs):
    preprocessed_data = self._processor.texts_to_matrix(instances)
    predictions = self._model.predict(preprocessed_data)
    return predictions.tolist()

  @classmethod
  def from_path(cls, model_dir):
    import tensorflow.keras as keras
    model = keras.models.load_model(
      os.path.join(model_dir,'keras_saved_model.h5'))
    with open(os.path.join(model_dir, 'tokenizer_state.pkl'), 'rb') as f:
      processor = pickle.load(f)

    return cls(model, processor)

Writing predictor.py


In [32]:
# Test model on custom data

from predictor import Predictor


test = [
    """var rows = prompt("How many rows for your multiplication table?");
    var cols = prompt("How many columns for your multiplication table?");
    if(rows == "" || rows == null)
   		 rows = 10;
    if(cols== "" || cols== null)
   		 cols = 10;
    createTable(rows, cols);
    function createTable(rows, cols)
    {
      var j=1;
      var output = "<table border='1' width='500' cellspacing='0'cellpadding='5'>";
      for(i=1;i<=rows;i++)
      {
    	output = output + "<tr>";
        while(j<=cols)
        {
  		  output = output + "<td>" + i*j + "</td>";
   		  j = j+1;
   		}
   		 output = output + "</tr>";
   		 j = 1;
    }
    output = output + "</table>";
    document.write(output);
    }""",
    """<!DOCTYPE html>
<html>
<body>

<h2>HTML Links</h2>
<p>HTML links are defined with the a tag:</p>

<a href="https://www.w3schools.com">This is a link</a>

</body>
</html>""",
    """a:link {
  color: gray;
}

a:visited {
  color: green;
}

a:hover {
  color: rebeccapurple;
}

a:active {
  color: teal;
}""",
    """#include <stdio.h>
int main(){

    int dividend, divisor, quotient, remainder;

    printf("Enter dividend: ");
    scanf("%d", &dividend);

    printf("Enter divisor: ");
    scanf("%d", &divisor);

    // Computes quotient
    quotient = dividend / divisor;

    // Computes remainder
    remainder = dividend % divisor;

    printf("Quotient = %d\n", quotient);
    printf("Remainder = %d", remainder);

    return 0;
}""",
    """for i in (1..10)
    rno = rand(100) + 1
    msg = case rno
        when 42: "The ultimate result."
        when 1..10: "Way too small."
        when 11..15,19,27: "Sorry, too small"
        when 80..99: "Way to large"
        when 100:
                print "TOPS\n"
                "Really way too large"
        else "Just wrong"
    end
    print "Result: ", rno, ": ", msg, "\n"
end""",
    """import pickle
import os
import numpy as np


class Predictor:
    def __init__(self, model, processor):
        self._model = model
        self._processor = processor
        self.languages = [
            "C",
            "C#",
            "CSS",
            "GO",
            "HTML",
            "JAVA",
            "JAVASCRIPT",
            "PHP",
            "PYTHON",
            "RUBY",
            "SQL",
        ]

    def predict(self, instances, **kwargs):
        preprocessed_data = self._processor.texts_to_matrix(instances)
        predictions = self._model.predict(preprocessed_data)
        return predictions.tolist()

    @classmethod
    def from_path(cls, model_dir):
        import tensorflow.keras as keras

        model = keras.models.load_model(os.path.join(model_dir, "keras_saved_model.h5"))
        with open(os.path.join(model_dir, "tokenizer_state.pkl"), "rb") as f:
            processor = pickle.load(f)

        return cls(model, processor)

""",
    """// In Go, an _array_ is a numbered sequence of elements of a
// specific length.

package main

import "fmt"

func main() {

    // Here we create an array `a` that will hold exactly
    // 5 `int`s. The type of elements and length are both
    // part of the array's type. By default an array is
    // zero-valued, which for `int`s means `0`s.
    var a [5]int
    fmt.Println("emp:", a)

    // We can set a value at an index using the
    // `array[index] = value` syntax, and get a value with
    // `array[index]`.
    a[4] = 100
    fmt.Println("set:", a)
    fmt.Println("get:", a[4])""",
    """
CREATE TABLE emp (
empno INT PRIMARY KEY,
ename VARCHAR(10),
job VARCHAR(9),
mgr INT NULL,
hiredate DATETIME,
sal NUMERIC(7,2),
comm NUMERIC(7,2) NULL,
dept INT)
begin
insert into emp values
    (1,'JOHNSON','ADMIN',6,'12-17-1990',18000,NULL,4)
insert into emp values
    (2,'HARDING','MANAGER',9,'02-02-1998',52000,300,3)
insert into emp values
    (3,'TAFT','SALES I',2,'01-02-1996',25000,500,3)
insert into emp values
    (4,'HOOVER','SALES I',2,'04-02-1990',27000,NULL,3)
insert into emp values
    (5,'LINCOLN','TECH',6,'06-23-1994',22500,1400,4)
insert into emp values
    (6,'GARFIELD','MANAGER',9,'05-01-1993',54000,NULL,4)
insert into emp values
    (7,'POLK','TECH',6,'09-22-1997',25000,NULL,4)
insert into emp values
    (8,'GRANT','ENGINEER',10,'03-30-1997',32000,NULL,2)
insert into emp values
    (9,'JACKSON','CEO',NULL,'01-01-1990',75000,NULL,4)
insert into emp values
    (10,'FILLMORE','MANAGER',9,'08-09-1994',56000,NULL,2)
insert into emp values
    (11,'ADAMS','ENGINEER',10,'03-15-1996',34000,NULL,2)
insert into emp values
    (12,'WASHINGTON','ADMIN',6,'04-16-1998',18000,NULL,4)
insert into emp values
    (13,'MONROE','ENGINEER',10,'12-03-2000',30000,NULL,2)
insert into emp values
    (14,'ROOSEVELT','CPA',9,'10-12-1995',35000,NULL,1)
end
CREATE TABLE dept (
deptno INT NOT NULL,
dname VARCHAR(14),
loc VARCHAR(13))
begin
insert into dept values (1,'ACCOUNTING','ST LOUIS')
insert into dept values (2,'RESEARCH','NEW YORK')
insert into dept values (3,'SALES','ATLANTA')
insert into dept values (4, 'OPERATIONS','SEATTLE')
end
""",
    """if ( isset( $_FILES['fupload'] ) ) {

     print "name: ".     $_FILES['fupload']['name']       ."<br />";
     print "size: ".     $_FILES['fupload']['size'] ." bytes<br />";
     print "temp name: ".$_FILES['fupload']['tmp_name']   ."<br />";
     print "type: ".     $_FILES['fupload']['type']       ."<br />";
     print "error: ".    $_FILES['fupload']['error']      ."<br />";

     if ( $_FILES['fupload']['type'] == "image/gif" ) {

         $source = $_FILES['fupload']['tmp_name'];
         $target = "upload/".$_FILES['fupload']['name'];
         move_uploaded_file( $source, $target );// or die ("Couldn't copy");
         $size = getImageSize( $target );

         $imgstr = "<p><img width=\"$size[0]\" height=\"$size[1]\" ";
         $imgstr .= "src=\"$target\" alt=\"uploaded image\" /></p>";

         print $imgstr;
     }
 }""",
    """public partial class MyControl : UserControl
{
  public MyControl()
  {
    InitializeComponent();
    ResizeRedraw = true;
  }

  protected override void OnPaintBackground(PaintEventArgs e)
  {
    base.OnPaintBackground(e);
    // draw a blue ellipse into the control area
    e.Graphics.FillEllipse(new SolidBrush(Color.Blue), 2, 2,
      ClientRectangle.Width - 4, ClientRectangle.Height - 4);
  }
}""",
    """import java.util.Scanner;
public class Demo {

    public static void main(String[] args) {

    	int year;
    	Scanner scan = new Scanner(System.in);
    	System.out.println("Enter any Year:");
    	year = scan.nextInt();
    	scan.close();
        boolean isLeap = false;

        if(year % 4 == 0)
        {
            if( year % 100 == 0)
            {
                if ( year % 400 == 0)
                    isLeap = true;
                else
                    isLeap = false;
            }
            else
                isLeap = true;
        }
        else {
            isLeap = false;
        }

        if(isLeap==true)
            System.out.println(year + " is a Leap Year.");
        else
            System.out.println(year + " is not a Leap Year.");
    }
}""",
]


predictor = Predictor.from_path(".")
results = predictor.predict(test)

for result in results:
    print("Predicted labels:")
    max_pred = -10000
    max_i = -1
    for i, pred in enumerate(result):
        if pred > max_pred:
            max_i = i
            max_pred = pred

    print(encoder.classes_[max_i], max_pred)
    print()


Predicted labels:
JAVASCRIPT 0.6685922145843506

Predicted labels:
HTML 0.9625486731529236

Predicted labels:
CSS 0.9457711577415466

Predicted labels:
C 0.7276895046234131

Predicted labels:
RUBY 0.9799514412879944

Predicted labels:
PYTHON 0.9999970197677612

Predicted labels:
GO 0.9999423027038574

Predicted labels:
SQL 0.999998927116394

Predicted labels:
PHP 0.954882025718689

Predicted labels:
C# 0.6327311992645264

Predicted labels:
JAVA 0.9997807145118713

