In [1]:
import numpy as np
import pandas as pd
import glob

In [2]:
file_ext = {"C": ["gcc", "c", "h"],
            "C#": ["csharp"],
            "Clojure": ["clj", "cljs", "cljs", "edn", "clojure"],
            "Common Lisp": ["sbcl"],
            "Haskell": ["hs", "lhs", "ghc"],
            "Java": ["java", "class", "jar"],
            "Javascript": ["js", "javascript"],
            "OCaml": ["ocaml", "ml"],
            "Perl": ["pl", "pm", "t", "pod", "perl"],
            "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"],
            "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"],
            "Ruby": ["rb", "rbw", "jruby", "yarv"],
            "Scala": ["scala"],
            "Scheme": ["scm", "ss", "racket"],
            "Tcl": ["tcl"]}

In [3]:
def read_bench_files():
    files = glob.glob("benchmarksgame/benchmarksgame/bench/*/*.*")
    texts = []
    for file in files:
        ext = get_ext(file.split(".")[-1])
        with open(file) as fh:
            if ext != None:
                texts.append((fh.read(), ext))
    return texts

In [4]:
def get_ext(ext):
    for key, value in file_ext.items():
        if ext in value:
            return key

In [5]:
data = read_bench_files()
data = pd.DataFrame(data, columns = ["Code", "Language"])
data.head()

Unnamed: 0,Code,Language
0,"/*\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...",C
1,"/*\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...",C
2,"/*\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...",C
3,;; The Computer Language Benchmarks Game\n;; h...,Clojure
4,;; The Computer Language Benchmarks Game\n;; h...,Clojure


In [6]:
data.Language.value_counts()

Ruby           73
C              61
PHP            55
Java           51
Scala          43
C#             41
Clojure        38
Python         36
Common Lisp    34
OCaml          34
Perl           34
Haskell        33
Scheme         29
Javascript     25
dtype: int64

In [7]:
y = data.loc[:,"Language"]
y.head()

0          C
1          C
2          C
3    Clojure
4    Clojure
Name: Language, dtype: object

In [8]:
X = data.loc[:,"Code"]
X.head()

0    /*\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...
1    /*\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...
2    /*\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...
3    ;; The Computer Language Benchmarks Game\n;; h...
4    ;; The Computer Language Benchmarks Game\n;; h...
Name: Code, dtype: object

In [9]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
from sklearn.base import TransformerMixin
class DumbFeaturizer(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return [[1] for _ in X]

In [12]:
from sklearn.tree import DecisionTreeClassifier
pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.11914893617021277

In [13]:
import re

In [14]:
def char_count(text, char):
    return text.count(char)

In [15]:
def string_count(string, code):
    value = len(re.findall(string, code))
    return value

In [16]:
keywords = ["public", "private", "static", "if", "else", "elif", "def", "void", "int", 
            "float", "for", "while", "import", "define", "function", "return", "format", 
            "and", "var", "loop", "array", "local"]
symbols = [":", ";", "{", "}", "(", ")", "#", "[", "]", ","]

In [17]:
len(keywords), len(symbols)

(22, 10)

In [18]:
def add_features():
    df = data
    my_dict = {}
    for index, row in df.iterrows():
        for keyword in keywords:
            value = string_count(keyword, row["Code"])
            if keyword not in my_dict:
                my_dict[keyword] = [value]
            else:
                my_dict[keyword].append(value)
        for symbol in symbols:
            count = char_count(symbol, row["Code"])
            if symbol not in my_dict:
                my_dict[symbol] = [count]
            else:
                my_dict[symbol].append(count)
        
    return my_dict

In [19]:
features = add_features()

In [20]:
fdf = pd.DataFrame.from_dict(features)

In [21]:
data = fdf.join(data)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 587 entries, 0 to 586
Data columns (total 34 columns):
#           587 non-null int64
(           587 non-null int64
)           587 non-null int64
,           587 non-null int64
:           587 non-null int64
;           587 non-null int64
[           587 non-null int64
]           587 non-null int64
and         587 non-null int64
array       587 non-null int64
def         587 non-null int64
define      587 non-null int64
elif        587 non-null int64
else        587 non-null int64
float       587 non-null int64
for         587 non-null int64
format      587 non-null int64
function    587 non-null int64
if          587 non-null int64
import      587 non-null int64
int         587 non-null int64
local       587 non-null int64
loop        587 non-null int64
private     587 non-null int64
public      587 non-null int64
return      587 non-null int64
static      587 non-null int64
var         587 non-null int64
void        587 non-null in

In [46]:
y = data.loc[:,("Language")]

In [47]:
y.head()

0          C
1          C
2          C
3    Clojure
4    Clojure
Name: Language, dtype: object

In [50]:
y.indo()

AttributeError: 'Series' object has no attribute 'indo'

In [51]:
X = data.loc[:, ("Code", "public", "private", "static", "if", "else", "elif", "def", "void", "int", 
            "float", "for", "while", "import", "define", "function", "return", "format", 
            "and", "var", "loop", "array", "local", ":", ";", "{", "}", "(", ")", "#", "[", "]", ",")]

In [52]:
X.head()

Unnamed: 0,Code,public,private,static,if,else,elif,def,void,int,...,:,;,{,},(,),#,[,],","
0,"/*\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...",0,0,1,25,2,0,8,1,21,...,0,0,0,0,0,0,0,0,0,0
1,"/*\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...",0,0,1,29,3,0,12,1,22,...,0,0,0,0,0,0,0,0,0,0
2,"/*\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...",0,0,3,26,2,0,13,1,25,...,0,0,0,0,0,0,0,0,0,0
3,;; The Computer Language Benchmarks Game\n;; h...,0,0,0,8,0,0,11,0,4,...,0,0,0,0,0,0,0,0,0,0
4,;; The Computer Language Benchmarks Game\n;; h...,0,0,0,11,0,0,9,0,4,...,0,0,0,0,0,0,0,0,0,0


In [53]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 587 entries, 0 to 586
Data columns (total 33 columns):
Code        587 non-null object
public      587 non-null int64
private     587 non-null int64
static      587 non-null int64
if          587 non-null int64
else        587 non-null int64
elif        587 non-null int64
def         587 non-null int64
void        587 non-null int64
int         587 non-null int64
float       587 non-null int64
for         587 non-null int64
while       587 non-null int64
import      587 non-null int64
define      587 non-null int64
function    587 non-null int64
return      587 non-null int64
format      587 non-null int64
and         587 non-null int64
var         587 non-null int64
loop        587 non-null int64
array       587 non-null int64
local       587 non-null int64
:           587 non-null int64
;           587 non-null int64
{           587 non-null int64
}           587 non-null int64
(           587 non-null int64
)           587 non-null i

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

ValueError: Number of labels=352 does not match number of samples=33