In [50]:
import glob
import os

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# train_test_split lives in sklearn.model_selection on current scikit-learn;
# sklearn.cross_validation is kept as a fallback for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [51]:
# File extensions to load, in the order their files are appended to the corpus.
extensions = [
    'c', 'gcc', 'csharp', 'sbcl',
    'clojure', 'ghc', 'java', 'javascript',
    'ocaml', 'perl', 'php', 'python3',
    'jruby', 'yarv', 'scala', 'racket',
]

# File extension -> language label used as the classification target.
# Several extensions share one label (c/gcc -> C, jruby/yarv -> Ruby).
extension_dict = {
    'c': 'C',
    'gcc': 'C',
    'clojure': 'Clojure',
    'csharp': 'Csharp',
    'sbcl': 'Common Lisp',
    'ghc': 'Haskell',
    'java': 'Java',
    'javascript': 'JavaScript',
    'ocaml': 'OCaml',
    'perl': 'Perl',
    'php': 'PHP',
    'python3': 'Python',
    'jruby': 'Ruby',
    'yarv': 'Ruby',
    'scala': 'Scala',
    'racket': 'Scheme',
}

In [52]:
def read_languages(directory, extension):
    """Read every ``*.<extension>`` file found one level below ``directory``.

    The glob is not recursive, so ``**`` behaves like ``*``: only files that
    sit in an immediate subdirectory of ``directory`` are matched.

    Parameters
    ----------
    directory : str
        Root folder whose subdirectories are searched.
    extension : str
        File extension to match, without the leading dot.

    Returns
    -------
    list of str
        Contents of the matching files, ordered by sorted file path. Empty
        when nothing matches.
    """
    pattern = '{}/**/*.{}'.format(directory, extension)
    texts = []
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the corpus order identical on every run and every machine.
    for path in sorted(glob.glob(pattern)):
        # latin-1 maps every byte to a character, so decoding cannot fail
        # whatever encoding the source file was written in.
        with open(path, encoding='latin-1') as f:
            texts.append(f.read())
    return texts

In [53]:
# Assemble the corpus: X collects file contents, y the matching language
# label, so X[i] and y[i] always describe the same document.
X, y = [], []

for extension in extensions:
    x_files = read_languages('benchmarksgame-2014-08-31/benchmarksgame/bench', extension)
    X.extend(x_files)
    # One label per file just read, looked up from the extension.
    y.extend([extension_dict[extension]] * len(x_files))

In [67]:
# Show the label list to check that labels follow the order of `extensions`.
# NOTE(review): this prints every label; a per-class count would be far
# easier to read than the full list.
print(y)

['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Csharp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp', 'Common Lisp',

In [54]:
# Sanity check: there must be exactly one label per document.
len(X), len(y)

(559, 559)

In [55]:
# Hold out 40% of the corpus for evaluation. The split shuffles the data, so
# random_state is pinned: without it every kernel restart produces a different
# partition and the reported scores cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42)

In [56]:
# Pipeline steps as (name, estimator) pairs, applied in this order:
# token counts -> TF-IDF weighting -> multinomial naive Bayes.
map_pipeline = [
    ('vect', CountVectorizer()),
    ('tuff', TfidfTransformer()),
    ('bay', MultinomialNB()),
]

In [57]:
# Chain the steps into a single estimator that takes raw text as input.
pipeline = Pipeline(steps=map_pipeline)

In [58]:
# Fit on the training split; the trailing semicolon keeps the notebook from
# echoing the fitted pipeline's repr.
pipeline.fit(X_train, y_train);

In [59]:
# Score the fitted pipeline on the training split and on the held-out split.
train_accuracy = pipeline.score(X_train, y_train)
test_accuracy = pipeline.score(X_test, y_test)

print('train score:', train_accuracy)
print('test score:', test_accuracy)

train score: 0.880597014925
test score: 0.625


In [60]:
def test_file(directory='polyglot/test'):
    """Read every regular file directly inside ``directory``.

    Files are returned in a deterministic "natural" order: names whose stem is
    purely numeric come first, sorted by numeric value (1, 2, 10), followed by
    all other names sorted alphabetically. ``glob`` itself returns matches in
    arbitrary order, which would make any positional pairing with an external
    label list unreliable.
    NOTE(review): this assumes the label CSV lists files in that same order -
    confirm against polyglot/test.csv.

    Parameters
    ----------
    directory : str, optional
        Folder to read. Defaults to ``'polyglot/test'``, the location that was
        previously hard-coded.

    Returns
    -------
    list of str
        Contents of each file, in the order described above. Empty when the
        folder has no matching files.
    """
    def natural_key(path):
        # Numeric stems sort by value so that '10' follows '2' rather than '1'.
        name = os.path.basename(path)
        stem = os.path.splitext(name)[0]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    # Skip directories: open() would raise on them.
    paths = [p for p in glob.glob('{}/*'.format(directory)) if os.path.isfile(p)]

    tests = []
    for path in sorted(paths, key=natural_key):
        # Decode exactly like read_languages does, so prediction-time text is
        # tokenised the same way as the training text, and so non-UTF-8 bytes
        # cannot raise UnicodeDecodeError.
        with open(path, encoding='latin-1') as f:
            tests.append(f.read())
    return tests

In [61]:
# Load the raw text of every file in the polyglot test folder.
language_tests = test_file()

In [62]:
# Expected labels for the polyglot test files. Because `names=` is given,
# pandas treats the first line of the CSV as data rather than as a header.
# NOTE(review): this rebinds `y_test`, which until now held the held-out labels
# from train_test_split; re-running the scoring cell after this one would
# score against this DataFrame instead. A distinct name would be safer.
y_test = pd.read_csv('polyglot/test.csv', names=['index', 'language'])

In [63]:
# Predict a language label for each polyglot test document.
test = pipeline.predict(language_tests)

In [64]:
# Compare side by side: expected language as the index, predicted label in
# column 0. Rows are paired purely by position.
pd.DataFrame(data=test, index=y_test['language'])

Unnamed: 0_level_0,0
language,Unnamed: 1_level_1
clojure,Clojure
clojure,Ruby
clojure,Ruby
clojure,Ruby
python,Ruby
python,Ruby
python,Ruby
python,Java
javascript,Ruby
javascript,Ruby
