In [1]:
import glob
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Programming Language Classifier

In [2]:
extensions = ['c', 'gcc', 'csharp', 'sbcl', 'clojure', 'ghc', 'java', 'javascript', 'ocaml', 'perl', 'php',
              'python3', 'jruby', 'yarv', 'scala', 'racket']
extension_dict = {'c': 'C', 'gcc': 'C', 'clojure':'Clojure', 'csharp': 'Csharp', 'sbcl': 'Common Lisp', 
                  'ghc': 'Haskell', 'java': 'Java', 'javascript': 'JavaScript', 'ocaml': 'OCaml', 'perl': 'Perl', 
                  'php': 'PHP', 'python3': 'Python', 'jruby': 'Ruby', 'yarv': 'Ruby', 'scala': 'Scala', 
                  'racket': 'Scheme'}

## Import different language file types by extension

In [3]:
def read_languages(directory, extension):
    files = glob.glob('{}/**/*.{}'.format(directory, extension))
    texts = []
    for file in files:
        with open(file, encoding ='latin-1') as f:
            texts.append(f.read())
    return texts

 *create x and y values*

In [4]:
X = []
y = []

for extension in extensions:
    x_files = read_languages('benchmarksgame-2014-08-31/benchmarksgame/bench', extension)
    X += x_files
    y += len(x_files) * [extension_dict[extension]]

In [5]:
len(X), len(y)

(559, 559)

## Split data into test and training

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, test_size=0.4)

## Make pipline

In [41]:
map_pipeline = [('vect', CountVectorizer(
                  token_pattern=r'[a-zA-z0-9]{3,}|\s|[^\w\d\s]')), 
                ('bay', MultinomialNB())]

In [42]:
pipeline = Pipeline(map_pipeline)

In [46]:
pipeline.fit(X_train, y_train)
pipeline.score(X_train, y_train)

0.9761194029850746

## Import and run test files

In [47]:
def test_file():
    files = glob.glob('polyglot/test/*')
    tests = []
    for file in files:
        with open(file) as f:
            tests.append(f.read())
    return tests 
language_tests = test_file()
y_test = pd.read_csv('polyglot/test.csv', names=['index', 'language'])
pipeline.predict(language_tests)

array(['Clojure', 'JavaScript', 'Scala', 'C', 'Ruby', 'Ruby', 'Ruby',
       'Java', 'Haskell', 'Haskell', 'Scheme', 'Clojure', 'Scheme',
       'Scheme', 'Java', 'C', 'Scala', 'Scala', 'Perl', 'PHP', 'C', 'PHP',
       'Clojure', 'PHP', 'OCaml', 'OCaml', 'Java', 'Python', 'Java',
       'Ruby', 'Ruby', 'Java'], 
      dtype='<U11')

In [40]:
pd.DataFrame(test, y_test['language'])

Unnamed: 0_level_0,0
language,Unnamed: 1_level_1
clojure,Clojure
clojure,JavaScript
clojure,Scala
clojure,C
python,Ruby
python,Scala
python,Ruby
python,Haskell
javascript,Haskell
javascript,Haskell
