In [1]:
import glob
import pandas as pd
import csv
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Programming Language Classifier

In [2]:
extensions = ['c', 'gcc', 'csharp', 'sbcl', 'clojure', 'ghc', 'java', 'javascript', 'ocaml', 'perl', 'php',
              'python3', 'jruby', 'yarv', 'scala', 'racket']
extension_dict = {'c': 'c', 'gcc': 'c', 'clojure':'clojure', 'csharp': 'csharp', 'sbcl': 'common lisp', 
                  'ghc': 'haskell', 'java': 'java', 'javascript': 'javaScript', 'ocaml': 'ocaml', 'perl': 'perl', 
                  'php': 'php', 'python3': 'python', 'jruby': 'ruby', 'yarv': 'ruby', 'scala': 'scala', 
                  'racket': 'scheme'}

## Import different language file types by extension

In [3]:
def read_languages(loc):
    files = glob.glob(loc, recursive=True)
    texts = []
    for file in files:
        with open(file, encoding ='latin-1') as f:
            texts.append(f.read())
    return texts

 *create x and y values*

In [4]:
X = []
y = []

for extension in extensions:
    x_files = read_languages('benchmarksgame-2014-08-31/benchmarksgame/bench/**/*.{}'.format(extension))
    X += x_files
    y += len(x_files) * [extension_dict[extension]]

In [5]:
len(X), len(y)

(560, 560)

## Split data into test and training

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, test_size=0.4)

## Make pipline

In [7]:
map_pipeline = [('vect', CountVectorizer(
                  token_pattern=r'[a-zA-z0-9]{3,}|\s|[^\w\d\s]')), 
                ('bay', MultinomialNB())]

In [8]:
pipeline = Pipeline(map_pipeline)

In [9]:
pipeline.fit(X_train, y_train)
pipeline.score(X_train, y_train)

0.97619047619047616

## Import and run test files

In [10]:
test_samples = []
test_languages = []

for item in range(1, 33):
    x = read_languages('polyglot/test/{}'.format(item))
    test_samples += x
with open('polyglot/test.csv') as languages:
    reader = csv.reader(languages)
    for row in reader:
        test_languages.append(row[1])

print(test_languages)

['clojure', 'clojure', 'clojure', 'clojure', 'python', 'python', 'python', 'python', 'javascript', 'javascript', 'javascript', 'javascript', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'scheme', 'scheme', 'scheme', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']


In [13]:
pipeline.predict(test_samples)

array(['clojure', 'clojure', 'clojure', 'clojure', 'ruby', 'java', 'ruby',
       'ruby', 'csharp', 'java', 'scala', 'scala', 'ruby', 'ruby', 'ruby',
       'java', 'haskell', 'haskell', 'scheme', 'scheme', 'scheme', 'java',
       'csharp', 'scala', 'scala', 'perl', 'php', 'c', 'php', 'php',
       'ocaml', 'clojure'], 
      dtype='<U11')

In [14]:
pipeline.score(test_samples, test_languages)

0.5625

####Not a very good result with the classifier being 56%.  I w