In [30]:
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import cross_validation

## Import of Training Data
See rosetta_code_scraper.py for details on data collection from RosettaCode.org.

In [40]:
# Read the collected snippets, supplying the three column names explicitly.
raw_data = pd.read_csv('coding_data.csv', names=['Language', 'Task', 'Code'])

# Keep only rows whose Code is not the '`PARSER ERROR' marker.
data = raw_data.loc[raw_data['Code'].ne('`PARSER ERROR')]

# Lower-case the language labels, then drop every row with a missing field.
clean_data = (
    data[['Code', 'Language', 'Task']]
    .assign(Language=data['Language'].str.lower())
    .dropna()
)

# Displayed result: number of snippets per language, smallest first.
clean_data.groupby('Language').size().sort_values()

Language
scheme         279
php            363
javascript     472
clojure        480
ocaml          498
c#             509
scala          542
common lisp    560
haskell        670
java           677
perl           677
c              705
ruby           714
python         766
tcl            776
dtype: int64

## Analysis of Test Data

In [41]:
# Final classifier: token n-gram counts fed into a multinomial Naive Bayes model.
pipe = Pipeline([
    # A token is a run of ASCII letters, a run of whitespace, a run of
    # underscores, or any single character that is neither a word character
    # nor whitespace. Terms are n-grams of 1 to 5 tokens; terms present in
    # more than 40% of the training documents are discarded (max_df=.4).
    ('vect', CountVectorizer(
        token_pattern=r'[a-zA-Z]+|\s+|\_+|[^\w\d\s]',
        max_df=.4,
        ngram_range=(1, 5),
    )),
    # The smoothing strength is passed by keyword: recent scikit-learn
    # releases accept estimator parameters as keyword arguments only, so a
    # positional MultinomialNB(0.0125) raises TypeError there.
    ('clf', MultinomialNB(alpha=0.0125)),
])

In [None]:
# Train on the cleaned snippets: code text is the input, language is the label.
pipe.fit(X=clean_data['Code'], y=clean_data['Language'])

In [None]:
# Expected answers for the held-out snippets, with column names supplied explicitly.
test_ans = pd.read_csv('test.csv', names=['num', 'test_lang'])

# Read one snippet file per answer row: test/1.txt, test/2.txt, ...
# The number of files is taken from the answer table instead of being
# hard-coded, so the column assignment below always has a matching length.
# Assumes the files are numbered 1..N in the same order as the rows of
# test.csv -- TODO confirm against the 'num' column.
test_code = []
for i in range(1, len(test_ans) + 1):
    with open('test/{}.txt'.format(i)) as f:
        test_code.append(f.read())

# Attach the raw snippet text next to its expected language.
test_ans['test_code'] = test_code

In [None]:
# Preview the first five rows of the assembled test table.
test_ans.head(n=5)

In [None]:
# Predict a language for every test snippet with the fitted pipeline.
predicted_code = pipe.predict(test_ans['test_code'])

In [None]:
# Fraction of test snippets whose predicted language matches the expected one.
accuracy_score(y_true=test_ans['test_lang'], y_pred=predicted_code)

#### Excellent, the model predicted all of the test snippets correctly!

In [39]:
# Side-by-side table of the expected and the predicted language for each test snippet.
pd.DataFrame({"Actual": test_ans['test_lang']}).assign(Predicted=predicted_code)


Unnamed: 0,Actual,Predicted
0,clojure,clojure
1,clojure,clojure
2,clojure,clojure
3,clojure,clojure
4,python,python
5,python,python
6,python,python
7,python,python
8,javascript,javascript
9,javascript,javascript


## Analysis of Training Data for Optimal Parameters

In [6]:
# Baseline pipeline handed to the grid search: same tokenizer as the final
# model, with every other vectorizer / classifier setting left at its default.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(token_pattern=r'[a-zA-Z]+|\s+|\_+|[^\w\d\s]')),
    ('clf', MultinomialNB()),
])
# Uncomment to list every parameter name the grid search can tune:
# k = pipeline.get_params().items()
# for a in k:
#     print(a)

In [7]:
# Search grid, keyed as '<pipeline step>__<parameter>'.
# NOTE(review): 0.125 is an outlier next to 0.01 / 0.015, and the final model
# uses alpha=0.0125 -- possibly a typo for 0.0125; confirm before re-running.
parameters = dict(
    vect__max_df=(0.35, 0.4),
    vect__ngram_range=((1, 3), (1, 4)),
    clf__alpha=(0.125, 0.01, 0.015),
)

In [8]:
# Exhaustive search over `parameters` for the baseline pipeline, using all
# available cores (n_jobs=-1) and printing progress messages (verbose=1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

Since adding Tcl and JavaScript to the CSV, this grid search no longer seems to work. I am leaving it in for reference, to show how I arrived at the parameters used above, but further optimization no longer seems necessary, since the model correctly predicted every test.

In [29]:
# Disabled: kept only as a record of how the grid search was run.
# NOTE(review): this call passes `data`, which has not had the dropna() and
# language lower-casing that produce `clean_data` -- confirm whether that is
# why the search stopped working before re-enabling it.
# grid_search.fit(data['Code'], data['Language'])

In [None]:
# Disabled: needs a fitted `grid_search`. When enabled, prints the value the
# best estimator ended up with for each parameter name in `parameters`.
# best_parameters = grid_search.best_estimator_.get_params()
# for param_name in sorted(parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# Disabled: 10-fold cross-validation (cv=10) of the baseline `pipeline`.
# NOTE(review): like the grid search fit, this uses `data` rather than the
# cleaned `clean_data` frame -- confirm which one is intended.
# scores = cross_validation.cross_val_score(pipeline, data['Code'], data['Language'], cv=10)