In [280]:
import pandas as pd
import numpy as np
import glob
import re
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_union, FeatureUnion
from sklearn.preprocessing import PolynomialFeatures, normalize

## Read the files in using the glob module

In [281]:
def read_prog_files(file_ext):
    files = glob.glob('/Users/mjoneill/Documents/week5/polyglot/benchmarksgame-2014-08-31/benchmarksgame/bench/**/*.{}'.format(file_ext), recursive=True)
    texts = []
    for file in files:
        with open(file,encoding='latin-1') as f:
            texts.append(f.read())
    return texts


In [282]:
file_extensions = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'java', 'javascript', 'ocaml', 'perl', 'hack', 'php', 'python3', 'jruby', 'yarv', 'scala', 'racket']
ext_dict = {'jruby': 'ruby', 'csharp': 'c#', 'hack': 'php', 'sbcl': 'common lisp', 'ocaml': 'ocaml', 'python3': 'python', 'php': 'php', 'perl': 'perl', 'racket': 'scheme', 'c': 'c', 'javascript': 'javascript', 'gcc': 'c', 'yarv': 'ruby', 'java': 'java', 'clojure': 'clojure', 'scala': 'scala'}
X = []
y = []

for ext in file_extensions:
    x_texts = read_prog_files(ext)
    X += x_texts
    y += (len(x_texts) * [ext_dict[ext]])
# X = pd.DataFrame(X)   
# print(len(X), len(y))
# len(set(y))
# len(X)

## Split into training and testing data sets

In [283]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, train_size=0.6)
len(X_train)

331

### Instantiate the count vectorizer object

In [284]:
cv = CountVectorizer()

### Fit the training data to the count vectorizer object

In [285]:
cv.fit(X_train)
# cv.transform(X_train)
print(len(cv.get_feature_names()))


6031


### Create Pipeline object to string together data featurization and estimation

In [286]:
pipe = Pipeline([('vectorizer', CountVectorizer()), ('classifier', MultinomialNB())])

## Fit

In [287]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [288]:
print('Training Score')
pipe.score(X_train, y_train)


Training Score


0.97885196374622352

In [289]:
print('Test Score')
pipe.score(X_test, y_test)

Test Score


0.89639639639639634

# Part 2

In this part I use the regular expressions module to furthur specify which features
are being pulled out of the data

In [290]:
pipe2 = Pipeline([('vectorizer', CountVectorizer(token_pattern= r'[a-zA-Z\;\:\)\(\}&\$%<\?>~,\+!#@]{2,}')),
#                   ('inverse weight', TfidfTransformer()),
                  ('classifier', MultinomialNB())])

# pipe2 = Pipeline([('vectorizer', CountVectorizer()),
#                   ('inverse weight', TfidfTransformer()),
#                   ('classifier', MultinomialNB())])

In [291]:
pipe2.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [292]:
pipe2.score(X_train, y_train)

1.0

In [293]:
print(pipe2.score(X_test, y_test))

0.905405405405


### A Different Attempt

In this part, I attempt to build my own program to get features from the data files.

In [294]:
def percent_colon(document):
    test = re.findall(r':', document)
    return len(test) / len(document)

def percent_semi_colon(document):
    test = re.findall(r';', document)
    return len(test)/ len(document)

def percent_curly_braces(document):
    test = re.findall(r'\{|\}', document)
    return len(test) /len(document)

def percent_parens(document):
    test = re.findall(r'\(|\)', document)
    return len(test) /len(document)

def pervent_var(document):
    test = re.findall(r'var|Var', document)
    return len(test) / len(document)

def percent_def(document):
    test = re.findall(r'def', document)
    return len(test) / len(document)
    
def percent_new_line(document):
    test = re.findall(r'/n', document)
    return len(test) / len(document)


    
    

In [295]:
df = pd.DataFrame()

pc_list = []
psc_list = []
pcb_list = []
pp_list = []
pv_list = []
pd_list = []

for text in X:
    pc = percent_colon(text)
    psc = percent_semi_colon(text)
#     pcb = percent_curly_braces(text)
#     pp = percent_parens(text)
#     pv = pervent_var(text)
#     pd = percent_def(text)
    pc_list.append(pc)
    psc_list.append(psc)
#     pcb_list.append(pcb)
#     pp_list.append(pp)
#     pv_list.append(pv)
#     pd_list.append(pd)
data = pd.DataFrame('',(pc_list, psc_list))
new_df = df.append(data)
new_df.head()

PandasError: DataFrame constructor not properly called!

In [None]:
ys = pd.DataFrame(y)
print(ys.shape)
print(df.shape)

In [None]:
mv = MultinomialNB()
mv.fit(new_df,ys)