In [603]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import RegexpTokenizer
from word2number import w2n
from sklearn.metrics import f1_score, multilabel_confusion_matrix

In [446]:
tokenizer = RegexpTokenizer("[A-Za-z0-9_'*@%]+()*")
tokenizer.tokenize("Trouble can't in CaesarCipherSolver howManyWordsIn(); debugging for multiple lists?")

['', '', '', '', '', '', '', '', '']

In [610]:
def load_data(filename, label_name):
    with open(filename, 'r') as f:
        return [[eval(data_point)['question'], eval(data_point)[label_name]] for data_point in f.read().splitlines() if label_name in eval(data_point)]

def is_spelled_out_number(text):
    try:
        w2n.word_to_num(text)
        return True
    except ValueError:
        return False
    
def is_function(text):
    if '()' in text:
        return True
    else:
        return False

def is_snake_case(text):
    if '_' in text:
        return True
    else:
        return False

def is_filename(text):
    if '.c' in text or '.py' in text:
        return True
    else:
        return False
    
def preprocess_sentence(sentence):
    
    tokenizer = RegexpTokenizer("[^;\s.?,!()]+\.c|[^;\s.,?!()]+\.py|[^;\s.?!(),]+\(\)|[^;\s.?,!()]+")
    sentence = tokenizer.tokenize(sentence)
    
    for i in range(len(sentence)):
        if sentence[i].isnumeric():
            sentence[i] = "numericnumber"
        elif is_spelled_out_number(sentence[i]):
            sentence[i] = "nonnumericnumber"
#         elif is_function(sentence[i]):
#             sentence[i] = "function"
#         elif is_filename(sentence[i]):
#             sentence[i] = "filename"
#         elif is_snake_case(sentence[i]):
#             sentence[i] = "snakecase"
        
    
    return ' '.join(sentence)
    
def preprocess_data(x_data, vectorizer=None):
    sentence_lens = [[len(sentence.split())] for sentence in x_data]
    processed_x_data = [preprocess_sentence(sentence) for sentence in x_data]
    X = None

    if vectorizer:
        X = vectorizer.transform(processed_x_data)
    else:
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(processed_x_data)
        
    X = np.hstack((X.toarray(), sentence_lens))
    
    return X, vectorizer

def prepare_y_binary(y_data, true):
    return [1 if label == true else 0 for label in y_data]


In [576]:
tokenizer = RegexpTokenizer("[^;\s.?,!()]+\.c|[^;\s.,?!()]+\.py|[^;\s.?!(),]+\(\)|[^;\s.?,!()]+")

preprocess_sentence("I think all the tests pass except for algo_csum.py, which loops infinitely. I'm honestly not sure where to begin, because")

"I think all the tests pass except for algo_csum.py which loops infinitely I'm honestly not sure where to begin because"

In [667]:
# data = np.array(load_data('../data/questions.json', 'question_type'))
data = np.array(load_data('../data/questions.json', 'actual_question'))
train_data, test_data = train_test_split(data)

In [668]:
X, vectorizer = preprocess_data(train_data[:,0])
clf = RandomForestClassifier()
y = train_data[:,1]
# y = prepare_y_binary(train_data[:,1], 't')
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [669]:
X_test, _ = preprocess_data(test_data[:,0], vectorizer)
y_preds = clf.predict(X_test)
y_test = test_data[:,1]
# y_test = prepare_y_binary(test_data[:,1], 't')
cross_val_score(clf, X_test, y_test).mean(), sum(clf.predict(X) == y) / len(y), sum(y_preds == y_test) / len(y_preds), multilabel_confusion_matrix(y_test, y_preds)

(0.7363636363636364,
 1.0,
 0.9056603773584906,
 array([[[39,  1],
         [ 4,  9]],
 
        [[ 9,  4],
         [ 1, 39]]]))

In [580]:
y_preds[10:20]

array(['f', 't', 't', 'f', 't', 'f', 't', 't', 't', 't'], dtype='<U597')

In [119]:
from nltk.corpus import stopwords

In [121]:
stop_words = stopwords.words('english')

In [133]:
for word in "I couldnt solve the FileNotFound Error?".lower().split():
    if word not in stop_words:
        print(word)

couldnt
solve
filenotfound
error?


In [144]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each