In [41]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from word2number import w2n
from sklearn.metrics import f1_score, multilabel_confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle

stop_words = stopwords.words('english')

In [208]:
def load_data(filename, label_name):
    with open(filename, 'r') as f:
        return np.array([[eval(data_point)['question'], eval(data_point)[label_name]] for data_point in f.read().splitlines() if label_name in eval(data_point)])
    
def is_spelled_out_number(text):
    try:
        w2n.word_to_num(text)
        return True
    except ValueError:
        return False
    
def is_function(text): # change to regex
    if '()' in text:
        return True
    else:
        return False

def is_snake_case(text): # change to regex
    if '_' in text:
        return True
    else:
        return False

def is_filename(text):
    if '.c' in text or '.py' in text:
        return True
    else:
        return False
    
def is_system_word(text):
    system_words = {'git', 'github', 'gitlab', 'ssh', 'intellij', 'server', 'sdl', '@gitlab', 'sdk', 'config'}
    
    if text in system_words:
        return True
    else:
        return False
    
def preprocess_sentence(sentence):
    tokenizer = RegexpTokenizer("[^;\s.?,!()]+\.c|[^;\s.,?!()]+\.py|[^;\s.?!(),]+\(\)|[^;\s.?,!()]+")
#     sentence = [word for word in tokenizer.tokenize(sentence.lower()) if word not in stop_words]
    sentence = tokenizer.tokenize(sentence.lower())
    for i in range(len(sentence)):
        if sentence[i].isnumeric():
            sentence[i] = "numericnumber"
        elif is_spelled_out_number(sentence[i]):
            sentence[i] = "nonnumericnumber"
# #         elif is_function(sentence[i]):
# #             sentence[i] = "function"
        elif is_filename(sentence[i]):
            sentence[i] = "filename"
# #         elif is_snake_case(sentence[i]):
# #             sentence[i] = "snakecase"
#         elif is_TA_or_instructor_name(sentence[i]):
#             sentence[i] = "name"
        elif is_system_word(sentence[i]):
            sentence[i] = "sys"
        
    
    return ' '.join(sentence)
    
def preprocess_data(x_data, vectorizer=None):
    X = None

    if vectorizer:
        X = vectorizer.transform(x_data)
    else:
        vectorizer = CountVectorizer(preprocessor=preprocess_sentence, ngram_range=(1,2))
        X = vectorizer.fit_transform(x_data)
        
    return X, vectorizer

In [212]:
question_type_data = load_data('../data/questions.json', 'question_type')
train_data, test_data = train_test_split(question_type_data)
X, vectorizer = preprocess_data(train_data[:,0])
y = train_data[:,1]
clf = RandomForestClassifier()
clf.fit(X, y)

X_test, _ = preprocess_data(test_data[:,0], vectorizer)
y_test = test_data[:,1]
y_preds = clf.predict(X_test)
print(clf.classes_)
print(multilabel_confusion_matrix(y_test, y_preds))
# print("Cross Val Score: {}".format(cross_val_score(clf, X_test, y_test).mean()))
print("Training Error: {}".format(sum(clf.predict(X) == y) / len(y)))
print("Test Error: {}".format(sum(y_preds == y_test) / len(y_preds)))

['c' 'd' 'sys']
[[[37  4]
  [ 0 15]]

 [[20  2]
  [ 4 30]]

 [[49  0]
  [ 2  5]]]
Training Error: 1.0
Test Error: 0.8928571428571429


In [213]:
for i in range(len(y_preds)):
    if y_preds[i] != y_test[i]:
        print("Question: {}, Prediction: {}, Actual: {}".format(test_data[:,0][i], clf.predict(X_test[i])[0], y_test[i]))

Question: problems running cache tests with makefile, Prediction: d, Actual: sys
Question: I'm getting a heap buffer overflow error when I try to run my lab1 code, which I don't know how to read/deal with , Prediction: c, Actual: d
Question: "bad register name %rflags", Prediction: c, Actual: d
Question: oh no :( we are failing something weird, Prediction: c, Actual: d
Question: we are getting a null pointer exception , Prediction: c, Actual: d
Question: Tests are passing in IntelliJ but Github says "commit failed." Output also says I won the game even if I haven't., Prediction: d, Actual: sys


In [187]:
pickle.dump((clf, vectorizer), open('../models/question_type_clf.pkl', 'wb'))

In [188]:
def predict_question_type(question):
    clf, vectorizer = pickle.load(open('../models/question_type_clf.pkl', 'rb'))
    question, _ = preprocess_data([question], vectorizer)
    return clf.predict(question)[0]

In [189]:
predict_question_type('I have to copy the definition of struct vector into my testing file, in addition to vector.c, to avoid "incomplete definition" errors. In the lab, we put it in the header so we didn\'t have to repeat code. Is that a better way to do this? Or should I put it in both places?')

'c'