In [21]:
import ast
import pickle

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from word2number import w2n

In [91]:
def load_data(filename, label_name):
    """Load (question, label) pairs from a file of one dict literal per line.

    Each non-blank line is parsed as a Python literal with
    ``ast.literal_eval`` (never ``eval``), so a data file cannot execute
    code. Records that do not contain ``label_name`` are skipped.

    Args:
        filename: Path to the data file.
        label_name: Key of the label to extract from each record.

    Returns:
        A 2-D ``np.ndarray`` with one ``[question, label]`` row per kept
        record. When no record qualifies, an empty array of shape ``(0, 2)``
        is returned so column indexing such as ``data[:, 0]`` still works.

    Raises:
        ValueError / SyntaxError: if a line is not a valid Python literal.
        KeyError: if a kept record has no ``'question'`` key.
    """
    rows = []
    with open(filename, 'r') as f:
        for line in f.read().splitlines():
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Parse each line exactly once instead of re-evaluating it per field.
            record = ast.literal_eval(line)
            if label_name in record:
                rows.append([record['question'], record[label_name]])

    if not rows:
        return np.empty((0, 2), dtype=str)
    return np.array(rows)

def is_TA_or_instructor_name(text):
    """Return True when ``text`` exactly equals one of the known staff first names.

    The comparison is an exact, case-sensitive set lookup against lower-case
    names.
    """
    known_names = {"adam", "victor", "jesse", "peter"}
    return text in known_names
    
def is_spelled_out_number(text):
    """Return True when ``w2n.word_to_num`` accepts ``text`` as a number.

    The conversion result itself is discarded; only whether the call raises
    ``ValueError`` matters. Any other exception propagates to the caller.
    """
    try:
        w2n.word_to_num(text)
    except ValueError:
        return False
    return True
    
def is_function(text):  # change to regex
    """Return True when ``text`` contains an empty call suffix ``()`` anywhere."""
    return '()' in text

def is_snake_case(text):  # change to regex
    """Return True when ``text`` contains at least one underscore."""
    return '_' in text

def is_filename(text):
    """Return True when ``text`` ends with a ``.c`` or ``.py`` extension.

    A suffix check is used rather than a substring check so that tokens that
    merely contain ".c" or ".py" somewhere (for example "example.com",
    "data.csv" or "cache.pyc") are not mistaken for C / Python source files.
    """
    return text.endswith(('.c', '.py'))
    
def preprocess_sentence(sentence):
    """Lower-case and tokenize a question, then collapse some tokens to placeholders.

    Tokens are matched, in priority order, as: a run ending in ``.c``, a run
    ending in ``.py``, a run ending in ``()``, or any run of characters that
    are not whitespace or one of ``; . ? , ! ( )``.

    Each token is then replaced by the first placeholder that applies:

    * ``"numericnumber"``    - ``str.isnumeric()`` is true
    * ``"nonnumericnumber"`` - ``is_spelled_out_number`` is true
    * ``"filename"``         - ``is_filename`` is true
    * ``"name"``             - ``is_TA_or_instructor_name`` is true

    The ``is_function`` / ``is_snake_case`` replacements ("function",
    "snakecase") are intentionally not applied at the moment.

    Args:
        sentence: Raw question text.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Raw string: the pattern text is unchanged, but "\s", "\." and "\(" are no
    # longer invalid string escape sequences (which newer Pythons warn about).
    tokenizer = RegexpTokenizer(
        r"[^;\s.?,!()]+\.c|[^;\s.,?!()]+\.py|[^;\s.?!(),]+\(\)|[^;\s.?,!()]+"
    )
    tokens = tokenizer.tokenize(sentence.lower())

    normalized = []
    for token in tokens:
        if token.isnumeric():
            token = "numericnumber"
        elif is_spelled_out_number(token):
            token = "nonnumericnumber"
        elif is_filename(token):
            token = "filename"
        elif is_TA_or_instructor_name(token):
            token = "name"
        normalized.append(token)

    return ' '.join(normalized)

    
def preprocess_data(x_data, vectorizer=None):
    """Turn raw question strings into a dense feature matrix.

    The features are the vectorizer's term counts followed by one extra
    column holding the whitespace-split word count of each raw sentence.

    Args:
        x_data: Iterable of raw question strings.
        vectorizer: An already fitted vectorizer to reuse (``transform`` only).
            When ``None``, a new ``CountVectorizer`` using
            ``preprocess_sentence`` as its preprocessor is fitted on ``x_data``.

    Returns:
        ``(X, vectorizer)`` where ``X`` is a 2-D ``np.ndarray`` with one row
        per input sentence and ``vectorizer`` is the (possibly newly fitted)
        vectorizer, to be passed back in when transforming other data.
    """
    # Explicit (n, 1) column so hstack also works when x_data is empty.
    sentence_lens = np.array(
        [len(sentence.split()) for sentence in x_data], dtype=int
    ).reshape(-1, 1)

    # Compare against None explicitly instead of relying on object truthiness.
    if vectorizer is not None:
        X = vectorizer.transform(x_data)
    else:
        vectorizer = CountVectorizer(preprocessor=preprocess_sentence)
#         vectorizer = TfidfVectorizer(preprocessor=preprocess_sentence)
        X = vectorizer.fit_transform(x_data)

    X = np.hstack((X.toarray(), sentence_lens))

    return X, vectorizer

In [166]:
# Fixed seed so the train/test split and the forest are reproducible on re-run.
SEED = 42

actual_question_data = load_data('../data/questions.json', 'actual_question')

# Column 0 holds the question text, column 1 the label.
train_data, test_data = train_test_split(actual_question_data, random_state=SEED)
X, vectorizer = preprocess_data(train_data[:, 0])
y = train_data[:, 1]
clf = RandomForestClassifier(random_state=SEED)
clf.fit(X, y)

# Reuse the vectorizer fitted on the training split so the columns line up.
X_test, _ = preprocess_data(test_data[:, 0], vectorizer)
y_test = test_data[:, 1]
y_preds = clf.predict(X_test)
# print("Cross Val Score: {}".format(cross_val_score(clf, X_test, y_test).mean()))
# These are fractions of correct predictions, i.e. accuracy (higher is better).
print("Training Accuracy: {}".format(np.mean(clf.predict(X) == y)))
print("Test Accuracy: {}".format(np.mean(y_preds == y_test)))


Training Error: 1.0
Test Error: 0.9577464788732394


In [167]:
# List every test question the classifier got wrong. "Prediction" is column 1
# of predict_proba, i.e. the probability of the second entry in clf.classes_.
wrong_rows = [row for row in range(len(y_preds)) if y_preds[row] != y_test[row]]
for row in wrong_rows:
    second_class_prob = clf.predict_proba([X_test[row]])[0][1]
    print("Question: {}, Prediction: {}, Actual: {}".format(test_data[row, 0], second_class_prob, y_test[row]))

Question: LRU conceptual asdfafsdaf, Prediction: 0.4, Actual: t
Question: TrieMap Movie Autocompleter, Prediction: 0.4, Actual: t
Question: Help with sigsegv_handler , Prediction: 0.28, Actual: t


In [168]:
# Persist the fitted classifier together with its vectorizer; the context
# manager guarantees the file is flushed and closed even if pickling fails.
with open('../models/actual_question_clf.pkl', 'wb') as model_file:
    pickle.dump((clf, vectorizer), model_file)

In [169]:
def actual_question_prob(question):
    """Score a single question with the pickled classifier.

    The ``(clf, vectorizer)`` pair is loaded from
    ``'../models/actual_question_clf.pkl'`` on every call.

    Args:
        question: Raw question text.

    Returns:
        Column 1 of ``clf.predict_proba`` for this question, i.e. the
        probability of the second entry in ``clf.classes_``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # were produced by this notebook / a trusted source.
    with open('../models/actual_question_clf.pkl', 'rb') as model_file:
        clf, vectorizer = pickle.load(model_file)
    features, _ = preprocess_data([question], vectorizer)
    return clf.predict_proba(features)[0][1]

In [177]:
# Show the label order first, then one 2x2 confusion matrix per label
# (scikit-learn documents each matrix as [[TN, FP], [FN, TP]]).
label_order = clf.classes_
print(label_order)
per_label_confusion = multilabel_confusion_matrix(y_test, y_preds)
print(per_label_confusion)

['f' 't']
[[[54  3]
  [ 0 14]]

 [[14  0]
  [ 3 54]]]
