In [1]:
import ast

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from word2number import w2n

In [3]:
def load_data(filename, label_name):
    """Load ``[question, label]`` pairs from a file holding one dict literal per line.

    Every non-blank line is parsed exactly once with ``ast.literal_eval``,
    which only accepts Python literals. ``eval`` is deliberately avoided: it
    would execute any expression that happens to be stored in the data file.

    Args:
        filename: Path of the data file to read.
        label_name: Key of the label to extract. Records that do not contain
            this key are skipped.

    Returns:
        A NumPy array with one ``[question, label]`` row per kept record, in
        file order.

    Raises:
        ValueError, SyntaxError: If a non-blank line is not a valid Python literal.
        KeyError: If a kept record has no ``'question'`` key.
    """
    with open(filename, 'r') as f:
        # Blank lines are skipped instead of aborting the whole load.
        records = [ast.literal_eval(line) for line in f.read().splitlines() if line.strip()]

    return np.array([[record['question'], record[label_name]]
                     for record in records
                     if label_name in record])

def is_TA_or_instructor_name(text):
    """Return True when ``text`` exactly equals one of the hard-coded TA/instructor names.

    The comparison is an exact, case-sensitive set membership test.
    """
    return text in {"adam", "victor", "jesse", "peter"}
    
def is_spelled_out_number(text):
    """Report whether ``w2n.word_to_num`` can convert ``text`` into a number.

    Returns True when the conversion succeeds and False when it raises
    ``ValueError``; the converted value itself is discarded.
    """
    try:
        w2n.word_to_num(text)
    except ValueError:
        return False
    else:
        return True
    
def is_function(text):  # TODO: change to regex
    """Return True when ``text`` contains the substring ``'()'``."""
    return '()' in text

def is_snake_case(text):  # TODO: change to regex
    """Return True when ``text`` contains at least one underscore."""
    return '_' in text

def is_filename(text):
    """Return True when ``text`` contains ``'.c'`` or ``'.py'`` anywhere in it.

    This is a plain substring test, not a suffix test, so e.g. ``'a.cpp'``
    also counts.
    """
    return any(marker in text for marker in ('.c', '.py'))
    
def preprocess_sentence(sentence):
    """Lower-case and tokenize a sentence, replacing some tokens with placeholders.

    Each token is checked in this order and replaced by the first match:
      * ``str.isnumeric()`` is true            -> ``"numericnumber"``
      * ``is_spelled_out_number`` is true      -> ``"nonnumericnumber"``
      * ``is_filename`` is true                -> ``"filename"``
      * ``is_TA_or_instructor_name`` is true   -> ``"name"``
    Any other token is kept unchanged.

    Args:
        sentence: Raw sentence text.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Raw string literal: the pattern relies on regex escapes (\s, \., \(, \))
    # that are invalid escape sequences in a normal string literal and make
    # Python emit a warning. The compiled pattern is unchanged.
    # Alternatives, tried in order: "<chars>.c", "<chars>.py", "<chars>()",
    # then any run of characters other than whitespace and ; . ? , ! ( )
    tokenizer = RegexpTokenizer(r"[^;\s.?,!()]+\.c|[^;\s.,?!()]+\.py|[^;\s.?!(),]+\(\)|[^;\s.?,!()]+")

    processed = []
    for token in tokenizer.tokenize(sentence.lower()):
        if token.isnumeric():
            token = "numericnumber"
        elif is_spelled_out_number(token):
            token = "nonnumericnumber"
        elif is_filename(token):
            token = "filename"
        elif is_TA_or_instructor_name(token):
            token = "name"
        # NOTE: the is_function -> "function" and is_snake_case -> "snakecase"
        # replacements are currently disabled.
        processed.append(token)

    return ' '.join(processed)

from sklearn.base import BaseEstimator, TransformerMixin

class questionLengthExtractor(BaseEstimator, TransformerMixin):
    """Transformer that emits one feature per sentence: its whitespace-separated token count."""

    def __init__(self):
        # No parameters to configure.
        pass

    def fit(self, X, y=None):
        """Learn nothing from ``X``; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list with one single-element ``[token_count]`` row per sentence in ``X``."""
        lengths = []
        for sentence in X:
            token_count = len(sentence.split())
            lengths.append([token_count])
        return lengths
    
def preprocess_data(x_data, vectorizer=None):
    """Vectorize sentences and append each sentence's word count as the last column.

    Args:
        x_data: Sequence of sentence strings. It is iterated more than once,
            so it must not be a one-shot iterator.
        vectorizer: Vectorizer to reuse; only its ``transform`` method is
            called. When ``None``, a new ``CountVectorizer`` that uses
            ``preprocess_sentence`` as its preprocessor is fitted on ``x_data``.

    Returns:
        A tuple ``(X, vectorizer)``: ``X`` is a dense 2-D array holding the
        vectorizer output followed by one column with the number of
        whitespace-separated tokens per sentence; ``vectorizer`` is the
        vectorizer that produced it (the supplied one or the newly fitted one).
    """
    # reshape(-1, 1) keeps the length column 2-D even for empty input, so the
    # hstack below does not fail on a dimension mismatch.
    sentence_lens = np.array(
        [len(sentence.split()) for sentence in x_data], dtype=int
    ).reshape(-1, 1)

    # Explicit None check: a supplied vectorizer must be reused even if the
    # object happens to evaluate as falsy (e.g. it defines __len__).
    if vectorizer is not None:
        X = vectorizer.transform(x_data)
    else:
        # Alternative weighting: TfidfVectorizer(preprocessor=preprocess_sentence)
        vectorizer = CountVectorizer(preprocessor=preprocess_sentence)
        X = vectorizer.fit_transform(x_data)

    return np.hstack((X.toarray(), sentence_lens)), vectorizer

In [4]:
# Build one [question, label] array per label from the same questions file.
question_type_data, actual_question_data = (
    load_data('../data/questions.json', label)
    for label in ('question_type', 'actual_question')
)







