In [1]:
#from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import pickle            ## reading models from disk
import nltk, string      ## For tokenizer

## Problem texts can be long; lift the column-width limit so Jupyter shows them in full.
## `None` means "no limit" on pandas >= 1.0; the old `-1` spelling is rejected by current pandas.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    ## pandas < 1.0 only accepts integers here and uses -1 for "no limit"
    pd.set_option('display.max_colwidth', -1)

## Defining the tokenizer & reading pickled fitted vectorizers

In [2]:
## The tokenizer need not be pickled: it is the same for all subjects, so it is defined here.
## Module-level Porter stemmer shared by stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()
## Translation table for str.translate(): maps every ASCII punctuation code point to None (i.e. deletes it)
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}
def stem_tokens(tokens):
    """Return a list holding the stem of each item in `tokens`, in the same order.

    Stemming is done by the module-level `stemmer`.
    """
    return list(map(stemmer.stem, tokens))
def normalize(text):
    """Turn a raw string into a list of stemmed tokens.

    Punctuation is removed with `remove_punctuation_map`, the result is split
    with `nltk.word_tokenize`, and every token is stemmed by `stem_tokens`.
    """
    without_punctuation = text.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)

In [3]:
## Load one fitted vectorizer per subject code (mth, phy, chm) from .pkl files in the working directory.
## NOTE: pickle.load can execute arbitrary code -- only load .pkl files that come from a trusted source.
## NOTE(review): presumably these vectorizers were fitted with `normalize` as their tokenizer, in which
## case `normalize` must already be defined when they are unpickled -- TODO confirm.
with open('vec_mth.pkl', 'rb') as f:
    vec_mth = pickle.load(f)
with open('vec_phy.pkl', 'rb') as f:
    vec_phy = pickle.load(f)
with open('vec_chm.pkl', 'rb') as f:
    vec_chm = pickle.load(f)

## Reading pickled classifier models

In [4]:
## Classifiers: load one pickled model per subject code (mth, phy, chm) from the working directory.
## NOTE: pickle.load can execute arbitrary code -- only load .pkl files that come from a trusted source.
with open('clf_mth.pkl', 'rb') as f:
    clf_mth = pickle.load(f)
with open('clf_phy.pkl', 'rb') as f:
    clf_phy = pickle.load(f)
with open('clf_chm.pkl', 'rb') as f:
    clf_chm = pickle.load(f)

## Set vectorizer & classifier based on input subject

In [5]:
def vec_by_subject(argument):
    """Return the fitted vectorizer for a subject code.

    Parameters
    ----------
    argument : str
        Subject code: 'phy', 'chm' or 'mth'.

    Returns
    -------
    The module-level vectorizer for that subject (`vec_phy`, `vec_chm` or
    `vec_mth`). Any other value falls back to `vec_mth`.
    """
    switcher = {
        'phy': vec_phy,
        'chm': vec_chm,
        'mth': vec_mth,
    }
    # The fallback has to be the vectorizer object itself: returning the key
    # string "mth" would hand the caller a str that has no .transform() method.
    return switcher.get(argument, vec_mth)

In [6]:
def clf_by_subject(argument):
    """Return the fitted classifier for a subject code.

    Parameters
    ----------
    argument : str
        Subject code: 'phy', 'chm' or 'mth'.

    Returns
    -------
    The module-level classifier for that subject (`clf_phy`, `clf_chm` or
    `clf_mth`). Any other value falls back to `clf_mth`.
    """
    switcher = {
        'phy': clf_phy,
        'chm': clf_chm,
        'mth': clf_mth,
    }
    # The fallback has to be the classifier object itself: returning the key
    # string "mth" would hand the caller a str that has no .predict() method.
    return switcher.get(argument, clf_mth)

In [7]:
## Choose the subject once so the vectorizer and the classifier can never get out of sync
SUBJECT = 'chm'
vec = vec_by_subject(SUBJECT)
clf = clf_by_subject(SUBJECT)
clf    ## display the selected classifier

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=3,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [8]:
## Sample question to classify; the commented-out line is an alternative example input
#text = 'what is the heat capacity of plastic'
text = 'what is the chemical formula of aldol?'

## Testing pickled models

In [9]:
def chapter_clf_model(text):
    """Predict the chapter label for a single question string.

    Uses the module-level `vec` (vectorizer) and `clf` (classifier) selected
    earlier in the notebook.

    Note from the original author: this combined function could not be pickled.

    Parameters
    ----------
    text : str
        The question text to classify.

    Returns
    -------
    str
        The predicted labels joined into one string; with a single input
        this is just the one predicted label.
    """
    ## The vectorizer expects an iterable of documents, so wrap the lone string in a list
    features = vec.transform([text])
    labels = clf.predict(features)
    ## Collapse the prediction array into a plain string
    return ''.join(labels)

## Prediction

In [10]:
## Classify the sample question with the currently selected vectorizer and classifier
chapter_clf_model(text)

'Aldehydes and Ketones'