# Classification for Math Problems

In [None]:
# Mount Google Drive at /content/drive; the Exploratory Data Analysis section reads data.csv from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Classify images of math problems into categories.

## Set Up the Notebook

In [None]:
!pip install datasets
from datasets import load_dataset
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from ast import literal_eval
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier


Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 18.8 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 512 kB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 70.3 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 56.1 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 36.9 MB/s 
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (192 kB)
[K     |█████████

## Data Preparation

Don't run this section every time: it downloads and samples the full dataset. Once `data.csv` has been saved, skip ahead and simply load the CSV file.

In [None]:
# all math problem categories, written as topic -> task names.
# Each full category name is "<topic>__<task>"; topics and tasks are listed alphabetically.
tasks_by_topic = {
    "algebra": [
        "linear_1d", "linear_1d_composed",
        "linear_2d", "linear_2d_composed",
        "polynomial_roots", "polynomial_roots_composed",
        "sequence_next_term", "sequence_nth_term",
    ],
    "arithmetic": [
        "add_or_sub", "add_or_sub_in_base", "add_sub_multiple",
        "div", "mixed", "mul", "mul_div_multiple",
        "nearest_integer_root", "simplify_surd",
    ],
    "calculus": ["differentiate", "differentiate_composed"],
    "comparison": [
        "closest", "closest_composed",
        "kth_biggest", "kth_biggest_composed",
        "pair", "pair_composed",
        "sort", "sort_composed",
    ],
    "measurement": ["conversion", "time"],
    "numbers": [
        "base_conversion",
        "div_remainder", "div_remainder_composed",
        "gcd", "gcd_composed",
        "is_factor", "is_factor_composed",
        "is_prime", "is_prime_composed",
        "lcm", "lcm_composed",
        "list_prime_factors", "list_prime_factors_composed",
        "place_value", "place_value_composed",
        "round_number", "round_number_composed",
    ],
    "polynomials": [
        "add", "coefficient_named", "collect", "compose",
        "evaluate", "evaluate_composed", "expand", "simplify_power",
    ],
    "probability": ["swr_p_level_set", "swr_p_sequence"],
}

# Flatten the mapping into the list of full category names, keeping the order above.
categories = [
    f"{topic}__{task}"
    for topic, task_names in tasks_by_topic.items()
    for task in task_names
]

In [None]:
sample_size = 10000  # number of rows drawn from each category

combined_dataset = []  # one sampled dataframe per category
for category in categories:
  # load the train split, sample it reproducibly, label it, and drop the unused answer column
  sampled = (
      load_dataset('math_dataset', category, split='train')
      .to_pandas()
      .sample(n=sample_size, random_state=1)
      .assign(category=category)
      .drop(columns='answer')
  )
  combined_dataset.append(sampled)

# stack all categories into a single dataframe with a fresh 0..n-1 index
df = pd.concat(combined_dataset, ignore_index=True)

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading and preparing dataset math_dataset/algebra__linear_1d (download: 2.17 GiB, generated: 88.31 MiB, post-processed: Unknown size, total: 2.26 GiB) to /root/.cache/huggingface/datasets/math_dataset/algebra__linear_1d/1.0.0/b50c178104db3805dba98dde3ae6a2bec787fcdd308e0b924c69067217104a6c...


Downloading:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
# Preview the first rows of the combined, sampled dataframe.
df.head()

NameError: ignored

In [None]:
# add a column: a list of words obtained by tokenizing the question string.
def _tokenize_without_digits(question):
  """Remove every digit character from `question`, then split it into NLTK word tokens."""
  return word_tokenize(''.join(ch for ch in question if not ch.isdigit()))


# Build the whole column in one pass. Seeding the column with the np.empty function object
# and filling it through chained indexing (df['tokens'][ind] = ...) raises
# SettingWithCopyWarning and is not guaranteed to write back into df.
df['tokens'] = df['question'].apply(_tokenize_without_digits)

In [None]:
# Summarise column names, dtypes and non-null counts of the dataframe.
df.info()

In [None]:
# one-hot encode the category label: one indicator column per distinct category
encoded = df['category'].pipe(pd.get_dummies)
# attach the indicator columns to the original frame, matched on the row index
df = df.join(encoded, how='left')

In [None]:
# save as csv
# index=False leaves the row index out of the written file.
# NOTE(review): this writes 'data.csv' to the current working directory, while the EDA section
# reads '/content/drive/MyDrive/Colab Notebooks/data.csv' — confirm the file is copied there.
df.to_csv('data.csv', index = False)

## Exploratory Data Analysis

In [None]:
# read data from csv
# (the path lives under the Google Drive mount, so Drive must be mounted first)
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data.csv') # load from csv
# convert tokens column from string to list
# (currently disabled — presumably 'tokens' therefore stays as the string form of each list; verify before using it)
#df['tokens'] = df['tokens'].apply(literal_eval)

Examine the head of the dataframe.

In [None]:
# Drop the stored 'tokens' column. errors='ignore' keeps the cell safe to re-run:
# a plain drop raises KeyError once the column is already gone.
df = df.drop(columns='tokens', errors='ignore')

In [None]:
# Preview the first rows of the dataframe loaded from the CSV.
df.head()

Unnamed: 0,question,category,algebra__linear_1d,algebra__linear_1d_composed,algebra__linear_2d,algebra__linear_2d_composed,algebra__polynomial_roots,algebra__polynomial_roots_composed,algebra__sequence_next_term,algebra__sequence_nth_term,arithmetic__add_or_sub,arithmetic__add_or_sub_in_base,arithmetic__add_sub_multiple,arithmetic__div,arithmetic__mixed,arithmetic__mul,arithmetic__mul_div_multiple,arithmetic__nearest_integer_root,arithmetic__simplify_surd,calculus__differentiate,calculus__differentiate_composed,comparison__closest,comparison__closest_composed,comparison__kth_biggest,comparison__kth_biggest_composed,comparison__pair,comparison__pair_composed,comparison__sort,comparison__sort_composed,measurement__conversion,measurement__time,numbers__base_conversion,numbers__div_remainder,numbers__div_remainder_composed,numbers__gcd,numbers__gcd_composed,numbers__is_factor,numbers__is_factor_composed,numbers__is_prime,numbers__is_prime_composed,numbers__lcm,numbers__lcm_composed,numbers__list_prime_factors,numbers__list_prime_factors_composed,numbers__place_value,numbers__place_value_composed,numbers__round_number,numbers__round_number_composed,polynomials__add,polynomials__coefficient_named,polynomials__collect,polynomials__compose,polynomials__evaluate,polynomials__evaluate_composed,polynomials__expand,polynomials__simplify_power,probability__swr_p_level_set,probability__swr_p_sequence
0,Solve -7*q - 416 = -472 for q.,algebra__linear_1d,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Solve 37*c + 29125 = -41*c + 56558 - 31177 for c.,algebra__linear_1d,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Solve 29*u + 17*u = 30*u + 128 for u.,algebra__linear_1d,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Solve 63*b + 3292 - 3355 = 0 for b.,algebra__linear_1d,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Solve 217*u - 73*u + 561 = -591 for u.,algebra__linear_1d,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Examine one question from each category.

In [None]:
# Draw one random question per category (no random_state is set, so the rows shown differ between runs).
df.groupby("category").sample(n=1)

Unnamed: 0,question,category,tokens,algebra__linear_1d,algebra__linear_1d_composed,algebra__linear_2d,algebra__linear_2d_composed,algebra__polynomial_roots,algebra__polynomial_roots_composed,algebra__sequence_next_term,algebra__sequence_nth_term,arithmetic__add_or_sub,arithmetic__add_or_sub_in_base,arithmetic__add_sub_multiple,arithmetic__div,arithmetic__mixed,arithmetic__mul,arithmetic__mul_div_multiple,arithmetic__nearest_integer_root,arithmetic__simplify_surd,calculus__differentiate,calculus__differentiate_composed,comparison__closest,comparison__closest_composed,comparison__kth_biggest,comparison__kth_biggest_composed,comparison__pair,comparison__pair_composed,comparison__sort,comparison__sort_composed,measurement__conversion,measurement__time,numbers__base_conversion,numbers__div_remainder,numbers__div_remainder_composed,numbers__gcd,numbers__gcd_composed,numbers__is_factor,numbers__is_factor_composed,numbers__is_prime,numbers__is_prime_composed,numbers__lcm,numbers__lcm_composed,numbers__list_prime_factors,numbers__list_prime_factors_composed,numbers__place_value,numbers__place_value_composed,numbers__round_number,numbers__round_number_composed,polynomials__add,polynomials__coefficient_named,polynomials__collect,polynomials__compose,polynomials__evaluate,polynomials__evaluate_composed,polynomials__expand,polynomials__simplify_power,probability__swr_p_level_set,probability__swr_p_sequence
2662,Solve 9*q - 15*q - 33*q - 868 = 23*q for q.,algebra__linear_1d,"[Solve, *q, -, *q, -, *q, -, =, *q, for, q, .]",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16479,Let d = 19 + -16. Let g be (4/6)/((-6)/(-9)). ...,algebra__linear_1d_composed,"[Let, d, =, +, -, ., Let, g, be, (, /, ), /, (...",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27227,"Solve 4*l = -4*y + 324, -3658*l + 3604*l + y =...",algebra__linear_2d,"[Solve, *l, =, -*y, +, ,, -*l, +, *l, +, y, =,...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39465,"Suppose 4*q - 101 = 3*v, 87 = 3*q + 3*v + 6. L...",algebra__linear_2d_composed,"[Suppose, *q, -, =, *v, ,, =, *q, +, *v, +, .,...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
44516,Solve 7*v**5 - 25027*v**4 + 281680*v**3 - 4417...,algebra__polynomial_roots,"[Solve, *v**, -, *v**, +, *v**, -, *v**, -, *v...",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
56724,Let d(u) = u**3 + 2 - u**2 - 2 - u. Suppose 5*...,algebra__polynomial_roots_composed,"[Let, d, (, u, ), =, u**, +, -, u**, -, -, u, ...",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
68834,"What comes next: -3345, -4904, -7479, -11052, ...",algebra__sequence_next_term,"[What, comes, next, :, -, ,, -, ,, -, ,, -, ,,...",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
74614,"What is the k'th term of -108738, -108726, -10...",algebra__sequence_nth_term,"[What, is, the, k'th, term, of, -, ,, -, ,, -, ?]",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
82381,13887 + -31205112.4,arithmetic__add_or_sub,"[+, -, .]",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
94615,"In base 13, what is 4 + -1b?",arithmetic__add_or_sub_in_base,"[In, base, ,, what, is, +, -b, ?]",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# sample 1000 rows from each category to save memory
def _tokens_without_digits(question):
  """Remove every digit character from `question`, then split it into NLTK word tokens."""
  return word_tokenize(''.join(ch for ch in question if not ch.isdigit()))


# The token lists are rebuilt from the question text instead of selecting df['tokens']:
# that column is dropped by an earlier cell, and tokens loaded from the CSV are strings
# rather than lists, which explode() cannot split into rows.
# random_state pins the draw so the frequency tables below are reproducible.
df_sampled = (
    df.groupby('category')
    .sample(n=1000, random_state=1)
    .assign(tokens=lambda sampled: sampled['question'].apply(_tokens_without_digits))
    .loc[:, ['tokens', 'category']]
)

In [None]:
# explode on tokens
# (each element of a token list gets its own row; 'category' is repeated on every such row)
df_sampled = df_sampled.explode('tokens')

In [None]:
# keep only tokens longer than one character (single letters and lone symbols are removed)
has_multiple_chars = df_sampled['tokens'].str.len().gt(1)
df_sampled = df_sampled.loc[has_multiple_chars]

Examine the 20 most frequent words.

In [None]:
# Count how often each token occurs across all sampled rows and show the 20 most frequent.
df_sampled['tokens'].value_counts().head(20)

Let           32029
the           24224
is            24058
What          18406
of            18318
be             9858
Suppose        9434
and            7118
to             7053
in             6829
Calculate      5479
-/             4895
Solve          4221
for            4108
Is             4093
common         4000
derivative     3634
sqrt           3246
base           3000
by             3000
Name: tokens, dtype: int64

Examine the 10 most frequent words in each category.

In [None]:
# Take the category names from the data instead of keeping a second hard-coded copy of the
# list, so this cell also works when the Data Preparation section was skipped.
# Sorting yields the same alphabetical order as the hand-written list.
categories = sorted(df['category'].unique())

# Map each category to its (up to) 10 most frequent tokens, most frequent first.
top_10_each_category = {
    category: (
        df_sampled.loc[df_sampled['category'] == category, 'tokens']
        .value_counts()
        .head(10)
        .index
        .to_list()
    )
    for category in categories
}
top_10_each_category

{'algebra__linear_1d': ['Solve',
  'for',
  '*b',
  '*m',
  '*s',
  '*y',
  '*z',
  '*d',
  '*w',
  '*c'],
 'algebra__linear_1d_composed': ['Let',
  'Solve',
  'for',
  'be',
  'Suppose',
  '*z',
  '*u',
  '*w',
  '*s',
  '*i'],
 'algebra__linear_2d': ['Solve',
  'for',
  '*i',
  '*d',
  '*k',
  '*p',
  '*n',
  '*l',
  '*c',
  '*m'],
 'algebra__linear_2d_composed': ['Let',
  'for',
  'Solve',
  'be',
  'Suppose',
  '*b',
  '*r',
  '*z',
  '*f',
  '*j'],
 'algebra__polynomial_roots': ['Factor',
  'that',
  'is',
  'What',
  'given',
  'Find',
  'Suppose',
  'Solve',
  'Let',
  'Determine'],
 'algebra__polynomial_roots_composed': ['Let',
  'be',
  'Factor',
  'derivative',
  'of',
  'the',
  'Suppose',
  'that',
  'first',
  'second'],
 'algebra__sequence_next_term': ['next',
  'What',
  'is',
  'in',
  'term',
  'the',
  'comes'],
 'algebra__sequence_nth_term': ['term',
  'the',
  'What',
  'is',
  'of',
  "i'th",
  "j'th",
  "n'th",
  "l'th",
  "f'th"],
 'arithmetic__add_or_sub': ['is'

Compare frequencies of some key words across different categories.

In [None]:
#word search per category: Calculate
# Work on the two needed columns only. Casting the whole frame with df.astype(str) and then
# summing it per group leaves every one-hot column as a string, which pandas either drops or
# concatenates depending on its version.
df_words = df[['category', 'question']].astype(str)
# True where the question text contains the word
df_words['Word Count'] = df_words['question'].str.contains('Calculate')
# number of matching questions per category (summing booleans counts the True values)
df_zero = df_words.groupby(by ='category')[['Word Count']].sum()
# keep only the categories in which the word occurs at least once
df_nonzero = df_zero[df_zero['Word Count'] != 0]
df_nonzero

Unnamed: 0_level_0,Word Count
category,Unnamed: 1_level_1
algebra__polynomial_roots,1143
algebra__polynomial_roots_composed,956
arithmetic__add_or_sub,1049
arithmetic__add_sub_multiple,1970
arithmetic__div,2420
arithmetic__mixed,1962
arithmetic__mul,1089
arithmetic__mul_div_multiple,1997
numbers__div_remainder,5017
numbers__div_remainder_composed,5048


In [None]:
#word search per category: Derivative
# Work on the two needed columns only. Casting the whole frame with df.astype(str) and then
# summing it per group leaves every one-hot column as a string, which pandas either drops or
# concatenates depending on its version.
df_words = df[['category', 'question']].astype(str)
# True where the question text contains the (lower-case) word
df_words['Word Count'] = df_words['question'].str.contains('derivative')
# number of matching questions per category (summing booleans counts the True values)
df_zero = df_words.groupby(by ='category')[['Word Count']].sum()
# keep only the categories in which the word occurs at least once
df_nonzero = df_zero[df_zero['Word Count'] != 0]
df_nonzero

Unnamed: 0_level_0,Word Count
category,Unnamed: 1_level_1
algebra__linear_1d_composed,265
algebra__linear_2d_composed,200
algebra__polynomial_roots_composed,3912
calculus__differentiate,8733
calculus__differentiate_composed,8937
comparison__closest_composed,30
comparison__kth_biggest_composed,22
comparison__pair_composed,156
comparison__sort_composed,92
numbers__div_remainder_composed,193


In [None]:
#word search per category: Divide
# Work on the two needed columns only. Casting the whole frame with df.astype(str) and then
# summing it per group leaves every one-hot column as a string, which pandas either drops or
# concatenates depending on its version.
df_words = df[['category', 'question']].astype(str)
# True where the question text contains the (lower-case) word
df_words['Word Count'] = df_words['question'].str.contains('divide')
# number of matching questions per category (summing booleans counts the True values)
df_zero = df_words.groupby(by ='category')[['Word Count']].sum()
# keep only the categories in which the word occurs at least once
df_nonzero = df_zero[df_zero['Word Count'] != 0]
df_nonzero

Unnamed: 0_level_0,Word Count
category,Unnamed: 1_level_1
arithmetic__div,7479
numbers__div_remainder,10000
numbers__div_remainder_composed,10000
numbers__is_factor,3278
numbers__is_factor_composed,3288


## Building Multiple Machine Learning Pipelines

In [None]:
from datetime import datetime
from sqlalchemy import create_engine
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, make_scorer, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import cross_val_score, RandomizedSearchCV,  RepeatedStratifiedKFold, GridSearchCV
from lightgbm import LGBMClassifier, plot_importance
from sklearn import set_config

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


  "Since version 1.0, "


In [None]:
# tokenize function
def tokenize(text):
    """Split `text` into NLTK word tokens and normalise each one.

    Every token is lemmatized with WordNetLemmatizer, then lower-cased and stripped of
    surrounding whitespace.

    Args:
        text: the string to tokenize.

    Returns:
        list of normalised tokens, in the order they appear in `text`.
    """
    lemmatizer = WordNetLemmatizer()
    # NOTE(review): lemmatization runs on the original casing, before lower() — confirm this order is intended.
    return [lemmatizer.lemmatize(token).lower().strip() for token in word_tokenize(text)]


Build two pipelines with the LightGBM classifier: one wrapped in a multi-output classifier and one wrapped in a classifier chain.

In [None]:
from sklearn.preprocessing import FunctionTransformer

def _tfidf_pipeline(step_name, classifier):
    """Return a two-step Pipeline: TF-IDF features built with `tokenize`, then `classifier`.

    A new TfidfVectorizer is created on every call, so pipelines never share a vectorizer.
    """
    return Pipeline([
        ('tfidfvect', TfidfVectorizer(tokenizer=tokenize)),
        (step_name, classifier),
    ])


# (The HistGradientBoosting and RandomForest variants that were kept here as commented-out
# code have been removed; only the LightGBM pipelines are built.)

# LightGBM wrapped in MultiOutputClassifier
nlp_mo_pipeline_lgb = _tfidf_pipeline(
    'multiclassifier', MultiOutputClassifier(LGBMClassifier(n_jobs=-1))
)

# LightGBM wrapped in ClassifierChain
nlp_chain_nlp_pipeline_lgb = _tfidf_pipeline(
    'classifierchain', ClassifierChain(LGBMClassifier(n_jobs=-1))
)

In [None]:
# Candidate pipelines keyed by name; the names label the cross-validation results.
# The 'nlp_mo_pipeline_lgb' key used to end with a stray space, which showed up in the
# printed results and made a lookup by the plain name fail.
pipeline_dict = {
    'nlp_mo_pipeline_lgb': nlp_mo_pipeline_lgb,
    'nlp_chain_nlp_pipeline_lgb': nlp_chain_nlp_pipeline_lgb,
}

In [None]:
# Features: the raw question text.
X = df['question']
# Labels: the one-hot category columns. They are selected by excluding the known non-label
# columns rather than by position (iloc[:, 2:]), so Y stays correct whether or not the
# 'tokens' column is still present; errors='ignore' tolerates it being absent.
Y = df.drop(columns=['question', 'category', 'tokens'], errors='ignore')

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the split so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
#use F score to measure accuracy
f1_results = {}  # pipeline name -> mean cross-validated F1
for pipename, pipevalue in pipeline_dict.items():
    print(f"Training pipeline : {pipename} ...")
    display(pipevalue)
    # 3-fold cross-validation on the training split, scored with weighted F1
    scores = cross_val_score(pipevalue, X_train, y_train, cv=3, scoring='f1_weighted')
    mean_f1 = scores.mean()
    f1_results[pipename] = mean_f1
    print(f"Pipeline : {pipename} F1 mean score {mean_f1}")

Training pipeline : nlp_mo_pipeline_lgb  ...


Pipeline(steps=[('tfidfvect',
                 TfidfVectorizer(tokenizer=<function tokenize at 0x7fb88e7dc0e0>)),
                ('multiclassifier',
                 MultiOutputClassifier(estimator=LGBMClassifier()))])

Pipeline : nlp_mo_pipeline_lgb  F1 mean score 0.9496990926490598
Training pipeline : nlp_chain_nlp_pipeline_lgb ...


Pipeline(steps=[('tfidfvect',
                 TfidfVectorizer(tokenizer=<function tokenize at 0x7fb88e7dc0e0>)),
                ('classifierchain',
                 ClassifierChain(base_estimator=LGBMClassifier()))])



Pipeline : nlp_chain_nlp_pipeline_lgb F1 mean score 0.9484264813520733


In [None]:
# Show the mean cross-validated F1 score recorded for each pipeline name.
f1_results

{'nlp_chain_nlp_pipeline_lgb': 0.9484264813520733,
 'nlp_mo_pipeline_lgb ': 0.9496990926490598}

In [None]:
# Pick the pipeline with the highest mean cross-validated F1 score.
# The name is derived here so the cell does not rely on a variable left over from an
# earlier kernel session (best_pipline_name was otherwise undefined on a fresh run).
best_pipline_name = max(f1_results, key=f1_results.get)
best_pipline = pipeline_dict[best_pipline_name]
best_pipline

Pipeline(steps=[('tfidfvect',
                 TfidfVectorizer(tokenizer=<function tokenize at 0x7fb88e7dc0e0>)),
                ('multiclassifier',
                 MultiOutputClassifier(estimator=LGBMClassifier()))])

In [None]:
# List the parameter names exposed by the selected pipeline and its nested steps.
best_pipline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'tfidfvect', 'multiclassifier', 'tfidfvect__analyzer', 'tfidfvect__binary', 'tfidfvect__decode_error', 'tfidfvect__dtype', 'tfidfvect__encoding', 'tfidfvect__input', 'tfidfvect__lowercase', 'tfidfvect__max_df', 'tfidfvect__max_features', 'tfidfvect__min_df', 'tfidfvect__ngram_range', 'tfidfvect__norm', 'tfidfvect__preprocessor', 'tfidfvect__smooth_idf', 'tfidfvect__stop_words', 'tfidfvect__strip_accents', 'tfidfvect__sublinear_tf', 'tfidfvect__token_pattern', 'tfidfvect__tokenizer', 'tfidfvect__use_idf', 'tfidfvect__vocabulary', 'multiclassifier__estimator__boosting_type', 'multiclassifier__estimator__class_weight', 'multiclassifier__estimator__colsample_bytree', 'multiclassifier__estimator__importance_type', 'multiclassifier__estimator__learning_rate', 'multiclassifier__estimator__max_depth', 'multiclassifier__estimator__min_child_samples', 'multiclassifier__estimator__min_child_weight', 'multiclassifier__estimator__min_split_gain', 'multiclass

## Model Evaluation

In [None]:
# Fit the selected pipeline on the full training split.
result = best_pipline.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out questions and print precision / recall / F1 per category;
# the label column names of y_test are used as the row names of the report.
predictions = best_pipline.predict(X_test)
print(classification_report(y_test, predictions, target_names= y_test.columns.to_list()))

                                      precision    recall  f1-score   support

                  algebra__linear_1d       0.95      1.00      0.97      1938
         algebra__linear_1d_composed       0.84      0.85      0.84      1973
                  algebra__linear_2d       1.00      1.00      1.00      1949
         algebra__linear_2d_composed       0.86      0.85      0.85      2084
           algebra__polynomial_roots       0.93      0.47      0.62      2099
  algebra__polynomial_roots_composed       0.68      0.91      0.78      1915
         algebra__sequence_next_term       1.00      1.00      1.00      1980
          algebra__sequence_nth_term       1.00      1.00      1.00      2049
              arithmetic__add_or_sub       1.00      0.79      0.88      2056
      arithmetic__add_or_sub_in_base       1.00      1.00      1.00      1989
        arithmetic__add_sub_multiple       0.85      0.99      0.91      1955
                     arithmetic__div       1.00      1.00      

  _warn_prf(average, modifier, msg_start, len(result))
