In [3]:
import pandas as pd
import numpy as np

from scipy.sparse import hstack, csr_matrix, vstack

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, train_test_split
from sklearn.metrics import f1_score

from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import *
from sklearn.semi_supervised import *

from tqdm import *

import matplotlib.pyplot as plt
import gc
import dill

import lightgbm as lgb
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:
# Read the question dataset, drop the 'answers_encoded' column and name the
# two remaining columns.
df = pd.read_csv("questions.csv").drop(['answers_encoded'], axis=1)
df.columns = ['label', 'content']
# Shuffle the rows with a fixed seed: an unseeded shuffle would change the row
# order on every run, and with it the rows picked by the later seeded split.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
df.head()

Unnamed: 0,label,content
0,H06,cho da giac loi n dinh left n > 0 right . so t...
1,H08,"trong khong gian text ox yz , cho hai duong th..."
2,H08,"trong khong gian, cho mat cau left s right : l..."
3,H08,"trong khong gian voi he toa do oxyz , cho vect..."
4,D01,"tim tap gia tri lon nhat, gia tri nho nhat cua..."


In [5]:
# Distinct label values, in order of first appearance.
df.label.unique()

array(['H06', 'H08', 'D01', 'D08', 'D07', 'H07', 'H03', 'D09', 'H05',
       'D05', 'D02', 'D03', 'D06', 'H01', 'H02', 'D10', 'H04', 'D04'],
      dtype=object)

In [6]:
# Number of rows in the dataset.
len(df.index)

9829

In [7]:
# Number of non-null entries per label.
df.groupby(by=['label']).count()

Unnamed: 0_level_0,content
label,Unnamed: 1_level_1
D01,727
D02,494
D03,293
D04,305
D05,675
D06,436
D07,1529
D08,655
D09,749
D10,436


In [8]:
# Hold out 20% of the rows for testing, keeping the label proportions the same
# in both parts (stratified on 'label').
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=0,
)

Load train và test data

In [9]:
# Load data
# train_df = pd.read_csv("train.csv")
# test_df = pd.read_csv("test.csv")

In [10]:
# Peek at the first five training rows.
train_df.head(n=5)

Unnamed: 0,label,content
2091,D07,cho ham so y = f left x right xac dinh tren ma...
7601,H06,cho hinh chop s.abc co day abc la tam giac vuo...
8377,H05,cho hinh chop s.abc co day abc vuong o c va bc...
7259,D09,tim int frac dx x^0 left x + 0 right dx answer...
6614,H03,cho s.abcd . sa = a . day la hinh vuong canh a...


In [11]:
# Peek at the first five test rows.
test_df.head(n=5)

Unnamed: 0,label,content
5134,H05,cho lang tru deu abc.a'b'c' .canh ben bang can...
6576,D02,xep 0 nguoi vao 0 ban tron khong danh so. hoi ...
2686,H03,cho hinh chop s.abcd overrightarrow sa = overr...
2218,H07,cho mat cau s left x + 0 right ^0 + left y + 0...
2880,H02,khang dinh nao sau day la sai ? answers: voi 0...


In [12]:
# Stack the two splits back into a single frame, training rows first; each row
# keeps the index label it had before the split.
# train_df and test_df are still used by later cells, so they are not deleted.
df = pd.concat([train_df, test_df])

Tạo feature TFIDF đơn giản

In [25]:
# TF-IDF over word 2-grams and 3-grams. Documents are tokenised by splitting on
# single spaces; n-grams seen in fewer than 10 documents or in more than 20% of
# the documents are dropped.
tfidf = TfidfVectorizer(
    tokenizer=lambda doc: doc.split(" "),
    ngram_range=(2, 3),
    min_df=10,
    max_df=0.2,
    use_idf=True,
    dtype=np.float32,
    # Disabled options kept from the original cell:
    #   max_features=2000,
    #   sublinear_tf=True,
    #   token_pattern=r'(?u)\b\w\w+__\([\w\s]*\)'
)

In [26]:
# Learn the vocabulary and IDF weights from the training text only, then apply
# the same fitted vectorizer to the test text.
train_text, test_text = train_df['content'], test_df['content']
X_train_tfidf = tfidf.fit_transform(train_text)
X_test_tfidf = tfidf.transform(test_text)

In [27]:
# Every column that is not an identifier, the raw text or the target is treated
# as a ready-made ("static") feature.
EXCLUED_COLS = ['id', 'content', 'label']
static_cols = [col for col in train_df.columns if col not in EXCLUED_COLS]
X_train_static, X_test_static = (
    frame[static_cols].values for frame in (train_df, test_df)
)

In [32]:
# Final design matrices: TF-IDF columns followed by the static columns, stored
# in CSR format.
X_train, X_test = (
    hstack([text_part, csr_matrix(static_part)]).tocsr()
    for text_part, static_part in (
        (X_train_tfidf, X_train_static),
        (X_test_tfidf, X_test_static),
    )
)

In [33]:
# Target arrays: the 'label' column of each split.
y_train, y_test = (frame['label'].values for frame in (train_df, test_df))

In [34]:
# Sanity check: shapes of the train matrix, test matrix and train targets.
tuple(part.shape for part in (X_train, X_test, y_train))

((7863, 9650), (1966, 9650), (7863,))

In [35]:
# svd = TruncatedSVD(n_components=250)
# X_train = svd.fit_transform(X_train)
# X_test = svd.transform(X_test)

In [36]:
# Display the shape of the training feature matrix.
X_train.shape

(7863, 9650)

# Ensemble method

In [37]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.multiclass import unique_labels
from imblearn.ensemble import BalancedBaggingClassifier

In [21]:
# Map every label to an integer code, numbered in order of first appearance in
# the training labels.
#
# The mapping and both encoded target lists are derived from the untouched
# 'label' columns rather than from y_train / y_test themselves, so re-running
# this cell gives the same result instead of re-encoding already-encoded
# values and losing the label names.
#
# NOTE: later cells that fit on y_train will then predict these integer codes;
# invert classes_encoded to get the label names back.
classes_encoded = {
    label: code for code, label in enumerate(pd.unique(train_df['label']))
}

# A test label that never occurs in the training split raises KeyError here.
y_train = [classes_encoded[label] for label in train_df['label']]
y_test = [classes_encoded[label] for label in test_df['label']]
classes_encoded

{'D07': 0,
 'H06': 1,
 'H05': 2,
 'D09': 3,
 'H03': 4,
 'H04': 5,
 'D06': 6,
 'D01': 7,
 'D04': 8,
 'H07': 9,
 'D05': 10,
 'H01': 11,
 'D02': 12,
 'D10': 13,
 'D08': 14,
 'D03': 15,
 'H02': 16,
 'H08': 17}

In [38]:
from sklearn.model_selection import cross_val_predict
# Base estimators for the ensemble; the commented-out entries are alternatives
# that are currently disabled.
models = [
#     lgb.LGBMClassifier(objective='multiclass', verbose=-1, learning_rate=0.01, max_depth=10, num_leaves=50, n_estimators=300, max_bin=4000,),
#     RandomForestClassifier (n_estimators=100, max_depth=7, max_features=0.8, random_state=112),
#     ExtraTreesClassifier (n_estimators=100, max_depth=5, max_features=0.7, random_state=1),
#     BalancedBaggingClassifier(n_estimators=100, max_samples=0.7, max_features=0.7, n_jobs=-1, random_state=42),
    # probability=True is needed for soft voting; its internal calibration
    # shuffles the data, so a seed is set to keep the probabilities reproducible.
    SVC(C=5.0, gamma=0.05, probability=True, random_state=110),
#     KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    # One hidden layer of 100 units; written as a real 1-tuple ("(100)" is just
    # the integer 100).
    MLPClassifier(hidden_layer_sizes=(100,), solver='adam', random_state=111),
    LogisticRegression(C=5.0, n_jobs=-1, solver='lbfgs', random_state=112),
#     LinearSVC(C=1.0, random_state=111)
]

In [39]:
# Fit every base estimator on its own and report its per-class test metrics.
for model in models:
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    # Head each report with the estimator's class name so the reports can be
    # told apart in the output.
    print(type(model).__name__)
    print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         D01       0.94      0.90      0.92       145
         D02       0.98      0.96      0.97        99
         D03       0.92      0.93      0.92        58
         D04       0.98      0.85      0.91        61
         D05       0.94      0.97      0.95       135
         D06       0.92      0.77      0.84        87
         D07       0.87      0.96      0.91       306
         D08       0.87      0.89      0.88       131
         D09       0.95      0.89      0.92       150
         D10       0.98      0.98      0.98        87
         H01       0.88      0.88      0.88        57
         H02       0.89      0.82      0.85        39
         H03       0.81      0.87      0.84       141
         H04       0.75      0.71      0.73        73
         H05       0.72      0.85      0.78        93
         H06       0.72      0.75      0.74       122
         H07       0.83      0.80      0.81       147
         H08       0.86    



              precision    recall  f1-score   support

         D01       0.93      0.92      0.93       145
         D02       0.98      0.98      0.98        99
         D03       0.92      0.97      0.94        58
         D04       0.98      0.87      0.92        61
         D05       0.94      0.98      0.96       135
         D06       0.95      0.79      0.86        87
         D07       0.90      0.95      0.92       306
         D08       0.91      0.93      0.92       131
         D09       0.98      0.93      0.95       150
         D10       0.98      0.99      0.98        87
         H01       0.89      0.88      0.88        57
         H02       0.88      0.92      0.90        39
         H03       0.84      0.89      0.87       141
         H04       0.79      0.71      0.75        73
         H05       0.71      0.81      0.75        93
         H06       0.77      0.75      0.76       122
         H07       0.85      0.82      0.83       147
         H08       0.78    

In [40]:
# Soft-voting ensemble over the base estimators. `names` must list one name per
# entry of `models`, in the same order.
names = ['SVM', 'MLP', 'LR']
named_models = [(names[pos], estimator) for pos, estimator in enumerate(models)]
vm = VotingClassifier(estimators=named_models, voting='soft', n_jobs=-1)

vm.fit(X_train, y_train)

# Per-class metrics of the ensemble on the held-out split.
pred = vm.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         D01       0.96      0.92      0.94       145
         D02       0.98      0.98      0.98        99
         D03       0.92      0.98      0.95        58
         D04       0.98      0.89      0.93        61
         D05       0.99      0.98      0.98       135
         D06       0.92      0.91      0.91        87
         D07       0.91      0.96      0.93       306
         D08       0.90      0.93      0.92       131
         D09       0.98      0.91      0.94       150
         D10       0.98      1.00      0.99        87
         H01       0.86      0.89      0.88        57
         H02       0.92      0.90      0.91        39
         H03       0.85      0.90      0.87       141
         H04       0.82      0.75      0.79        73
         H05       0.70      0.80      0.75        93
         H06       0.78      0.74      0.76       122
         H07       0.86      0.82      0.84       147
         H08       0.77    

In [41]:
from sklearn.pipeline import Pipeline

In [42]:
# Chain the vectorizer and the voting ensemble so raw text can be classified in
# one call. Both objects were already fitted in earlier cells; the pipeline is
# only used for prediction and is not fitted again.
# NOTE(review): vm was fitted on X_train, which also appends the static
# columns; the pipeline feeds it TF-IDF columns only, so this relies on the
# static block being empty - confirm.
qpipe = Pipeline(steps=[('fe', tfidf), ('clf', vm)])

In [43]:
# The training text shown above is normalised (ASCII only, lower-case, numbers
# collapsed to 0). The query is therefore passed in the same normalised form;
# words that still carry diacritics would not match any learned n-gram.
qpipe.predict(["cho phuong trinh 0^ x^0 + 0 0^ x^0 = 0^ frac x 0 + 0 . tich cac gia tri cua x la"])

array(['D08'], dtype=object)

In [44]:
import pickle

In [46]:
# Persist the whole pipeline. The file is opened in a `with` block so it is
# flushed and closed before any later cell reads it back.
with open('qmodel.pkl', 'wb') as model_file:
    dill.dump(qpipe, model_file)

In [47]:
# Reload the pipeline from disk; the `with` block closes the file handle.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('qmodel.pkl', 'rb') as model_file:
    qpipe = dill.load(model_file)

In [48]:
# Save the two pipeline steps separately: steps[0] is the vectorizer and
# steps[1] the classifier. Each file is closed as soon as it is written.
with open('tfidf.pkl', 'wb') as tfidf_file:
    dill.dump(qpipe.steps[0][1], tfidf_file)
with open('clf.pkl', 'wb') as clf_file:
    dill.dump(qpipe.steps[1][1], clf_file)

In [102]:
import time

from datasketch import MinHash, MinHashLSHForest
from datasketch.hashfunc import sha1_hash64

In [228]:
def get_forest(data, perms):
    """Build and index a MinHash LSH forest over a collection of texts.

    Parameters
    ----------
    data : iterable of str
        Texts to index. Each text is split on single spaces and every
        resulting token (UTF-8 encoded) is fed into that text's MinHash.
    perms : int
        Number of permutations, used for every MinHash and for the forest.

    Returns
    -------
    MinHashLSHForest
        The indexed forest. Each text is stored under its 0-based position
        in ``data``.

    Prints the elapsed build time.
    """
    started = time.time()

    def _signature(text):
        # One MinHash per text, updated token by token.
        sig = MinHash(num_perm=perms)
        for token in text.split(" "):
            sig.update(token.encode('utf8'))
        return sig

    signatures = [_signature(text) for text in data]

    forest = MinHashLSHForest(num_perm=perms)
    for position, sig in enumerate(signatures):
        forest.add(position, sig)

    # The forest cannot be queried until index() has been called.
    forest.index()

    print('It took %s seconds to build forest.' % (time.time() - started))

    return forest

In [229]:
# Index every question text (train and test rows) with 512 permutations.
mm = get_forest(data=df['content'], perms=512)

It took 28.530836820602417 seconds to build forest.


In [260]:
def predict(text, database, perms, num_results, forest, verbose=True):
    """Look up the rows of ``database`` most similar to ``text`` in an LSH forest.

    Parameters
    ----------
    text : str
        Query text. It is split on single spaces and hashed token by token,
        the same way ``get_forest`` hashes the indexed texts.
    database : pandas.DataFrame
        Frame whose row positions correspond to the keys stored in ``forest``;
        rows are selected positionally with ``iloc``.
    perms : int
        Number of permutations for the query MinHash.
        NOTE(review): presumably must equal the value the forest was built
        with - confirm against the datasketch documentation.
    num_results : int
        Number of results requested from ``forest.query``.
    forest : MinHashLSHForest
        Indexed forest to query.
    verbose : bool, default True
        When True (the previous behaviour) the matched positions and the
        elapsed time are printed. Pass False to use the function silently.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows, or None when the forest returns no candidates.
    """
    start_time = time.time()

    query_hash = MinHash(num_perm=perms)
    for token in text.split(" "):
        query_hash.update(token.encode('utf8'))

    idx_array = np.array(forest.query(query_hash, num_results))
    if len(idx_array) == 0:
        return None  # the forest returned no candidates for this query

    if verbose:
        print(idx_array)

    # Keys in the forest are row positions, hence iloc rather than loc.
    result = database.iloc[idx_array]

    if verbose:
        print('It took %s seconds to query forest.' % (time.time() - start_time))

    return result

In [261]:
import re
from unidecode import unidecode

def cleaner(text):
    """Normalise a raw question string.

    Steps, in order (surrounding spaces, tabs and newlines are trimmed before
    each regex step and at the end):

    1. transliterate with ``unidecode`` and lower-case;
    2. turn backslashes into spaces and ``-`` into ``+``;
    3. collapse every run of digits into a single ``0``;
    4. replace brackets, braces and parentheses with spaces;
    5. drop a ``.`` that follows a non-digit character and precedes a digit;
    6. strip a fixed set of HTML / image-placeholder fragments;
    7. squeeze runs of spaces into one space.

    Returns the cleaned string.
    """
    blanks = ' \n\r\t'

    cleaned = unidecode(text).lower().strip(blanks)
    cleaned = cleaned.replace('\\', ' ').strip(blanks).replace('-', '+')

    # Applied strictly in this order; trimming before each step matters because
    # the lookbehind in the third pattern needs a preceding character.
    substitutions = (
        (r'[0-9]+', '0'),
        (r'[\[\]}{()]', ' '),
        (r'(?<=[^0-9])\.(?=\d+)', ''),
        (r'\|<\|0+0a+0+image0\|>\||\\|<\/font>|<font|color="#[0-9]+">|<img src="[^>]+" \/>|<i>|</i>|<br| \/>', ''),
        (r'[ ]{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned.strip(blanks))

    return cleaned.strip(blanks)

In [262]:
# Normalise a sample query the same way as the indexed texts.
# (The former first line read from `db`, a name that is never defined in this
# notebook, and its value was overwritten immediately, so it has been removed.)
v = cleaner("Hai cắt thẳng cắt nhau là 2 đường thẳng đồng phẳng. answers: Đúng | Sai")
v

'hai cat thang cat nhau la 0 duong thang dong phang. answers: dung | sai'

In [263]:
# Fetch the 5 nearest questions for the cleaned query from the forest.
result = predict(text=v, database=df, perms=512, num_results=5, forest=mm)
print('\n Top Recommendation(s) is(are) \n', result)

[ 682 8235  978  472   93]
It took 0.012655019760131836 seconds to query forest.

 Top Recommendation(s) is(are) 
      label                                            content
5244   H01  hai duong thang cheo nhau la 0 duong thang kho...
3555   H01  hai duong thang cat nhau la 0 duong thang dong...
4744   H06  trung diem cac canh cua mot tu dien deu la cac...
4769   H06  tam cac mat cua mot hinh lap phuong la cac din...
581    H06  tam cac mat cua mot hinh tu dien deu la cac di...


In [243]:
# Full text of the row at position 682 (a position returned by the forest).
df['content'].iloc[682]

'hai duong thang cheo nhau la 0 duong thang khong dong phang. answers: dung | sai'