In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import hstack, csr_matrix, vstack

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, train_test_split
from sklearn.metrics import f1_score

from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import *
from sklearn.semi_supervised import *

from tqdm import *

import matplotlib.pyplot as plt
import gc
import dill

import lightgbm as lgb
%matplotlib inline

In [2]:
df = pd.read_csv("questions.csv").drop(['answers_encoded'], axis=1)
df.columns = ['label', 'content']
df = df.sample(frac=1).reset_index(drop=True)
df.content[0]

'cho hinh chop s.abcd co day abcd la hinh binh hanh. gia su m thuoc doan thang sb. mat phang left adm right cat hinh chop s.abcd theo thiet dien la hinh gi? answers: hinh tam giac. | hinh thang. | hinh binh hanh | hinh chu nhat.'

In [3]:
df['label'].unique()

array(['H03', 'D07', 'D02', 'D09', 'D05', 'H06', 'H01', 'D06', 'H02',
       'D01', 'H07', 'D04', 'D10', 'D08', 'H08', 'D03', 'H04', 'H05'],
      dtype=object)

In [4]:
len(df)

9829

In [5]:
df.groupby(['label']).count()

Unnamed: 0_level_0,content
label,Unnamed: 1_level_1
D01,727
D02,494
D03,293
D04,305
D05,675
D06,436
D07,1529
D08,655
D09,749
D10,436


In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=0)

Load train và test data

In [7]:
# Load data
# train_df = pd.read_csv("train.csv")
# test_df = pd.read_csv("test.csv")
train_df['label'].count()

7863

In [8]:
test_df['label'].count()

1966

In [9]:
train_df.head()

Unnamed: 0,label,content
2183,D07,tim m de phuong trinh: sqrt 0x + x^0 = m co 0 ...
7477,H06,"mot mieng ton hinh chu nhat co chieu dai 0cm ,..."
8330,H05,cho hinh chop s.abcd co sa bot left abc text d...
7388,D09,cho f left x right la mot nguyen ham cua f lef...
6487,H03,cho hinh chop s.abcd. sa ? day.day la hinh vuo...


In [10]:
test_df.head()

Unnamed: 0,label,content
4868,H05,cho hinh chop s.abc co left sbc right bot left...
6450,D02,0 lop co 0 ban. hoi co bao nhieu cach chia lop...
2542,H03,"cho tu dien abcd va ba diem p, , ,q, , ,r lan ..."
2287,H07,"cho left p right :x + y + z + 0 = 0, left q ri..."
2736,H02,"trong mat phang oxy , cho overrightarrow v = l..."


In [11]:
df = pd.concat([train_df, test_df], axis=0)
# del train_df, test_df
# gc.collect()
df.shape

(9829, 2)

Tạo feature TFIDF đơn giản

In [12]:
tfidf = TfidfVectorizer(
    dtype=np.float32,
    min_df = 10, 
    max_df = 0.2, 
#     max_features=2000,
    ngram_range=(2,3),
    use_idf=True,
#     sublinear_tf=True,
    tokenizer=lambda x: x.split(" "),
#     token_pattern=r'(?u)\b\w\w+__\([\w\s]*\)'
)

In [31]:
type(tfidf)

sklearn.feature_extraction.text.TfidfVectorizer

In [13]:
X_train_tfidf = tfidf.fit_transform(train_df['content'])
X_test_tfidf = tfidf.transform(test_df['content'])

In [14]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [15]:
EXCLUED_COLS = ['id', 'content', 'label']
static_cols = [c for c in train_df.columns if not c in EXCLUED_COLS]
X_train_static = train_df[static_cols].values
X_test_static = test_df[static_cols].values

In [30]:
static_cols

[]

In [16]:
X_train = hstack([X_train_tfidf, csr_matrix(X_train_static)]).tocsr()
X_test = hstack([X_test_tfidf, csr_matrix(X_test_static)]).tocsr()

In [19]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [21]:
type(csr_matrix(X_train_static))

scipy.sparse.csr.csr_matrix

In [22]:
type(hstack([X_train_tfidf, csr_matrix(X_train_static)]))

scipy.sparse.coo.coo_matrix

In [17]:
X_train

<7863x9631 sparse matrix of type '<class 'numpy.float64'>'
	with 455983 stored elements in Compressed Sparse Row format>

In [24]:
y_train = train_df['label'].values
y_test = test_df['label'].values

In [25]:
X_train.shape, X_test.shape, y_train.shape

((7863, 9631), (1966, 9631), (7863,))

In [35]:
# svd = TruncatedSVD(n_components=250)
# X_train = svd.fit_transform(X_train)
# X_test = svd.transform(X_test)

In [26]:
X_train.shape

(7863, 9631)

# Ensemble method

In [27]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.multiclass import unique_labels
# from imblearn.ensemble import BalancedBaggingClassifier

In [28]:
classes_encoded = {}
index = 0
for y in y_train:
    if y not in classes_encoded:
        classes_encoded[y] = index
        index += 1

y_train = [classes_encoded[y] for y in y_train]
y_test = [classes_encoded[y] for y in y_test]
classes_encoded

{'D07': 0,
 'H06': 1,
 'H05': 2,
 'D09': 3,
 'H03': 4,
 'H04': 5,
 'D06': 6,
 'D01': 7,
 'D04': 8,
 'H07': 9,
 'D05': 10,
 'H01': 11,
 'D02': 12,
 'D10': 13,
 'D08': 14,
 'D03': 15,
 'H02': 16,
 'H08': 17}

In [29]:
y_train

[0,
 1,
 2,
 3,
 4,
 5,
 3,
 6,
 7,
 0,
 1,
 4,
 3,
 8,
 5,
 9,
 10,
 1,
 3,
 4,
 0,
 11,
 0,
 7,
 4,
 1,
 9,
 6,
 10,
 3,
 3,
 6,
 5,
 5,
 3,
 9,
 1,
 12,
 6,
 1,
 11,
 10,
 0,
 9,
 0,
 7,
 9,
 3,
 9,
 13,
 9,
 0,
 4,
 14,
 10,
 1,
 4,
 2,
 9,
 0,
 1,
 5,
 12,
 10,
 0,
 6,
 15,
 3,
 9,
 0,
 14,
 12,
 0,
 8,
 14,
 6,
 14,
 11,
 4,
 12,
 4,
 3,
 14,
 0,
 0,
 1,
 13,
 0,
 0,
 4,
 10,
 11,
 0,
 1,
 6,
 2,
 10,
 3,
 5,
 0,
 3,
 5,
 4,
 9,
 11,
 6,
 9,
 0,
 16,
 15,
 0,
 3,
 5,
 4,
 15,
 0,
 5,
 0,
 7,
 6,
 1,
 0,
 16,
 1,
 14,
 2,
 6,
 6,
 2,
 4,
 2,
 7,
 9,
 7,
 0,
 5,
 12,
 13,
 10,
 7,
 0,
 4,
 8,
 13,
 7,
 9,
 1,
 0,
 2,
 10,
 11,
 4,
 3,
 10,
 0,
 14,
 6,
 12,
 5,
 8,
 0,
 0,
 15,
 5,
 10,
 3,
 6,
 9,
 2,
 9,
 0,
 3,
 9,
 14,
 4,
 4,
 3,
 4,
 0,
 13,
 3,
 0,
 3,
 0,
 1,
 1,
 12,
 9,
 7,
 1,
 5,
 7,
 11,
 7,
 10,
 0,
 3,
 12,
 5,
 4,
 12,
 12,
 0,
 13,
 4,
 13,
 0,
 13,
 16,
 6,
 15,
 14,
 14,
 12,
 5,
 9,
 7,
 1,
 11,
 12,
 7,
 6,
 7,
 4,
 7,
 9,
 8,
 9,
 3,
 14,
 7,
 13,
 15,
 11,
 0

In [31]:
from sklearn.model_selection import cross_val_predict
models = [
#     lgb.LGBMClassifier(objective='multiclass', verbose=-1, learning_rate=0.01, max_depth=10, num_leaves=50, n_estimators=300, max_bin=4000,),
#     RandomForestClassifier (n_estimators=100, max_depth=7, max_features=0.8, random_state=112),
#     ExtraTreesClassifier (n_estimators=100, max_depth=5, max_features=0.7, random_state=1),
#     BalancedBaggingClassifier(n_estimators=100, max_samples=0.7, max_features=0.7, n_jobs=-1, random_state=42),
    SVC(C=5.0, gamma=0.05, probability=True),
#     KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    MLPClassifier(hidden_layer_sizes=(100), solver='adam', random_state=111),
    LogisticRegression(C=5.0, n_jobs=-1, solver='lbfgs', random_state=112),
#     LinearSVC(C=1.0, random_state=111)
]

In [32]:
for model in models:
    model.fit(X_train, y_train)
    
    pred = model.predict(X_test)
    print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       306
           1       0.72      0.77      0.74       122
           2       0.72      0.80      0.76        93
           3       0.97      0.93      0.95       150
           4       0.77      0.83      0.80       141
           5       0.78      0.82      0.80        73
           6       0.90      0.82      0.86        87
           7       0.94      0.94      0.94       145
           8       0.92      0.89      0.90        61
           9       0.82      0.78      0.80       147
          10       0.94      0.98      0.96       135
          11       0.79      0.72      0.75        57
          12       0.96      0.96      0.96        99
          13       0.99      0.99      0.99        87
          14       0.89      0.85      0.87       131
          15       0.94      0.84      0.89        58
          16       0.92      0.90      0.91        39
          17       0.67    



              precision    recall  f1-score   support

           0       0.91      0.95      0.93       306
           1       0.78      0.75      0.76       122
           2       0.73      0.83      0.77        93
           3       0.97      0.94      0.96       150
           4       0.80      0.84      0.82       141
           5       0.84      0.77      0.80        73
           6       0.91      0.85      0.88        87
           7       0.95      0.97      0.96       145
           8       0.92      0.95      0.94        61
           9       0.83      0.80      0.82       147
          10       0.96      0.99      0.97       135
          11       0.75      0.75      0.75        57
          12       0.97      0.96      0.96        99
          13       0.99      1.00      0.99        87
          14       0.91      0.88      0.89       131
          15       0.96      0.91      0.94        58
          16       0.88      0.95      0.91        39
          17       0.68    

In [40]:
names = ['SVM', 'MLP', 'LR']
vm = VotingClassifier(estimators=[(names[i], m) for i, m in enumerate(models)], n_jobs=-1, voting='soft')

vm.fit(X_train, y_train)

pred = vm.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         D01       0.96      0.92      0.94       145
         D02       0.98      0.98      0.98        99
         D03       0.92      0.98      0.95        58
         D04       0.98      0.89      0.93        61
         D05       0.99      0.98      0.98       135
         D06       0.92      0.91      0.91        87
         D07       0.91      0.96      0.93       306
         D08       0.90      0.93      0.92       131
         D09       0.98      0.91      0.94       150
         D10       0.98      1.00      0.99        87
         H01       0.86      0.89      0.88        57
         H02       0.92      0.90      0.91        39
         H03       0.85      0.90      0.87       141
         H04       0.82      0.75      0.79        73
         H05       0.70      0.80      0.75        93
         H06       0.78      0.74      0.76       122
         H07       0.86      0.82      0.84       147
         H08       0.77    

In [41]:
from sklearn.pipeline import Pipeline

In [42]:
qpipe = Pipeline([('fe', tfidf), ('clf', vm)])

In [43]:
qpipe.predict(["cho phương trình 0^ x^0 + 0 0^ x^0 = 0^ frac x 0 + 0 . tích các giá trị của x là"])

array(['D08'], dtype=object)

In [44]:
import pickle

In [46]:
dill.dump(qpipe, open('qmodel.pkl', 'wb'))

In [47]:
qpipe = dill.load(open('qmodel.pkl', 'rb'))

In [48]:
dill.dump(qpipe.steps[0][1], open('tfidf.pkl', 'wb'))
dill.dump(qpipe.steps[1][1], open('clf.pkl', 'wb'))

In [102]:
import time

from datasketch import MinHash, MinHashLSHForest
from datasketch.hashfunc import sha1_hash64

In [228]:
def get_forest(data, perms):
    start_time = time.time()

    minhash = []

    for text in data:
        tokens = text.split(" ")
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)

    forest = MinHashLSHForest(num_perm=perms)

    for i, m in enumerate(minhash):
        forest.add(i, m)

    forest.index()

    print('It took %s seconds to build forest.' % (time.time() - start_time))

    return forest

In [229]:
mm = get_forest(df['content'], 512)

It took 28.530836820602417 seconds to build forest.


In [260]:
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = text.split(" ")
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    print(idx_array)
    
    result = database.iloc[idx_array]
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

In [261]:
import re
from unidecode import unidecode

def cleaner(text):
    # normalize
    text = unidecode(text)
#     text = re.sub(r'[^A-Za-z ]', ' ', text.strip(' \n\r\t'))
    text = text.lower().strip(' \n\r\t')
    text = text.replace('\\', ' ').strip(' \n\r\t')
    text = text.replace('-', '+')
    text = re.sub(r'[0-9]+', '0', text.strip(' \n\r\t'))
    text = re.sub(r'[\[\]}{()]', ' ', text.strip(' \n\r\t'))
    text = re.sub(r'(?<=[^0-9])\.(?=\d+)', '', text.strip(' \n\r\t'))
    text = re.sub(r'\|<\|0+0a+0+image0\|>\||\\|<\/font>|<font|color="#[0-9]+">|<img src="[^>]+" \/>|<i>|</i>|<br| \/>', '', text.strip(' \n\r\t'))
    text = re.sub(r'[ ]{2,}', ' ', text.strip(' \n\r\t'))
    return text.strip(' \n\r\t')

In [262]:
v = db['content'].values[0]
v = cleaner("Hai cắt thẳng cắt nhau là 2 đường thẳng đồng phẳng. answers: Đúng | Sai")
v

'hai cat thang cat nhau la 0 duong thang dong phang. answers: dung | sai'

In [263]:
result = predict(v, df, 512, 5, mm)
print('\n Top Recommendation(s) is(are) \n', result)

[ 682 8235  978  472   93]
It took 0.012655019760131836 seconds to query forest.

 Top Recommendation(s) is(are) 
      label                                            content
5244   H01  hai duong thang cheo nhau la 0 duong thang kho...
3555   H01  hai duong thang cat nhau la 0 duong thang dong...
4744   H06  trung diem cac canh cua mot tu dien deu la cac...
4769   H06  tam cac mat cua mot hinh lap phuong la cac din...
581    H06  tam cac mat cua mot hinh tu dien deu la cac di...


In [243]:
df['content'].values[682]

'hai duong thang cheo nhau la 0 duong thang khong dong phang. answers: dung | sai'