# **Prepare Data**

**Import deps**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
from os import path
import json
# from pandas import json_normalize
from sklearn.utils import shuffle
import fastai
from fastai.text import *
from fastai.callbacks import *
from sklearn.model_selection import train_test_split


In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

**Constants**

In [None]:
# Per-label sample threshold used by the commented-out get_micro_df /
# sampling_k_elements helpers further down.
MIN_AMOUNT = 41 # EXPERIMENT WITH THIS
# Fraction of rows used to compute the train/validation split index. Rows
# before the index become the training set, so despite the name this is the
# training share.
VAL_PERC = 0.8
# NOTE(review): not referenced by any visible cell - confirm it is still needed.
MICRO_DS = 9
# Intended shuffle switch.
# NOTE(review): confirm the cells that shuffle data actually consult it.
DO_SHUFFLE = True
# Batch size passed to the classifier databunch.
BS = 16
# Label set passed to the commented-out get_micro_df call.
LABELS = ['PAY&BENEFITS', 'MANAGEMENT', 'WORKPLACE']
# Column names for the text and label columns of the working dataframes.
X_COL = 'text'
Y_COL = 'labels'
# Delimiter used when joining several labels into one string.
SEPARATOR = '|'

**load dataset for LM (language model)**

In [None]:
# Load the raw JSON export that feeds the language-model dataset.
# The location is one absolute path, so wrapping it in path.join added nothing.
# The encoding is pinned to UTF-8 so parsing does not depend on the locale.
lm_set_path = '/kaggle/input/ml-json/ml_class.json'
with open(lm_set_path, encoding='utf-8') as file:
    data = json.load(file)

**Normalize LM dataset**

In [None]:
# Flatten the nested JSON records into a table. Newer pandas exposes
# json_normalize at the top level and deprecates the pd.io.json location,
# so prefer the former and fall back to the latter on older installs.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
norm_df = _json_normalize(data['data'])

# Take an explicit copy: renaming and assigning columns on a slice of norm_df
# would otherwise raise pandas' SettingWithCopy warning.
lm_df = norm_df[['content', 'annotation.labels']].copy()
lm_df.columns = [X_COL, Y_COL]

# Collapse each row's labels into one SEPARATOR-delimited string
# (assumes 'annotation.labels' holds a list per row - TODO confirm).
lm_df[Y_COL] = lm_df[Y_COL].str.join(sep=SEPARATOR)

# Put the label column first, then shuffle the rows.
lm_df = lm_df[[Y_COL, X_COL]]
lm_df = shuffle(lm_df)
print(lm_df.head(5))

**Load dataset for Classifier** (we created a separate labeled dataset with 120 samples, which will be split into train and validation sets)

In [None]:
# Read the separately labelled classifier samples and preview the first rows.
classifier_csv = '/kaggle/input/smdata2/micro1.csv'
class_set = pd.read_csv(classifier_csv)
preview = class_set.head()
print(preview)

In [None]:
# def sampling_k_elements(group, k=MIN_AMOUNT):
#     if len(group) < k:
#         return group
#     return group.sample(k)


# def get_micro_df(df, labels):
#     # drop labels that have MIN_AMOUNT samples or fewer
#     g = df.groupby(Y_COL)
#     new_df = g.filter(lambda x: len(x) > MIN_AMOUNT)
    
#     # get just defined list of labels     
#     new_df=new_df[new_df[Y_COL].isin(labels)]
    
#     print('Value counts \n', new_df[Y_COL].value_counts())
#     # get balanced training micro set
#     balanced = new_df.groupby(Y_COL).apply(sampling_k_elements).reset_index(drop=True)
#     # return balanced
#     return balanced

# micro_df = get_micro_df(lm_df, LABELS)

In [None]:

# Previously the classifier training set came from get_micro_df(lm_df, LABELS),
# but the original dataset did not have enough reliable multi-label samples,
# so the separately labelled CSV is used instead.
micro_df = class_set

# Honour the DO_SHUFFLE switch instead of shuffling unconditionally.
if DO_SHUFFLE:
    micro_df = shuffle(micro_df)

# Split index: VAL_PERC of the rows, plus one, go to the training set, so the
# split is roughly (not exactly) 80/20 and the training set is never empty.
split_v = int(VAL_PERC * len(micro_df)) + 1
print('spliter:', split_v)


print('micro_df.shape:', micro_df.shape)
# Rows before the index are training data; the remainder is validation data.
df_train, df_valid = micro_df[:split_v], micro_df[split_v:]
print('df_train value counts \n',df_train[Y_COL].value_counts())

In [None]:
# Print the validation split produced by the train/validation slicing.
print(df_valid)

**Generate LM databunch**

In [None]:
# Build the language-model databunch: the full LM dataframe is the training
# set and the classifier's validation split doubles as LM validation data.
# The working directory is the root path (as for the classifier databunch),
# so learner artifacts are not written to the filesystem root.
data_lm = TextLMDataBunch.from_df(path='./',
                                  train_df=lm_df,
                                  valid_df=df_valid,
                                  label_cols=Y_COL,
                                  text_cols=[X_COL]
                                  )
print('vocab-----------------------------------------------------------')
print(data_lm.vocab)
print(data_lm)

In [None]:
# Summarise the dataframes built so far.
print('micro df info')
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line.
micro_df.info()
# This line reports the shape, so label it as such.
print('micro df shape', micro_df.shape)
print('df_train', df_train.shape)
print('df_val', df_valid.shape)
print('lm_df shape', lm_df.shape)

In [None]:
# Build the classifier databunch from the train/validation split. It reuses
# the LM vocabulary so token ids line up with the language model, and splits
# multi-label strings on the shared SEPARATOR constant rather than a second
# hard-coded copy of the delimiter.
data_class = TextClasDataBunch.from_df('./', train_df=df_train, valid_df=df_valid, vocab=data_lm.vocab, bs=BS,
                                       label_cols=Y_COL,
                                       text_cols=X_COL,
                                       label_delim=SEPARATOR
                                      )


# **View clas databunch**

In [None]:
# Display a sample batch from the classifier databunch (up to 10 rows requested).
data_class.show_batch(rows=10)

# **Create LM learner**

In [None]:
# Create the language-model learner on the LM databunch with the AWD_LSTM
# architecture and a dropout multiplier of 0.5.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)
# Run the learning-rate finder, then plot its curve and ask for a suggested rate.
learn.lr_find()
learn.recorder.plot(suggestion=True)

# **Train LM Learner**

In [None]:
# Train the language model with the one-cycle schedule: 4 cycles at a peak
# learning rate of 1e-1.
# NOTE(review): 1e-1 is hard-coded rather than taken from the lr_find plot -
# confirm it matches the suggested rate.
learn.fit_one_cycle(4, 1e-1)

In [None]:
# Sanity-check the language model by generating 100 words after a prompt.
print('LM Predict next words:')
print(learn.predict('very challenging in terms', n_words=100))

In [None]:
# Second sanity check: generate 10 words after a longer prompt.
print(learn.predict('at first I thought it will be temporary but', n_words=10))

# **Create Classifier Learner**

In [None]:
# Persist the encoder of the language model trained above so the classifier
# can reuse it. Without this hand-off the classifier starts from the stock
# pretrained AWD_LSTM weights and the LM training has no effect on it.
ENCODER_NAME = 'ft_enc'
learn.save_encoder(ENCODER_NAME)

# Multi-label text classifier on the classifier databunch, scored with fbeta.
cl_learn = text_classifier_learner(data_class, AWD_LSTM, drop_mult=0.3, metrics=[fbeta])

# The two learners may be rooted at different paths, so pass an absolute
# location: load_encoder joins the name onto cl_learn's model directory, and
# an absolute path replaces that prefix.
encoder_file = (learn.path / learn.model_dir / ENCODER_NAME).resolve()
cl_learn.load_encoder(str(encoder_file))

In [None]:
# Run the learning-rate finder on the classifier, then plot its curve and ask
# for a suggested rate.
cl_learn.lr_find()
cl_learn.recorder.plot(suggestion=True)


# **Train Classifier Learner**

## Test 3 (experiment with lr)--------------------------------------

In [None]:
# Staged fine-tuning of the classifier. Each stage runs one-cycle training
# with learning rates spread from lr/(2.6**4) up to lr; most stages then
# overwrite the same checkpoint.
_CHECKPOINT = 'clas_fine_tuned'


def _run_cycle(n_epochs, peak_lr):
    """Run one one-cycle stage on cl_learn with the shared momentum settings."""
    cl_learn.fit_one_cycle(n_epochs, slice(peak_lr/(2.6**4), peak_lr), moms=(0.8,0.7))


def _probe_lr():
    """Run the learning-rate finder on cl_learn and plot the result."""
    cl_learn.lr_find()
    cl_learn.recorder.plot()


# Stage 1: a single epoch at the higher rate on the learner as created.
lr = 1e-1
_run_cycle(1, lr)
cl_learn.save(_CHECKPOINT)

# Stages 2-4: change which layer groups are trainable, probe the learning
# rate, train at the lower rate, then checkpoint.
lr = 1e-2
_stages = (
    (lambda: cl_learn.freeze_to(-2), 2),
    (lambda: cl_learn.freeze_to(-3), 2),
    (lambda: cl_learn.unfreeze(), 3),
)
for _set_trainable, _n_epochs in _stages:
    _set_trainable()
    _probe_lr()
    _run_cycle(_n_epochs, lr)
    cl_learn.save(_CHECKPOINT)

# Reload the checkpoint just written and train a further 4 epochs, saving after.
cl_learn.load(_CHECKPOINT)
_run_cycle(4, lr)
cl_learn.save(_CHECKPOINT)

# Final 4 epochs; this last stage is not checkpointed.
_run_cycle(4, lr)


pred = cl_learn.predict("good working team and management")

pred

**-------------------------------------------------------------------------**

In [None]:
# Plot the losses recorded by the classifier learner's recorder.
cl_learn.recorder.plot_losses()

# **Test Classifier Learner**

In [None]:
# Spot-check the classifier on a sentence that touches several labels.
print(cl_learn.predict("I want more money and company gives out a very good benefits and good  team"))
# Show target vs predicted labels. The row count must be passed by keyword:
# the first positional parameter of show_results is the dataset type, so
# show_results(100) did not request 100 rows. The call renders its own output
# and returns None, so it is not wrapped in print().
cl_learn.show_results(rows=100)
# cl_learn.export('export.pkl')

In [None]:
# Spot-check the classifier on a management-themed sentence.
print(cl_learn.predict("good management team at the company"))