In [1]:
import gc
import json
import numpy as np 
import pandas as pd
import re

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm_notebook as tqdm
import Levenshtein 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression

from scipy import spatial
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

/kaggle/input/tensorflow2-question-answering/sample_submission.csv
/kaggle/input/tensorflow2-question-answering/simplified-nq-train.jsonl
/kaggle/input/tensorflow2-question-answering/simplified-nq-test.jsonl


Using TensorFlow backend.


In [2]:
html_tags = ['<P>', '</P>', '<Table>', '</Table>', '<Tr>', '</Tr>', '<Ul>', '<Ol>', '<Dl>', '</Ul>', '</Ol>', \
             '</Dl>', '<Li>', '<Dd>', '<Dt>', '</Li>', '</Dd>', '</Dt>']
r_buf = ['is', 'are', 'do', 'does', 'did', 'was', 'were', 'will', 'can', 'the', 'a', 'an', 'of', 'in', 'and', 'on', \
         'what', 'where', 'when', 'which'] + html_tags

def clean(x):
    x = x.lower()
    for r in r_buf:
        x = x.replace(r, '')
    x = re.sub(' +', ' ', x)
    return x

## Prepairing train dataset

Here we are going to collect data from json files and format it to the tabular data. We will formulate the problem as a binary classification problem and will try to classify if chosen candidate is an answer.

In [3]:
%%time
n_samples = 75000 # Number of samples to read from the train.json

# Read data from train.json and prepare features
ids = []
question_tfidfs = []
answer_tfidfs = []
candidates_str = []
targets = []
targets_str = []
targets_str_short = []
features = []
rank_features = []

with open('/kaggle/input/tensorflow2-question-answering/simplified-nq-train.jsonl', 'r') as json_file:
    cnt = 0
    for line in tqdm(json_file):
        json_data = json.loads(line) 
        
        # TFIDF for document
        stop_words = text.ENGLISH_STOP_WORDS.union(["book"])
        tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words=stop_words)
        tfidf.fit([json_data['document_text']])

        # TFIDF for question
        question = json_data['question_text']
        question_tfidf = tfidf.transform([question]).todense()
        
        # Collect annotations
        start_token_true = json_data['annotations'][0]['long_answer']['start_token']
        end_token_true = json_data['annotations'][0]['long_answer']['end_token']
        
        # Collect short annotations
        if json_data['annotations'][0]['yes_no_answer'] == 'NONE':
            if len(json_data['annotations'][0]['short_answers']) > 0:
                s_ans = str(json_data['annotations'][0]['short_answers'][0]['start_token']) + ':' + \
                    str(json_data['annotations'][0]['short_answers'][0]['end_token'])
            else:
                s_ans = ''
        else:
            s_ans = json_data['annotations'][0]['yes_no_answer']

        cos_d_buf = []
        euc_d_buf = []
        lev_d_buf = []
        
        doc_tokenized = json_data['document_text'].split(' ')
        candidates = json_data['long_answer_candidates']
        candidates = [c for c in candidates if c['top_level'] == True]
        
        if start_token_true != -1:
            for c in candidates:
                ids.append(str(json_data['example_id']))

                # TFIDF for candidate answer
                start_token = c['start_token']
                end_token = c['end_token']
                answer = ' '.join(doc_tokenized[start_token:end_token])
                answer_tfidf = tfidf.transform([answer]).todense()

                # Extract some features
                cos_d = spatial.distance.cosine(question_tfidf, answer_tfidf)
                euc_d = np.linalg.norm(question_tfidf - answer_tfidf)
                lev_d = Levenshtein.distance(clean(question), clean(answer))
                lev_r = Levenshtein.ratio(clean(question), clean(answer))
                jar_s = Levenshtein.jaro(clean(question), clean(answer))
                jaw_s = Levenshtein.jaro_winkler(clean(question), clean(answer))
                tfidf_score = np.sum(question_tfidf*answer_tfidf.T)
                question_tfidf_sum = np.sum(question_tfidf)
                answer_tfidf_sum = np.sum(answer_tfidf)

                features.append([
                    cos_d, 
                    euc_d, 
                    lev_d, 
                    lev_r, 
                    jar_s, 
                    jaw_s, 
                    tfidf_score, 
                    question_tfidf_sum, 
                    answer_tfidf_sum
                ])
                
                cos_d_buf.append(cos_d)
                euc_d_buf.append(euc_d)
                lev_d_buf.append(lev_d)

                targets_str.append(str(start_token_true) + ':' + str(end_token_true))
                candidates_str.append(str(start_token) + ':' + str(end_token))
                targets_str_short.append(s_ans)

                # Get target
                if start_token == start_token_true and end_token == end_token_true:
                    target = 1
                else:
                    target = 0
                targets.append(target)

            rank_cos_d = np.argsort(cos_d_buf)
            rank_euc_d = np.argsort(euc_d_buf)
            rank_lev_d = np.argsort(lev_d_buf)
            rank_cos_d_ismin = (cos_d_buf == np.nanmin(cos_d_buf)).astype(int)
            rank_euc_d_ismin = (euc_d_buf == np.nanmin(euc_d_buf)).astype(int)
            rank_lev_d_ismin = (lev_d_buf == np.nanmin(lev_d_buf)).astype(int)
            rank_features.append(np.array([rank_cos_d, rank_euc_d, rank_lev_d, \
                                           rank_cos_d_ismin, rank_euc_d_ismin, rank_lev_d_ismin]).T)

        cnt += 1
        if cnt >= n_samples:
            break
        
train = pd.DataFrame()
train['example_id'] = ids
train['target'] = targets
train['CorrectString'] = targets_str
train['CorrectString_short'] = targets_str_short
train['CandidateString'] = candidates_str

features = np.array(features)
features_df = pd.DataFrame(features)
features_df.columns = [f'feature_{i}' for i in range(features.shape[1])]
train = pd.concat([train, features_df], axis=1)

rank_features = np.concatenate(rank_features, axis=0)
rank_features_df = pd.DataFrame(rank_features)
rank_features_df.columns = [f'rank_feature_{i}' for i in range(rank_features.shape[1])]
train = pd.concat([train, rank_features_df], axis=1)

del features, features_df, \
    rank_features, rank_features_df
gc.collect()

train.to_csv('train_data.csv', index=False)
print(f'train.shape: {train.shape}')
print(f'Mean target: {train.target.mean()}')
train.head(20)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  dist = 1.0 - uv / np.sqrt(uu * vv)


## Preparing test dataset

In [4]:
%%time
ids = []
question_tfidfs = []
answer_tfidfs = []
candidates_str = []
targets = []
targets_str = []
features = []
rank_features = []

with open('/kaggle/input/tensorflow2-question-answering/simplified-nq-test.jsonl', 'r') as json_file:
    for line in tqdm(json_file):
        json_data = json.loads(line) 
        
        # TFIDF for document
        stop_words = text.ENGLISH_STOP_WORDS.union(["book"])
        tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words=stop_words)
        tfidf.fit([json_data['document_text']])
        
        # TFIDF for question
        question = json_data['question_text']
        question_tfidf = tfidf.transform([question]).todense()
        
        doc_tokenized = json_data['document_text'].split(' ')
        candidates = json_data['long_answer_candidates']
        candidates = [c for c in candidates if c['top_level'] == True]
        
        cos_d_buf = []
        euc_d_buf = []
        lev_d_buf = []
        
        for c in candidates:
            ids.append(str(json_data['example_id']))
            
            # TFIDF for candidate answer
            start_token = c['start_token']
            end_token = c['end_token']
            answer = ' '.join(doc_tokenized[start_token:end_token])
            answer_tfidf = tfidf.transform([answer]).todense()
            
            # Extract some features
            cos_d = spatial.distance.cosine(question_tfidf, answer_tfidf)
            euc_d = np.linalg.norm(question_tfidf - answer_tfidf)
            lev_d = Levenshtein.distance(clean(question), clean(answer))
            lev_r = Levenshtein.ratio(clean(question), clean(answer))
            jar_s = Levenshtein.jaro(clean(question), clean(answer))
            jaw_s = Levenshtein.jaro_winkler(clean(question), clean(answer))
            tfidf_score = np.sum(question_tfidf*answer_tfidf.T)
            question_tfidf_sum = np.sum(question_tfidf)
            answer_tfidf_sum = np.sum(answer_tfidf)

            features.append([
                cos_d, 
                euc_d, 
                lev_d, 
                lev_r, 
                jar_s, 
                jaw_s, 
                tfidf_score, 
                question_tfidf_sum, 
                answer_tfidf_sum
            ])

            cos_d_buf.append(cos_d)
            euc_d_buf.append(euc_d)
            lev_d_buf.append(lev_d)
            
            candidates_str.append(str(start_token) + ':' + str(end_token))
        
        rank_cos_d = np.argsort(cos_d_buf)
        rank_euc_d = np.argsort(euc_d_buf)
        rank_lev_d = np.argsort(lev_d_buf)
        rank_cos_d_ismin = (cos_d_buf == np.nanmin(cos_d_buf)).astype(int)
        rank_euc_d_ismin = (euc_d_buf == np.nanmin(euc_d_buf)).astype(int)
        rank_lev_d_ismin = (lev_d_buf == np.nanmin(lev_d_buf)).astype(int)
        rank_features.append(np.array([rank_cos_d, rank_euc_d, rank_lev_d, \
                                       rank_cos_d_ismin, rank_euc_d_ismin, rank_lev_d_ismin]).T)
        
test = pd.DataFrame()
test['example_id'] = ids
test['CandidateString'] = candidates_str

features = np.array(features)
features_df = pd.DataFrame(features)
features_df.columns = [f'feature_{i}' for i in range(features.shape[1])]
test = pd.concat([test, features_df], axis=1)

rank_features = np.concatenate(rank_features, axis=0)
rank_features_df = pd.DataFrame(rank_features)
rank_features_df.columns = [f'rank_feature_{i}' for i in range(rank_features.shape[1])]
test = pd.concat([test, rank_features_df], axis=1)

del features, features_df, rank_features, rank_features_df
gc.collect()

test.to_csv('test_data.csv', index=False)
print(f'test.shape: {test.shape}')
test.head(10)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




test.shape: (13412, 17)
CPU times: user 29.4 s, sys: 208 ms, total: 29.6 s
Wall time: 29 s


Unnamed: 0,example_id,CandidateString,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,rank_feature_0,rank_feature_1,rank_feature_2,rank_feature_3,rank_feature_4,rank_feature_5
0,-1220107454853145579,18:136,0.850419,1.304162,477.0,0.104673,0.425633,0.425633,0.149581,2.236068,3.63142,1,1,1,0,0,0
1,-1220107454853145579,141:211,0.427402,0.924556,222.0,0.206406,0.496494,0.496494,0.572598,2.236068,4.225217,5,5,4,1,1,1
2,-1220107454853145579,240:336,0.836092,1.29313,376.0,0.133333,0.448399,0.448399,0.163908,2.236068,5.986303,3,3,5,0,0,0
3,-1220107454853145579,336:425,0.727202,1.205987,321.0,0.142857,0.441379,0.441379,0.272798,2.236068,5.33745,2,2,3,0,0,0
4,-1220107454853145579,425:488,0.841886,1.297602,230.0,0.188153,0.431496,0.431496,0.158114,2.236068,4.949747,4,4,2,0,0,0
5,-1220107454853145579,488:570,0.66045,1.149304,266.0,0.167183,0.443715,0.443715,0.33955,2.236068,4.664005,0,0,0,0,0,0
6,8777415633185303067,23:190,0.868018,1.317587,873.0,0.056216,0.388144,0.388144,0.131982,1.732051,3.352803,8,8,13,0,0,0
7,8777415633185303067,190:269,0.505128,1.005115,312.0,0.137741,0.424499,0.424499,0.494872,1.732051,5.571429,1,1,7,0,0,0
8,8777415633185303067,269:363,0.61764,1.111432,380.0,0.12037,0.460943,0.460943,0.38236,1.732051,6.490209,14,14,1,0,0,0
9,8777415633185303067,363:507,0.612899,1.107158,559.0,0.081967,0.438645,0.438645,0.387101,1.732051,6.034306,3,3,5,0,0,0


## Build the model

In [5]:
p_buf = []
n_splits = 4

kf = GroupKFold(
    n_splits=n_splits)

err_buf = []   

cols_to_drop = ['example_id', 'target', 'CorrectString', 'CorrectString_short', 'CandidateString']

X = train.drop(cols_to_drop, axis=1, errors='ignore')
y = train['target'].values
g = train['example_id'].values

X_test = test.drop(cols_to_drop, axis=1, errors='ignore')
id_test = test['example_id'].values

print(f'X.shape: {X.shape}, y.shape: {y.shape}')
print(f'X_test.shape: {X_test.shape}')

n_features = X.shape[1]

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 16,
    'learning_rate': 0.0055, 
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 5,
    'verbose': -1,
    'num_threads': 5,
}

for fold_i, (train_index, valid_index) in enumerate(kf.split(X, y, g)):
    print('Fold {}/{}'.format(fold_i + 1, n_splits))
    params = lgb_params.copy() 
    
    X_train, y_train = X.iloc[train_index], y[train_index]
    X_valid, y_valid = X.iloc[valid_index], y[valid_index]

    print(f'X_train.shape: {X_train.shape}, X_valid.shape: {X_valid.shape}')
    feature_names = list(X_train.columns)

    lgb_train = lgb.Dataset(
        X_train, 
        y_train, 
        feature_name=feature_names,
        )
    lgb_train.raw_data = None

    lgb_valid = lgb.Dataset(
        X_valid, 
        y_valid,
        feature_name=feature_names,
        )
    lgb_valid.raw_data = None

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=100, 
        verbose_eval=100, 
    )

    # Feature importance
    if fold_i == 0:
        importance = model.feature_importance()
        model_fnames = model.feature_name()
        tuples = sorted(zip(model_fnames, importance), key=lambda x: x[1])[::-1]
        tuples = [x for x in tuples if x[1] > 0]
        print('Important features:')
        for i in range(20):
            if i < len(tuples):
                print(i, tuples[i])
            else:
                break

    # Evaluate model
    p = model.predict(X.loc[valid_index], num_iteration=model.best_iteration)
    valid_df = train.loc[valid_index]
    valid_df['pred'] = p
    pred_df = valid_df.sort_values('pred', ascending=True).groupby('example_id').tail(1)

    pred_df_long = pred_df[['example_id', 'CorrectString', 'CandidateString']]
    pred_df_long.rename({'CandidateString': 'PredictionString'}, axis=1, inplace=True)
    pred_df_long['example_id'] = pred_df_long['example_id'].apply(lambda x: x + '_long')

    pred_df_short = pred_df[['example_id', 'CorrectString_short', 'CandidateString']]
    pred_df_short.rename({'CorrectString_short': 'CorrectString', 'CandidateString': 'PredictionString'}, \
                         axis=1, inplace=True)
    pred_df_short['example_id'] = pred_df_short['example_id'].apply(lambda x: x + '_short')
    pred_df_short['PredictionString'] = ''

    pred_df = pd.concat([pred_df_long, pred_df_short], axis=0).sort_values('example_id')
#     print(pred_df.head(20))

    err = f1_score(pred_df['CorrectString'].values, pred_df['PredictionString'].values, average='micro')
    print('{} F1: {}'.format(fold_i, err))
    
    # Inference on test data
    p_test = model.predict(X_test[feature_names], num_iteration=model.best_iteration)
    p_buf.append(p_test)
    err_buf.append(err)

#     if fold_i >= 0: # Comment this to run several folds
#         break

    del model, lgb_train, lgb_valid, p
    gc.collect()

X.shape: (1441992, 15), y.shape: (1441992,)
X_test.shape: (13412, 15)
Fold 1/4
X_train.shape: (1081494, 15), X_valid.shape: (360498, 15)
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0987246	valid_1's binary_logloss: 0.0992234
[200]	training's binary_logloss: 0.0940822	valid_1's binary_logloss: 0.0947264
[300]	training's binary_logloss: 0.0918887	valid_1's binary_logloss: 0.0926376
[400]	training's binary_logloss: 0.0906643	valid_1's binary_logloss: 0.0914977
[500]	training's binary_logloss: 0.0898095	valid_1's binary_logloss: 0.0907414
[600]	training's binary_logloss: 0.089268	valid_1's binary_logloss: 0.0902963
[700]	training's binary_logloss: 0.0888553	valid_1's binary_logloss: 0.0899958
[800]	training's binary_logloss: 0.0885396	valid_1's binary_logloss: 0.0898006
[900]	training's binary_logloss: 0.08828	valid_1's binary_logloss: 0.0896585
[1000]	training's binary_logloss: 0.0880476	valid_1's binary_logloss: 0.0895631
Did not meet 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

0 F1: 0.2629198966408269
Fold 2/4
X_train.shape: (1081494, 15), X_valid.shape: (360498, 15)
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0988796	valid_1's binary_logloss: 0.0988814
[200]	training's binary_logloss: 0.0942539	valid_1's binary_logloss: 0.0943567
[300]	training's binary_logloss: 0.092058	valid_1's binary_logloss: 0.0922407
[400]	training's binary_logloss: 0.0908232	valid_1's binary_logloss: 0.0910993
[500]	training's binary_logloss: 0.0899632	valid_1's binary_logloss: 0.0903146
[600]	training's binary_logloss: 0.0894196	valid_1's binary_logloss: 0.0898552
[700]	training's binary_logloss: 0.0890195	valid_1's binary_logloss: 0.0895629
[800]	training's binary_logloss: 0.0887008	valid_1's binary_logloss: 0.0893485
[900]	training's binary_logloss: 0.088435	valid_1's binary_logloss: 0.0892007
[1000]	training's binary_logloss: 0.0881882	valid_1's binary_logloss: 0.0890867
Did not meet early stopping. Best iteration is:
[1000]	tr

In [6]:
err_mean = np.mean(err_buf)
err_std = np.std(err_buf)
print('F1 = {:.4f} +/- {:.4f}'.format(err_mean, err_std))

F1 = 0.2644 +/- 0.0022


## Prepare submission

In [7]:
valid_df = train.loc[valid_index]
test['pred'] = np.mean(p_buf, axis=0)
pred_df = test.sort_values('pred', ascending=True).groupby('example_id').tail(1)

pred_df_long = pred_df[['example_id', 'CandidateString']]
pred_df_long.rename({'CandidateString': 'PredictionString'}, axis=1, inplace=True)
pred_df_long['example_id'] = pred_df_long['example_id'].apply(lambda x: str(x) + '_long')

pred_df_short = pred_df[['example_id', 'CandidateString']]
pred_df_short.rename({'CandidateString': 'PredictionString'}, axis=1, inplace=True)
pred_df_short['example_id'] = pred_df_short['example_id'].apply(lambda x: str(x) + '_short')
pred_df_short['PredictionString'] = ''

subm = pd.concat([pred_df_long, pred_df_short], axis=0).sort_values('example_id')
subm.to_csv('submission.csv', index=False)
print(f'subm.shape: {subm.shape}')
print(subm.head(20))

subm.shape: (692, 2)
                       example_id PredictionString
10999   -1011141123527297803_long          542:582
10999  -1011141123527297803_short                 
5036    -1028916936938579349_long          781:923
5036   -1028916936938579349_short                 
9426    -1055197305756217938_long          221:335
9426   -1055197305756217938_short                 
5467    -1074129516932871805_long        3491:3607
5467   -1074129516932871805_short                 
2915    -1114334749483663139_long         744:3809
2915   -1114334749483663139_short                 
1368    -1152268629614456016_long          491:526
1368   -1152268629614456016_short                 
1871    -1219507076732106786_long        2398:2500
1871   -1219507076732106786_short                 
1       -1220107454853145579_long          141:211
1      -1220107454853145579_short                 
10661   -1237358188352001279_long          815:857
10661  -1237358188352001279_short                 
2261    -1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
