Functions

In [1]:
#######################
# EXTERNAL LIBRARIES
#######################
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
from wordfreq import word_frequency
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import fbeta_score,make_scorer,accuracy_score, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import random
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import pickle

%matplotlib inline

#######################
# RANDOM STATE
#######################
random_state = 13
random.seed(random_state)
np.random.seed(random_state)

#######################
# FEATURE ENGINEERING: PROCESSING TEXT (X)
#######################

##############
### Word rarity
##############

def rarityfin(word, lang):
    """
    given a word, returns a number which indicates how rare is it.
    """
    result = -np.log(word_frequency(word, lang))
    if str(result)=="inf": #words which are not part of the corpus are given value 0 so as not to be considered in the following steps
        return 0
    return result

def wordrarity(doc, option = "all"):
    """
    given a text, returns a dataframe with the following columns:
    - part of speech: noun, pronoun, verb, adverb ...
    - rarity at percentiles 60 and 85 for each POS (computed with the previous functino)
    """

    leaveout = []
    if option =="clean":
        leaveout = ["X","SPACE", "SYM", "PUNCT"]

    firsttext = pd.Series(doc)
    wordpos = pd.DataFrame({'word':firsttext,'pos':firsttext.apply(lambda x: x.pos_)}) #part-of-speech
    wordpos = wordpos[wordpos["pos"].isin(leaveout)==False] 
    wordpos["word"] = wordpos["word"].apply(lambda x: x.text.lower())
    wordpos["rarity"] = wordpos["word"].apply(lambda x: rarityfin(x,"en"))
    summary = wordpos.groupby("pos").agg({"rarity":[lambda x: np.percentile(x,q=60),lambda x: np.percentile(x,q=85)]}).T.reset_index(drop=True)
    features = {}
    for i in summary.index:
        dictrow = summary.loc[i].to_dict()
        processed = {entry+"_"+str(i):dictrow[entry] for entry in dictrow}
        features.update(processed)
    return features

##############
### Sentence complexity
##############

def aux_sentence_complexity(sentence, option = "all"):
    """
    given a single sentence, it computes
    - the total number of words
    - the number of words belonging to different categories (CCONJ, SCONJ, AUX, VERB)
    - variety: the number of different parts-of-speech found in the sentence
    """
    leaveout = []
    if option =="clean":
        leaveout = ["X","SPACE", "SYM", "PUNCT"]

    dicti = {'LEN':0,'CCONJ':0, 'SCONJ':0, 'AUX':0, 'VERB':0, 'VARIETY':0}

    poses = []
    for token in sentence:
        if token.pos_ not in leaveout: 
            poses.append(token.pos_)
            if token.pos_ in dicti:
                dicti[token.pos_] += 1
    dicti["LEN"] = len(sentence)
    dicti["VARIETY"] = len(set(poses))
    return dicti

def sentence_complexity(doc, option, ignore_length):
    """
    given a text, computes the median values (check previous function) considering the longest sentences (20 %)
    """
    series_sentences = pd.Series(doc.sents)
    summarysentences = series_sentences.apply(lambda x: aux_sentence_complexity(x, option)).apply(pd.Series).sort_values(["LEN","SCONJ","AUX","CCONJ","VERB"])
    reduced = summarysentences[summarysentences['LEN']>=5]
    features = reduced.iloc[-len(reduced)//5:].median().to_dict()
    if ignore_length == False:
        features["NR_SENT"] = len(series_sentences)
    return features

#########
### Text difficulty
##########

def extract_features(doc, option, ignore_length):
    """
    Computes word and sentence difficulty
    """
    f = wordrarity(doc, option)
    f.update(sentence_complexity(doc, option, ignore_length))
    if ignore_length == False:
        f.update({"NR_WORDS":len(doc)})
    return f

##########
### Main
##########

def process_text(texts, option, ignore_length):
    """
    Main function to create X from raw texts
    """
    nlp = spacy.load("en_core_web_lg") #load model
    texts["text_nlp"] = texts["text"].apply(nlp) #nlp English language model
    processed_docs=[]
    for doc in list(texts["text_nlp"]):
        processed_docs.append(extract_features(doc, option, ignore_length))
    X = pd.DataFrame(processed_docs)
    return X

#######################
# FEATURE ENGINEERING: LABELLING (Y)
#######################
def process_label_num(texts):
    """
    from CEFR levels to numbers, A1 ->0, A2 -> 1...
    """  
    leveldict0 = {"A1": 0, "A2": 1, "B1": 2, "B2": 3, "C1": 4, "C2": 5}
    Y = texts[["label"]]
    Y["label"] = Y["label"].apply(lambda x: leveldict0[x])
    return Y

def process_label_smcat(texts):
    """
    from CEFR levels to multilabel. example for B1: >=A2 -> 1, >=B1 -> 1, >=B2 -> 0... rest -> 0
    """
    leveldict1 = {"A1": [0, 0, 0, 0, 0], "A2": [1, 0, 0, 0, 0], "B1": [1, 1, 0, 0, 0], "B2": [1, 1, 1, 0, 0], "C1": [1, 1, 1, 1, 0], "C2": [1, 1, 1, 1, 1]}
    Y = texts["label"].apply(lambda x: leveldict1[x]).apply(pd.Series)
    Y.columns = [">=A2",">=B1", ">=B2", ">=C1", "C2"]
    return Y

#######################
# TRAIN - TEST SPLIT
#######################

def train_test_index_split(df, frac=0.8):
    """
    division in train-test (list of index)
    """
    train_index = list(df.sample(frac=frac).index)
    test_index = list(set(df.index)-set(train_index))
    return train_index, test_index

def split_data(data, train_index, test_index):
    """
    Given X, y, and the lists of indexes belonging to train and test sets, creates train and test datasets
    """

    data_train = data[data.index.isin(train_index)]
    data_test = data[data.index.isin(test_index)]
    return data_train, data_test

#######################
# TRAINING
#######################
def train_xgb(X_train,y_train, problem_type):
    """
    Trains an xgb model on training data
    """

    if problem_type == "classification":
        model = xgb.XGBClassifier()
        scoring = "accuracy"
    elif problem_type =="regression":
        model = xgb.XGBRegressor()
        scoring = "neg_mean_squared_error"
    elif problem_type =="multilabel":
        list_models = []
        for column in y_train.columns:
            single_model = train_xgb(X_train, y_train[column],"classification")
            list_models.append(single_model)
        return list_models

    xgbc_BS_complete = {
        'learning_rate':Real(0.001,1,"log-uniform"),
        'max_depth': Integer(1, 20,'uniform'),
        'gamma': (1, 100, 'uniform'),
        'subsample': Real(0.01, 0.99, 'uniform'),
        'colsample_bytree': Real(0.01, 0.99, 'uniform'),
        'reg_alpha': Real(1, 100, 'uniform'),
        'n_estimators': Integer(50, 500,'log-uniform'),
        'max_delta_step': Real(0, 10),
        'colsample_bylevel': Real(0.01, 1.0, 'uniform'),
        'reg_lambda': Real(1e-9, 100, 'log-uniform'),
    }
    if (max(y_train)==1) & (problem_type =="classification"): #binary classification
        xgbc_BS_complete["scale_pos_weight"] = Real(0.2, 4,"log-uniform")

    search = BayesSearchCV(model, xgbc_BS_complete, n_iter=100, # specify how many iterations
        scoring=scoring, n_jobs=-1, cv=5,verbose=0)
    search.fit(X_train,y_train)
    model.set_params(**search.best_params_)
    model.fit(X_train,y_train)
    return model

#######################
# TESTING
#######################
def confussion_matrix(pred_te, y_test):
    A = pd.DataFrame(pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test.values.squeeze()}).groupby("Predicted_values").Real_values.value_counts())
    A.columns = ["values"]
    A.reset_index(inplace=True)
    return A.pivot(index="Predicted_values",columns="Real_values",values="values").fillna(0)

def evaluate_model(
    model, X_test, y_test
):
    """
    Computes confusion matrix and accuracy
    """
    
    if type(model)==list:
        output_multibin = []
        for model_i in model:
            output_multibin.append(model_i.predict(X_test))
        y_pred = pd.DataFrame(output_multibin).T.sum(axis=1)
        incoherent = pd.DataFrame(output_multibin).diff().max()==1
        FI = pd.DataFrame(map(lambda x: x.feature_importances_, model)).T.set_index(model[0].feature_names_in_)
        FI["median"] = FI.apply(lambda x: x.median(),axis=1)
        FI = FI.sort_values("median",ascending=False)
    else:
        y_pred = pd.Series(model.predict(X_test))
        incoherent = pd.Series([False]*len(y_pred))
        FI = pd.Series(model.feature_importances_).set_axis(model.feature_names_in_).sort_values(ascending=False)
        
    dicti = {0:"A1",1:"A2",2:"B1",3:"B2",4:"C1",5:"C2"}
    y_pred_corr = y_pred.apply(lambda x: dicti[int(round(x,0))])
    y_pred_corr.loc[incoherent]="INC"

    cmatrix = confussion_matrix(y_pred_corr.to_list(), y_test.applymap(lambda x: dicti[x]))
    number_levels = min([len(cmatrix),len(cmatrix.columns)])
    sumdiagonal = 0
    sumnextdiag = 0

    for i in range(number_levels):
        sumdiagonal += cmatrix.iloc[i,i]
        if i<number_levels-1:
            sumnextdiag += cmatrix.iloc[i,i+1]+cmatrix.iloc[i+1,i]

    num_records = len(y_pred)
    acc = sumdiagonal/num_records
    acc_relax= (sumdiagonal+sumnextdiag) / num_records

    print("Accuracy on test data:", acc)
    print("'Relaxed accuracy' on test data:", acc_relax)
    print("Models incoherence:", sum(incoherent)/num_records)
    display(cmatrix)
    display(FI)

Predicting level from text

In [2]:
texts = pd.read_csv("data/texts.csv")
X = process_text(texts, option= "clean",ignore_length=False)
Y_num = process_label_num(texts)
Y_multlab = process_label_smcat(texts)
train_index, test_index = train_test_index_split(texts)

X_train, X_test = split_data(X, train_index, test_index)
y_num_train, y_num_test =  split_data(Y_num, train_index, test_index)
y_multlab_train, y_multlab_test = split_data(Y_multlab, train_index, test_index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y["label"] = Y["label"].apply(lambda x: leveldict0[x])


In [3]:
xgbr = train_xgb(X_train, y_num_train,"regression")
evaluate_model(xgbr, X_test, y_num_test)

Accuracy on test data: 0.5852842809364549
'Relaxed accuracy' on test data: 0.959866220735786
Models incoherence: 0.0


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,44.0,9.0,3.0,0.0,0.0,0.0
A2,18.0,40.0,5.0,2.0,0.0,0.0
B1,2.0,13.0,14.0,9.0,1.0,1.0
B2,0.0,0.0,15.0,22.0,9.0,0.0
C1,0.0,0.0,1.0,16.0,34.0,16.0
C2,0.0,0.0,0.0,2.0,2.0,21.0


LEN         0.263475
NR_WORDS    0.137522
VERB_0      0.119910
ADJ_1       0.115442
NOUN_0      0.054990
ADV_1       0.042928
INTJ_1      0.025562
ADV_0       0.018755
ADP_0       0.017399
CCONJ       0.016095
NR_SENT     0.012544
NOUN_1      0.011492
AUX         0.010853
PRON_1      0.010692
VERB        0.010581
ADP_1       0.010228
NUM_0       0.009271
VARIETY     0.009161
INTJ_0      0.009124
CCONJ_1     0.008562
PART_1      0.007796
PRON_0      0.007772
AUX_0       0.007444
PART_0      0.007251
VERB_1      0.007184
ADJ_0       0.006597
SCONJ_0     0.006152
PROPN_1     0.005480
AUX_1       0.005050
SCONJ_1     0.004792
CCONJ_0     0.004626
SCONJ       0.004064
PROPN_0     0.003973
DET_1       0.003669
NUM_1       0.003563
DET_0       0.000000
dtype: float32

In [4]:
xgbc = train_xgb(X_train, y_num_train,"classification")
evaluate_model(xgbc, X_test, y_num_test)

Accuracy on test data: 0.6387959866220736
'Relaxed accuracy' on test data: 0.959866220735786
Models incoherence: 0.0


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,52.0,10.0,2.0,1.0,0.0,0.0
A2,12.0,46.0,5.0,1.0,1.0,0.0
B1,0.0,5.0,14.0,6.0,1.0,0.0
B2,0.0,1.0,16.0,28.0,12.0,2.0
C1,0.0,0.0,1.0,13.0,25.0,10.0
C2,0.0,0.0,0.0,2.0,7.0,26.0


NR_WORDS    0.116493
LEN         0.095783
VERB_0      0.043197
NR_SENT     0.031561
VERB_1      0.031450
NOUN_0      0.025972
ADJ_1       0.025489
AUX_0       0.025178
AUX         0.024163
ADJ_0       0.023973
ADV_0       0.023293
ADP_1       0.023200
ADV_1       0.023054
NUM_0       0.023012
SCONJ_0     0.022236
VERB        0.022015
PROPN_0     0.021879
VARIETY     0.021818
INTJ_1      0.021793
PART_1      0.021658
AUX_1       0.021634
ADP_0       0.021633
INTJ_0      0.021475
SCONJ       0.021429
CCONJ_0     0.021223
PROPN_1     0.021210
PRON_1      0.021078
NOUN_1      0.021063
CCONJ_1     0.020962
NUM_1       0.020882
PRON_0      0.020687
CCONJ       0.020608
DET_1       0.020573
SCONJ_1     0.020520
DET_0       0.019623
PART_0      0.018182
dtype: float32

In [5]:
xgbml = train_xgb(X_train, y_multlab_train, "multilabel")
evaluate_model(xgbml, X_test, y_num_test)

Accuracy on test data: 0.6688963210702341
'Relaxed accuracy' on test data: 0.9464882943143813
Models incoherence: 0.0033444816053511705


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,61.0,16.0,5.0,0.0,0.0,0.0
A2,2.0,42.0,3.0,2.0,0.0,0.0
B1,0.0,4.0,14.0,6.0,2.0,0.0
B2,0.0,0.0,15.0,30.0,8.0,2.0
C1,0.0,0.0,1.0,10.0,25.0,8.0
C2,0.0,0.0,0.0,3.0,11.0,28.0
INC,1.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3,4,median
LEN,0.242047,0.033062,0.25302,0.304708,0.087967,0.242047
NR_WORDS,0.177173,0.023408,0.058059,0.061567,0.101931,0.061567
VERB_0,0.0,0.050358,0.037398,0.088799,0.050138,0.050138
VERB_1,0.045061,0.012829,0.124829,0.032235,0.018164,0.032235
ADJ_0,0.0,0.029467,0.019555,0.033918,0.042016,0.029467
NOUN_0,0.0,0.024451,0.015117,0.038561,0.088858,0.024451
VERB,0.263209,0.07724,0.014981,0.023868,0.012576,0.023868
ADV_1,0.0,0.021852,0.023331,0.03984,0.087536,0.023331
VARIETY,0.147984,0.068662,0.022583,0.018358,0.022437,0.022583
ADJ_1,0.0,0.010627,0.024347,0.043325,0.022314,0.022314


Lingua Ignis use case

In [6]:
Xnl = process_text(texts, option= "clean",ignore_length=True)
Xnl_train, Xnl_test = split_data(Xnl, train_index, test_index)


In [7]:
xgbr_nl = train_xgb(Xnl_train, y_num_train,"regression")
evaluate_model(xgbr_nl, Xnl_test, y_num_test)

Accuracy on test data: 0.6187290969899666
'Relaxed accuracy' on test data: 0.9632107023411371
Models incoherence: 0.0


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,42.0,7.0,2.0,1.0,0.0,0.0
A2,21.0,44.0,5.0,0.0,0.0,0.0
B1,1.0,11.0,18.0,9.0,1.0,0.0
B2,0.0,0.0,11.0,27.0,12.0,1.0
C1,0.0,0.0,2.0,11.0,32.0,15.0
C2,0.0,0.0,0.0,3.0,1.0,22.0


LEN        0.121490
VARIETY    0.112848
VERB_0     0.104676
VERB       0.103598
CCONJ      0.078249
ADJ_1      0.048351
VERB_1     0.047428
ADV_1      0.040678
CCONJ_0    0.034452
NOUN_0     0.029970
ADJ_0      0.028801
PROPN_1    0.023999
NUM_1      0.020352
PRON_1     0.015569
ADV_0      0.015555
NOUN_1     0.012414
PART_1     0.012396
AUX        0.012168
ADP_0      0.011519
PART_0     0.010489
AUX_1      0.010130
SCONJ      0.010051
INTJ_1     0.009988
DET_0      0.009729
AUX_0      0.009365
CCONJ_1    0.009096
INTJ_0     0.008598
NUM_0      0.007635
ADP_1      0.007524
PRON_0     0.007298
SCONJ_0    0.006981
SCONJ_1    0.006517
DET_1      0.006488
PROPN_0    0.005601
dtype: float32

In [8]:
xgbc_nl = train_xgb(Xnl_train, y_num_train,"classification")
evaluate_model(xgbc_nl, Xnl_test, y_num_test)

Accuracy on test data: 0.6454849498327759
'Relaxed accuracy' on test data: 0.9498327759197325
Models incoherence: 0.0


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,53.0,14.0,3.0,1.0,0.0,0.0
A2,9.0,43.0,7.0,1.0,1.0,0.0
B1,2.0,5.0,13.0,4.0,1.0,0.0
B2,0.0,0.0,14.0,31.0,12.0,2.0
C1,0.0,0.0,1.0,11.0,28.0,11.0
C2,0.0,0.0,0.0,3.0,4.0,25.0


LEN        0.094888
VARIETY    0.079085
VERB       0.045968
CCONJ      0.044304
VERB_0     0.041731
VERB_1     0.037861
ADV_1      0.032564
CCONJ_0    0.030375
ADJ_1      0.029051
NUM_1      0.027838
ADV_0      0.026331
NOUN_0     0.025660
AUX_0      0.025201
NUM_0      0.024689
ADJ_0      0.024620
AUX_1      0.023656
NOUN_1     0.023065
INTJ_1     0.023034
SCONJ      0.023010
PROPN_1    0.022538
ADP_1      0.022357
ADP_0      0.022161
INTJ_0     0.021890
PRON_1     0.021804
CCONJ_1    0.021520
SCONJ_0    0.021504
DET_0      0.021215
PART_1     0.021083
PROPN_0    0.021022
AUX        0.020926
SCONJ_1    0.020516
PART_0     0.019959
PRON_0     0.019576
DET_1      0.018996
dtype: float32

In [9]:
xgbml_nl = train_xgb(Xnl_train, y_multlab_train, "multilabel")
evaluate_model(xgbml_nl, Xnl_test, y_num_test)

Accuracy on test data: 0.6321070234113713
'Relaxed accuracy' on test data: 0.9431438127090301
Models incoherence: 0.0


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,54.0,18.0,6.0,1.0,0.0,0.0
A2,9.0,37.0,5.0,2.0,1.0,0.0
B1,1.0,7.0,11.0,4.0,0.0,0.0
B2,0.0,0.0,14.0,32.0,9.0,1.0
C1,0.0,0.0,2.0,9.0,28.0,10.0
C2,0.0,0.0,0.0,3.0,8.0,27.0


Unnamed: 0,0,1,2,3,4,median
LEN,0.02994,0.181234,0.137157,0.12965,0.055305,0.12965
VERB_0,0.060316,0.039274,0.06215,0.057879,0.045563,0.057879
VARIETY,0.056287,0.210633,0.081784,0.031783,0.029607,0.056287
ADV_1,0.041031,0.04444,0.04723,0.091659,0.056018,0.04723
ADJ_1,0.02059,0.016974,0.053039,0.047718,0.046405,0.046405
VERB_1,0.02687,0.031893,0.077994,0.03871,0.04784,0.03871
VERB,0.048286,0.024215,0.036673,0.044308,0.033523,0.036673
NOUN_0,0.034737,0.013306,0.020045,0.036434,0.048053,0.034737
NUM_1,0.031881,0.044387,0.043769,0.018223,0.027944,0.031881
ADJ_0,0.085921,0.01333,0.024393,0.04136,0.031585,0.031585


Saving train-test splits and models

In [10]:

folder_dataset = "output/processed_dataset/"
X.to_csv(folder_dataset + "X.csv", index = False)
Xnl.to_csv(folder_dataset + "Xnl.csv", index = False)
Y_num.to_csv(folder_dataset + "Y_num.csv", index = False)
Y_multlab.to_csv(folder_dataset + "Y_multlab.csv", index= False)

with open(folder_dataset+'text_index_list.pickle', 'wb') as handle:
    pickle.dump(test_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

folder_models = "output/models/"

with open(folder_models+'xgbr.pickle', 'wb') as handle:
    pickle.dump(xgbr, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder_models+'xgbr_nl.pickle', 'wb') as handle:
    pickle.dump(xgbr_nl, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder_models+'xgbc.pickle', 'wb') as handle:
    pickle.dump(xgbc, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder_models+'xgbc_nl.pickle', 'wb') as handle:
    pickle.dump(xgbc_nl, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder_models+'xgbml.pickle', 'wb') as handle:
    pickle.dump(xgbml, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder_models+'xgbml_nl.pickle', 'wb') as handle:
    pickle.dump(xgbml_nl, handle, protocol=pickle.HIGHEST_PROTOCOL)

Other tests

In [11]:
# There may be overfitting. Given the nature of the problem (level classification can be subjective), it is not an issue
# Try other scoring
from sklearn.model_selection import cross_validate
cv_results = cross_validate(xgbc_nl, Xnl_test, y_num_test, cv=5, scoring="accuracy",return_train_score=True)
cv_results

{'fit_time': array([2.27601361, 2.28601742, 2.44009161, 2.42053866, 2.41001987]),
 'score_time': array([0.01600051, 0.01600385, 0.01692915, 0.01800179, 0.01599789]),
 'test_score': array([0.55      , 0.51666667, 0.58333333, 0.61666667, 0.55932203]),
 'train_score': array([0.9665272 , 0.958159  , 0.9790795 , 0.9748954 , 0.96666667])}