External libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
from wordfreq import word_frequency
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import fbeta_score,make_scorer,accuracy_score, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import random
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


random_state = 13

random.seed(random_state)
np.random.seed(random_state)

%matplotlib inline

Feature & label engineering

In [2]:
leaveout = []#["X","SPACE", "SYM", "PUNCT"]


def freqfin(word, lang): #given a word, returns
    result = -np.log(word_frequency(word, lang))
    if str(result)=="inf":
        return 0
    return result

def wordrarity(doc):
    firsttext = pd.Series(doc)
    wordpos = pd.DataFrame({'word':firsttext,'pos':firsttext.apply(lambda x: x.pos_)})
    wordpos = wordpos[wordpos["pos"].isin(leaveout)==False]
    wordpos["word"] = wordpos["word"].apply(lambda x: x.text.lower())
    wordpos["freq"] = wordpos["word"].apply(lambda x: freqfin(x,"en"))
    summary = wordpos.groupby("pos").agg({"freq":[lambda x: np.percentile(x,q=60),lambda x: np.percentile(x,q=85)]}).T.reset_index(drop=True)
    features = {}
    for i in summary.index:
        dictrow = summary.loc[i].to_dict()
        processed = {entry+"_"+str(i):dictrow[entry] for entry in dictrow}
        features.update(processed)
    return features

def aux_sentence_complexity(sentence):
    dicti = {'LEN':0,'CCONJ':0, 'SCONJ':0, 'AUX':0, 'VERB':0, 'VARIETY':0}

    poses = []
    for token in sentence:
        if token.pos_ not in leaveout: #added later
            poses.append(token.pos_)
            if token.pos_ in dicti:
                dicti[token.pos_] += 1
    dicti["LEN"] = len(sentence)
    dicti["VARIETY"] = len(set(poses))
    return dicti

def sentence_complexity(doc):
    series_sentences = pd.Series(doc.sents)
    summarysentences = series_sentences.apply(aux_sentence_complexity).apply(pd.Series).sort_values(["LEN","SCONJ","AUX","CCONJ","VERB"])
    reduced = summarysentences[summarysentences['LEN']>=5]
    features = reduced.iloc[-len(reduced)//5:].mean().to_dict()
    features["NR_SENT"] = len(series_sentences)
    return features

def extract_features(doc):
    f = wordrarity(doc)
    f.update(sentence_complexity(doc))
    f.update({"NR_WORDS":len(doc)})
    return f

def process_text(texts):
    nlp = spacy.load("en_core_web_lg")
    texts["text_nlp"] = texts["text"].apply(nlp)
    processed_docs=[]
    for doc in list(texts["text_nlp"]):
        processed_docs.append(extract_features(doc))
    X = pd.DataFrame(processed_docs)
    return X

def process_label_cat(texts):
    Y = texts[["label"]]
    return Y

def process_label_num(texts):  
    leveldict0 = {"A1": 0, "A2": 1, "B1": 2, "B2": 3, "C1": 4, "C2": 5}
    Y = texts[["label"]]
    Y["label"] = Y["label"].apply(lambda x: leveldict0[x])
    return Y

def process_label_smcat(texts):
    leveldict1 = {"A1": [0, 0, 0, 0, 0], "A2": [1, 0, 0, 0, 0], "B1": [1, 1, 0, 0, 0], "B2": [1, 1, 1, 0, 0], "C1": [1, 1, 1, 1, 0], "C2": [1, 1, 1, 1, 1]}
    Y = texts["label"].apply(lambda x: leveldict1[x]).apply(pd.Series)
    Y.columns = [">=A2",">=B1", ">=B2", ">=C1", "C2"]
    return Y

def train_test_index_split(df, frac=0.8):
    train_index = list(df.sample(frac=frac).index)
    test_index = list(set(df.index)-set(train_index))
    return train_index, test_index

texts = pd.read_csv("data/texts.csv")
X = process_text(texts)
#Y_cat = process_label_cat(texts)
Y_num = process_label_num(texts)
Y_multbin = process_label_smcat(texts)
train_index, test_index = train_test_index_split(texts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y["label"] = Y["label"].apply(lambda x: leveldict0[x])


Training & testing

In [22]:

def split_data(X, y, train_index, test_index):
    """Splits data into features and targets training and test sets.

    Args:
        data: Data containing features and target.
        parameters: Parameters defined in parameters/data_science.yml.
    Returns:
        Split data.
    """

    X_train = X[X.index.isin(train_index)]
    X_test = X[X.index.isin(test_index)]
    y_train = y[y.index.isin(train_index)]
    y_test = y[y.index.isin(test_index)]
    return X_train, X_test, y_train, y_test


def train_classifier(X_train, y_train):
    """Trains the linear regression model.

    Args:
        X_train: Training data of independent features.
        y_train: Training data for price.

    Returns:
        Trained model.
    """
    xgbc = xgb.XGBClassifier()
    param_grid={"colsample_bylevel":[1,0.7],"colsample_bytree":[1,0.8,0.7],"subsample":[1,0.8,0.7,0.5],"learning_rate":[0.01,0.02,0.05,0.1],"gamma":[0,1,10],"reg_lambda":[1,4,10],
            "max_delta_step":[0,1,10],"max_depth":[6,8,10,12],"min_child_weight":[1,3,5],"n_estimators":[10,50,100]}
    random_search = RandomizedSearchCV(estimator = xgbc, param_distributions = param_grid, n_iter = 2, cv = 5, verbose=2, n_jobs = -1,scoring="accuracy")
    random_search.fit(X_train, y_train)
    xgbc.set_params(**random_search.best_params_)
    xgbc.fit(X_train, y_train)
    return xgbc

def train_classifier_advanced(X_train,y_train):
    model = xgb.XGBClassifier()

    xgbc_BS_complete = {
        'learning_rate':Real(0.001,1,"log-uniform"),
        'max_depth': Integer(1, 20,'uniform'),
        'gamma': (1, 100, 'uniform'),
        'subsample': Real(0.01, 0.99, 'uniform'),
        'colsample_bytree': Real(0.01, 0.99, 'uniform'),
        'reg_alpha': Real(1, 100, 'uniform'),
        'n_estimators': Integer(50, 500,'log-uniform'),
        'scale_pos_weight': Real(0.5, 2, 'uniform'),
        'max_delta_step': Real(0, 10),
        'colsample_bylevel': Real(0.01, 1.0, 'uniform'),
        'reg_lambda': Real(1e-9, 100, 'log-uniform'),
        'scale_pos_weight': Real(0.2, 4,"log-uniform")
    }


    search = BayesSearchCV(model, xgbc_BS_complete, n_iter=100, # specify how many iterations
        scoring="accuracy", n_jobs=-1, cv=5,verbose=60)
    search.fit(X_train,y_train)
    model.set_params(**search.best_params_)
    model.fit(X_train,y_train)
    return model

def train_multi_class_bin(X_train, y_train):
    list_xgbc = []
    for column in y_train.columns:
        xgbc = train_classifier_advanced(X_train,y_train[column])
        list_xgbc.append(xgbc)
    return list_xgbc

def train_regressor(X_train, y_train):
    """Trains the linear regression model.

    Args:
        X_train: Training data of independent features.
        y_train: Training data for price.

    Returns:
        Trained model.
    """
    xgbr = xgb.XGBRegressor()
    param_grid={"colsample_bylevel":[1,0.7],"colsample_bytree":[1,0.8,0.7],"subsample":[1,0.8,0.7,0.5],"learning_rate":[0.01,0.02,0.05,0.1],"gamma":[0,1,10],"reg_lambda":[1,4,10],
           "max_delta_step":[0,1,10],"max_depth":[6,8,10,12],"scale_pos_weight":[1,3,5,10],"min_child_weight":[1,3,5],"n_estimators":[50,200,100]}
    random_search = RandomizedSearchCV(estimator = xgbr, param_distributions = param_grid, n_iter = 2, cv = 5, verbose=2, random_state=34, n_jobs = -1,scoring="neg_mean_squared_error")
    random_search.fit(X_train, y_train)
    xgbr.set_params(**random_search.best_params_)
    xgbr.fit(X_train, y_train)
    return xgbr

def train_regressor_advanced(X_train,y_train):
    model = xgb.XGBRegressor()

    xgbc_BS_complete = {
        'learning_rate':Real(0.001,1,"log-uniform"),
        'max_depth': Integer(1, 20,'uniform'),
        'gamma': (1, 100, 'uniform'),
        'subsample': Real(0.01, 0.99, 'uniform'),
        'colsample_bytree': Real(0.01, 0.99, 'uniform'),
        'reg_alpha': Real(1, 100, 'uniform'),
        'n_estimators': Integer(50, 500,'log-uniform'),
        'max_delta_step': Real(0, 10),
        'colsample_bylevel': Real(0.01, 1.0, 'uniform'),
        'reg_lambda': Real(1e-9, 100, 'log-uniform'),
        'scale_pos_weight': Real(0.2, 4,"log-uniform")
    }


    search = BayesSearchCV(model, xgbc_BS_complete, n_iter=100, # specify how many iterations
        scoring="neg_mean_squared_error", n_jobs=-1, cv=5,verbose=60)
    search.fit(X_train,y_train)
    model.set_params(**search.best_params_)
    model.fit(X_train,y_train)
    return model

def confussion_matrix(pred_te, y_test):
    A = pd.DataFrame(pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test.values.squeeze()}).groupby("Predicted_values").Real_values.value_counts())
    A.columns = ["values"]
    A.reset_index(inplace=True)
    return A.pivot(index="Predicted_values",columns="Real_values",values="values").fillna(0)

def evaluate_model(
    model, X_test, y_test, option = "normal"
):
    """Calculates and logs the coefficient of determination.

    Args:
        regressor: Trained model.
        X_test: Testing data of independent features.
        y_test: Testing data for price.
    """
    
    if option == "multi":
        output_multibin = []
        for model_i in model:
            output_multibin.append(model_i.predict(X_test))
        y_pred = pd.DataFrame(output_multibin).T.sum(axis=1).to_list()
    else:
        y_pred = model.predict(X_test)
    if type(y_pred[0]) != str:
        dicti = {0:"A1",1:"A2",2:"B1",3:"B2",4:"C1",5:"C2"}
        y_pred_corr = []
        for i in y_pred:
            y_pred_corr.append(dicti[int(round(i,0))])
        y_pred = y_pred_corr
    cmatrix = confussion_matrix(y_pred, y_test.applymap(lambda x: dicti[x]))
    number_levels = len(cmatrix)
    sumdiagonal = 0
    sumnextdiag = 0

    for i in range(number_levels):
        sumdiagonal += cmatrix.iloc[i,i]
        if i<number_levels-1:
            sumnextdiag += cmatrix.iloc[i,i+1]+cmatrix.iloc[i+1,i]

    num_records = len(y_pred)
    acc = sumdiagonal/num_records
    acc_relax= (sumdiagonal+sumnextdiag) / num_records

    print("Model has an accuracy of %.3f on test data.", acc)
    print("Model has a relaxed accuracy of %.3f on test data.", acc_relax)
    display(cmatrix)
    return cmatrix








In [4]:
X_train, X_test, y_num_train, y_num_test = split_data(X, Y_num, train_index, test_index)
#X_cat_train, X_cat_test, y_cat_train, y_cat_test = split_data(X, Y_cat, train_index, test_index)
#xgbc = train_classifier(X_cat_train, y_cat_train)
xgbr = train_regressor_advanced(X_train, y_num_train)
#ccat = evaluate_model(xgbc, X_cat_test, y_cat_test)
cnum = evaluate_model(xgbr, X_test, y_num_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,53.0,13.0,3.0,0.0,0.0,0.0
A2,11.0,28.0,3.0,3.0,0.0,0.0
B1,0.0,21.0,17.0,5.0,1.0,0.0
B2,0.0,0.0,14.0,30.0,6.0,1.0
C1,0.0,0.0,1.0,10.0,34.0,13.0
C2,0.0,0.0,0.0,3.0,5.0,24.0


In [5]:
xgbc = train_classifier_advanced(X_train, y_num_train)
ccat = evaluate_model(xgbc, X_test, y_num_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,51.0,10.0,2.0,1.0,0.0,0.0
A2,13.0,49.0,6.0,1.0,0.0,0.0
B1,0.0,3.0,14.0,5.0,1.0,1.0
B2,0.0,0.0,15.0,29.0,9.0,0.0
C1,0.0,0.0,1.0,12.0,29.0,11.0
C2,0.0,0.0,0.0,3.0,7.0,26.0


In [6]:
X_train, X_test, y_multbin_train, y_multbin_test = split_data(X, Y_multbin, train_index, test_index)
xgbcbs = train_multi_class_bin(X_train, y_multbin_train)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [28]:
299*(0.62207 - 0.65551839)

-10.001068609999999

In [29]:
output_multibin = []
for model_i in xgbcbs:
    output_multibin.append(model_i.predict(X_test))
y_pred = pd.DataFrame(output_multibin).T

In [41]:
y_pred.median()

0    1.0
1    1.0
2    0.0
3    0.0
4    0.0
dtype: float64

In [33]:
# coherence control
y_pred.T.diff().min().value_counts()

-1.0    186
 0.0    113
dtype: int64

In [35]:
model00 = xgbcbs[0]

In [37]:
pd.DataFrame({"column":model00.feature_names_in_,"FI":model00.feature_importances_})

Unnamed: 0,column,FI
0,ADJ_0,0.01198
1,ADP_0,0.011215
2,ADV_0,0.012671
3,AUX_0,0.014126
4,CCONJ_0,0.016021
5,DET_0,0.01198
6,INTJ_0,0.019194
7,NOUN_0,0.014061
8,NUM_0,0.010222
9,PART_0,0.011179


In [23]:
ccatmb = evaluate_model(xgbcbs, X_test, y_num_test,"multi")


Model has an accuracy of %.3f on test data. 0.6555183946488294
Model has a relaxed accuracy of %.3f on test data. 0.9665551839464883


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,63.0,16.0,4.0,1.0,0.0,0.0
A2,1.0,43.0,3.0,1.0,0.0,0.0
B1,0.0,3.0,13.0,7.0,1.0,0.0
B2,0.0,0.0,17.0,27.0,9.0,1.0
C1,0.0,0.0,1.0,14.0,29.0,16.0
C2,0.0,0.0,0.0,1.0,7.0,21.0


In [39]:
model05 = xgbcbs[-1]

In [40]:
model05.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 0.01,
 'colsample_bynode': 1,
 'colsample_bytree': 0.99,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 1,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 1.0,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 2.356844043412667,
 'max_depth': 20,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 255,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 1.0,
 'reg_lambda': 1e-09,
 'sampling_method': 'uniform',
 'scale_pos_weight': 0.4652387220146635,
 'subsample': 0.99,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [None]:
{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1.0,
 'colsample_bynode': 1,
 'colsample_bytree': 0.99,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 1,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.001,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0.0,
 'max_depth': 4,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 500,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 1.0,
 'reg_lambda': 3.8914975603238214e-08,
 'sampling_method': 'uniform',
 'scale_pos_weight': 0.6329065406162958,
 'subsample': 0.45327216927197117,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [38]:
model00.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1.0,
 'colsample_bynode': 1,
 'colsample_bytree': 0.99,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 1,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.001,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0.0,
 'max_depth': 4,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 500,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 1.0,
 'reg_lambda': 3.8914975603238214e-08,
 'sampling_method': 'uniform',
 'scale_pos_weight': 0.6329065406162958,
 'subsample': 0.45327216927197117,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [19]:
y_num_test.value_counts(normalize=True)

label
0        0.214047
1        0.207358
3        0.170569
4        0.153846
2        0.127090
5        0.127090
dtype: float64

In [20]:
#.apply(lambda x: ["A1","A2","B1","B2","C1","C2"][x])

0      4
1      4
2      4
3      2
4      4
      ..
294    4
295    5
296    5
297    4
298    5
Length: 299, dtype: int64

In [None]:
pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test}).groupby("Predicted_values").Real_values.value_counts()
pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test}).groupby("Predicted_values").Real_values.value_counts()

In [15]:
y_pred

array(['B2', 'B2', 'C1', 'B2', 'B2', 'C1', 'A2', 'B2', 'B2', 'B2', 'B2',
       'B2', 'A2', 'B2', 'C1', 'C1', 'B2', 'B2', 'A2', 'A2', 'B2', 'B2',
       'C1', 'B2', 'C1', 'C1', 'B1', 'B2', 'C2', 'C1', 'B2', 'B2', 'C1',
       'B2', 'C1', 'B2', 'C2', 'B2', 'A2', 'C1', 'C1', 'B2', 'C1', 'B2',
       'B2', 'B2', 'B2', 'B1', 'C1', 'C1', 'B2', 'C1', 'B2', 'B2', 'A2',
       'B2', 'B2', 'B2', 'B2', 'B2', 'C1', 'B2', 'C1', 'A2', 'A1', 'A1',
       'A2', 'A1', 'A2', 'A2', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2', 'A2',
       'A2', 'A1', 'A2', 'B1', 'A2', 'A2', 'A2', 'A2', 'A1', 'A2', 'A2',
       'A2', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2',
       'A2', 'A2', 'A2', 'A2', 'A2', 'A1', 'A1', 'A2', 'A2', 'A1', 'A2',
       'A2', 'A2', 'A2', 'A1', 'A2', 'A2', 'B2', 'B2', 'C1', 'B2', 'C1',
       'B2', 'C2', 'C1', 'C2', 'C1', 'B2', 'C2', 'C1', 'C1', 'C1', 'C1',
       'C1', 'C1', 'B2', 'C2', 'C1', 'C1', 'B2', 'C1', 'C2', 'B2', 'B1',
       'B2', 'C2', 'C1', 'C2', 'C1', 'C1', 'C1', 'C

In [18]:
len(y_cat_test)

299

In [21]:
y_cat_test.values

array([['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['A2'],
       ['A2'],
       ['A2'],
       ['A

In [23]:
pd.DataFrame({"Predicted_values":y_pred,"Real_values":y_cat_test.values.squeeze()})

Unnamed: 0,Predicted_values,Real_values
0,B2,B2
1,B2,B2
2,C1,B2
3,B2,B2
4,B2,B2
...,...,...
294,C1,C2
295,C1,C2
296,C2,C2
297,C1,C2


In [16]:
confussion_matrix(y_pred, list(y_cat_test))

ValueError: All arrays must be of the same length

In [12]:
y_cat_test

Unnamed: 0,label
4,B2
16,B2
26,B2
28,B2
37,B2
...,...
1457,C2
1480,C2
1483,C2
1488,C2


In [9]:
y_pred

array(['B2', 'B2', 'C1', 'B2', 'B2', 'C1', 'A2', 'B2', 'B2', 'B2', 'B2',
       'B2', 'A2', 'B2', 'C1', 'C1', 'B2', 'B2', 'A2', 'A2', 'B2', 'B2',
       'C1', 'B2', 'C1', 'C1', 'B1', 'B2', 'C2', 'C1', 'B2', 'B2', 'C1',
       'B2', 'C1', 'B2', 'C2', 'B2', 'A2', 'C1', 'C1', 'B2', 'C1', 'B2',
       'B2', 'B2', 'B2', 'B1', 'C1', 'C1', 'B2', 'C1', 'B2', 'B2', 'A2',
       'B2', 'B2', 'B2', 'B2', 'B2', 'C1', 'B2', 'C1', 'A2', 'A1', 'A1',
       'A2', 'A1', 'A2', 'A2', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2', 'A2',
       'A2', 'A1', 'A2', 'B1', 'A2', 'A2', 'A2', 'A2', 'A1', 'A2', 'A2',
       'A2', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2',
       'A2', 'A2', 'A2', 'A2', 'A2', 'A1', 'A1', 'A2', 'A2', 'A1', 'A2',
       'A2', 'A2', 'A2', 'A1', 'A2', 'A2', 'B2', 'B2', 'C1', 'B2', 'C1',
       'B2', 'C2', 'C1', 'C2', 'C1', 'B2', 'C2', 'C1', 'C1', 'C1', 'C1',
       'C1', 'C1', 'B2', 'C2', 'C1', 'C1', 'B2', 'C1', 'C2', 'B2', 'B1',
       'B2', 'C2', 'C1', 'C2', 'C1', 'C1', 'C1', 'C

In [None]:
order = ["A1","A2","B1","B2","C1","C2"]

In [None]:
ax = sns.countplot(x="label", data=texts, order = order)

In [None]:
texts.loc[7,"text"]

In [None]:
texts["num_characters"] = texts["text"].apply(len)

In [None]:
sns.boxplot(x="label",y="num_characters",data=texts,order=order)

In [None]:
nlp = spacy.load("en_core_web_lg")

In [None]:
nlp = spacy.load("en_core_web_lg")
texts["text_nlp"] = texts["text"].apply(nlp)

# One text: analysis

In [None]:
def falasomma(x,y):
    return x+otraope(y)

def otraope(x):
    return x*x
abi = falasomma(1,2)

abi

In [None]:
leveldict0 = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}
leveldict1 = {"A1": [0, 0, 0, 0, 0], "A2": [1, 0, 0, 0, 0], "B1": [1, 1, 0, 0, 0], "B2": [1, 1, 1, 0, 0], "C1": [1, 1, 1, 1, 0], "C2": [1, 1, 1, 1, 1]}
Y_cat = texts["label"]
Y_number=texts["label"].apply(lambda x: leveldict0[x])
Y_smcat = texts["label"].apply(lambda x: leveldict1[x]).apply(pd.Series)
Y_smcat.columns = [">=A2",">=B1",">=B2",">=C1","C2"]
Y_number.to_pickle("../data/05_model_input/Y_number.pkl")
Y_smcat.to_pickle("../data/05_model_input/Y_smcat.pkl")

In [None]:
#feature and label engineering

def freqfin(word, lang):
    result = -np.log(word_frequency(word, lang))
    if str(result)=="inf":
        return 0
    return result

leaveout = []#["X","SPACE", "SYM", "PUNCT"]
def wordrarity(doc):
    firsttext = pd.Series(doc)
    wordpos = pd.DataFrame({'word':firsttext,'pos':firsttext.apply(lambda x: x.pos_)})
    wordpos = wordpos[wordpos["pos"].isin(leaveout)==False]
    wordpos["word"] = wordpos["word"].apply(lambda x: x.text.lower())
    wordpos["freq"] = wordpos["word"].apply(lambda x: freqfin(x,"en"))
    summary = wordpos.groupby("pos").agg({"freq":[lambda x: np.percentile(x,q=60),lambda x: np.percentile(x,q=85)]}).T.reset_index(drop=True)
    features = {}
    for i in summary.index:
        dictrow = summary.loc[i].to_dict()
        processed = {entry+"_"+str(i):dictrow[entry] for entry in dictrow}
        features.update(processed)
    return features

def aux_sentence_complexity(sentence):
    dicti = {'LEN':0,'CCONJ':0, 'SCONJ':0, 'AUX':0, 'VERB':0, 'VARIETY':0}

    poses = []
    for token in sentence:
        if token.pos_ not in leaveout: #added later
            poses.append(token.pos_)
            if token.pos_ in dicti:
                dicti[token.pos_] += 1
    dicti["LEN"] = len(sentence)
    dicti["VARIETY"] = len(set(poses))
    return dicti

def sentence_complexity(doc):
    series_sentences = pd.Series(doc.sents)
    summarysentences = series_sentences.apply(aux_sentence_complexity).apply(pd.Series).sort_values(["LEN","SCONJ","AUX","CCONJ","VERB"])
    reduced = summarysentences[summarysentences['LEN']>=5]
    features = reduced.iloc[-len(reduced)//5:].mean().to_dict()
    features["NR_SENT"] = len(series_sentences)
    return features

def extract_features(doc):
    f = wordrarity(doc)
    f.update(sentence_complexity(doc))
    f.update({"NR_WORDS":len(doc)})
    return f

processed_docs=[]
i=0
for doc in list(texts["text_nlp"]):
    i+=1
    if i%100 == 0:
        print(i)
    processed_docs.append(extract_features(doc))
X = pd.DataFrame(processed_docs)
Y_cat = texts["label"]


Y_cat.to_pickle("../data/05_model_input/Y_cat.pkl")

X.to_pickle("../data/05_model_input/X.pkl")
X_train, X_test, y_train, y_test = train_test_split(X, Y_cat, test_size=0.2)
X_train.to_pickle("../data/05_model_input/X_tr.pkl")
X_test.to_pickle("../data/05_model_input/X_te.pkl")
y_train.to_pickle("../data/05_model_input/Y_cat_tr.pkl")
y_test.to_pickle("../data/05_model_input/Y_cat_te.pkl")

In [None]:
def train_test_index_split(df, frac=0.8):
    train_index = list(df.sample(frac=frac).index))
    test_index = list(set(df.index)-set(train_index))
    return train_index, test_index

In [None]:
#traininig
xgbc = xgb.XGBClassifier()
param_grid={"colsample_bylevel":[1,0.7],"colsample_bytree":[1,0.8,0.7],"subsample":[1,0.8,0.7,0.5],"learning_rate":[0.01,0.02,0.05,0.1],"gamma":[0,1,10],"reg_lambda":[1,4,10],
           "max_delta_step":[0,1,10],"max_depth":[6,8,10,12],"min_child_weight":[1,3,5],"n_estimators":[10,50,100]}
random_search = RandomizedSearchCV(estimator = xgbc, param_distributions = param_grid, n_iter = 100, cv = 5, verbose=2,  n_jobs = -1,scoring="accuracy")
random_search.fit(X_train, y_train)
xgbc.set_params(**random_search.best_params_)
xgbc.fit(X_train, y_train)

In [None]:
#predicting
predictions_tr = xgbc.predict(X_train)
predictions_te = xgbc.predict(X_test)
pred_tr = pd.Series(predictions_tr, index=X_train.index)
pred_tr.to_pickle("../data/07_model_output/P_cat_tr.pkl")
pred_te = pd.Series(predictions_te, index=X_test.index)
pred_te.to_pickle("../data/07_model_output/P_cat_te.pkl")

In [None]:
hello = sum
hello([1,2])

In [None]:
#reporting
A = pd.DataFrame(pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test}).groupby("Predicted_values").Real_values.value_counts())
A.columns = ["values"]
A.reset_index(inplace=True)
display(A.pivot(index="Predicted_values",columns="Real_values",values="values").fillna(0))

In [None]:
def confusion_matrix_df(predictions, y_test):
    A = pd.DataFrame(pd.DataFrame({"Predicted_values":predictions,"Real_values":y_test}).groupby("Predicted_values").Real_values.value_counts())
    A.columns = ["values"]
    A.reset_index(inplace=True)
    display(A.pivot(index="Predicted_values",columns="Real_values",values="values"))
    
def train_test_class(model,X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    # Use the model on the train data
    predictions_tr = model.predict(X_train)
    # Calculate the accuracy
    errors_tr = (predictions_tr == y_train)

    # Print out the mean squared error
    print('Training Accuracy:', round(np.mean(errors_tr), 2))

    # Use the model on the test data
    predictions = model.predict(X_test)
    # Calculate the accuracy
    errors = (predictions == y_test)
    print('Test Accuracy:', round(np.mean(errors), 2))
    
    confusion_matrix_df(predictions, y_test)
    

In [None]:
y_train.columns

In [None]:
train_test_class(xgbc, X_train, y_train['>=A2'], X_test, y_test['>=A2'])


In [None]:
train_test_class(xgbc, X_train, y_train['>=B1'], X_test, y_test['>=B1'])


In [None]:
train_test_class(xgbc, X_train, y_train['>=B2'], X_test, y_test['>=B2'])

In [None]:
train_test_class(xgbc, X_train, y_train['>=C1'], X_test, y_test['>=C1'])

In [None]:
train_test_class(xgbc, X_train, y_train['C2'], X_test, y_test['C2'])

In [None]:
xgbA2 = xgb.XGBClassifier()
xgbA2.set_params(**random_search.best_params_)

xgbB1 = xgb.XGBClassifier()
xgbB1.set_params(**random_search.best_params_)

xgbB2 = xgb.XGBClassifier()
xgbB2.set_params(**random_search.best_params_)

xgbC1 = xgb.XGBClassifier()
xgbC1.set_params(**random_search.best_params_)

xgbC2 = xgb.XGBClassifier()
xgbC2.set_params(**random_search.best_params_)

models = [xgbA2, xgbB1, xgbB2, xgbC1, xgbC2]
predictions = []
for i in range(len(models)):
    model = models[i]
    model.set_params(**random_search.best_params_)
    model.fit(X_train,y_train.iloc[:,i])
    predictions.append(model.predict(X_test))

In [None]:
real = y_test.to_numpy()

In [None]:
predicted = pd.DataFrame(predictions).T.to_numpy()

In [None]:
real

In [None]:
sum(sum(abs(real-predicted)))

In [None]:
109 livelli sbagliati

In [None]:
pd.Series((real-predicted).sum(axis=1)).value_counts()

In [None]:
pd.DataFrame(predicted).T.diff().T.describe()

In [None]:
from matplotlib import pyplot

In [None]:
pd.DataFrame({"features":X.columns,"importance":xgbc.feature_importances_}).sort_values("importance")

In [None]:
pyplot.bar(X.columns, xgbc.feature_importances_)

In [None]:
texts

In [None]:
extract_features(doc)

In [None]:
for i in doc.sents:

In [None]:
3/(1/8+1/9+1/1000)

In [None]:
np.sqrt((8**2+9**2+1000**2)/3)

In [None]:
word_frequency("'d","en")

In [None]:
'CCONJ', 'SCONJ', 'AUX', 'VERB'

In [None]:
spacy.explain('PROPN')

In [None]:
for i in doc.sents:
    print("------")
    print("->"+i.text)

In [None]:
firsttext = pd.Series(texts["text_nlp"].iloc[0])

In [None]:
wordpos[wordpos["pos"]=="PRON"].drop_duplicates(subset=["word"])

In [None]:
firsttext

In [None]:
len(firsttext)

In [None]:
total

In [None]:
summary.loc[i].to_dict()

In [None]:
for pos in wordpos["pos"].unique():
    aux = wordpos[wordpos["pos"]==pos]
    aux["word"] = aux["word"].apply(lambda x: x.text.lower())
    display(aux.drop_duplicates().iloc[:5])

In [None]:
freqfin("aòlkejòlqjroqiejr","en")

In [None]:
import spacy
from sense2vec import Sense2VecComponent

nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")

In [None]:
def extractfeatures_text(text):
    