External libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
from wordfreq import word_frequency
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import fbeta_score,make_scorer,accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from datetime import timedelta

%matplotlib inline

Feature & label engineering

In [2]:
leaveout = []#["X","SPACE", "SYM", "PUNCT"]


def freqfin(word, lang):
    result = -np.log(word_frequency(word, lang))
    if str(result)=="inf":
        return 0
    return result

def wordrarity(doc):
    firsttext = pd.Series(doc)
    wordpos = pd.DataFrame({'word':firsttext,'pos':firsttext.apply(lambda x: x.pos_)})
    wordpos = wordpos[wordpos["pos"].isin(leaveout)==False]
    wordpos["word"] = wordpos["word"].apply(lambda x: x.text.lower())
    wordpos["freq"] = wordpos["word"].apply(lambda x: freqfin(x,"en"))
    summary = wordpos.groupby("pos").agg({"freq":[lambda x: np.percentile(x,q=60),lambda x: np.percentile(x,q=85)]}).T.reset_index(drop=True)
    features = {}
    for i in summary.index:
        dictrow = summary.loc[i].to_dict()
        processed = {entry+"_"+str(i):dictrow[entry] for entry in dictrow}
        features.update(processed)
    return features

def aux_sentence_complexity(sentence):
    dicti = {'LEN':0,'CCONJ':0, 'SCONJ':0, 'AUX':0, 'VERB':0, 'VARIETY':0}

    poses = []
    for token in sentence:
        if token.pos_ not in leaveout: #added later
            poses.append(token.pos_)
            if token.pos_ in dicti:
                dicti[token.pos_] += 1
    dicti["LEN"] = len(sentence)
    dicti["VARIETY"] = len(set(poses))
    return dicti

def sentence_complexity(doc):
    series_sentences = pd.Series(doc.sents)
    summarysentences = series_sentences.apply(aux_sentence_complexity).apply(pd.Series).sort_values(["LEN","SCONJ","AUX","CCONJ","VERB"])
    reduced = summarysentences[summarysentences['LEN']>=5]
    features = reduced.iloc[-len(reduced)//5:].mean().to_dict()
    features["NR_SENT"] = len(series_sentences)
    return features

def extract_features(doc):
    f = wordrarity(doc)
    f.update(sentence_complexity(doc))
    f.update({"NR_WORDS":len(doc)})
    return f

def process_text(texts):
    nlp = spacy.load("en_core_web_lg")
    texts["text_nlp"] = texts["text"].apply(nlp)
    processed_docs=[]
    for doc in list(texts["text_nlp"]):
        processed_docs.append(extract_features(doc))
    X = pd.DataFrame(processed_docs)
    return X

def process_label_cat(texts):
    Y = texts[["label"]]
    return Y

def process_label_num(texts):  
    leveldict0 = {"A1": 0, "A2": 1, "B1": 2, "B2": 3, "C1": 4, "C2": 5}
    Y = texts[["label"]]
    Y["label"] = Y["label"].apply(lambda x: leveldict0[x])
    return Y

def process_label_smcat(texts):
    leveldict1 = {"A1": [0, 0, 0, 0, 0], "A2": [1, 0, 0, 0, 0], "B1": [1, 1, 0, 0, 0], "B2": [1, 1, 1, 0, 0], "C1": [1, 1, 1, 1, 0], "C2": [1, 1, 1, 1, 1]}
    Y = texts["label"].apply(lambda x: leveldict1[x]).apply(pd.Series)
    Y.columns = [">=A2",">=B1", ">=B2", ">=C1", "C2"]
    return Y

def train_test_index_split(df, frac=0.8):
    train_index = list(df.sample(frac=frac).index)
    test_index = list(set(df.index)-set(train_index))
    return train_index, test_index

texts = pd.read_csv("data/texts.csv")
X = process_text(texts)
Y_cat = process_label_cat(texts)
Y_num = process_label_num(texts)
train_index, test_index = train_test_index_split(texts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y["label"] = Y["label"].apply(lambda x: leveldict0[x])


Training & testing

In [6]:
import logging
from typing import Dict, Tuple
import xgboost as xgb
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV



def split_data(X, y, train_index, test_index):
    """Splits data into features and targets training and test sets.

    Args:
        data: Data containing features and target.
        parameters: Parameters defined in parameters/data_science.yml.
    Returns:
        Split data.
    """

    X_train = X[X.index.isin(train_index)]
    X_test = X[X.index.isin(test_index)]
    y_train = y[y.index.isin(train_index)]
    y_test = y[y.index.isin(test_index)]
    return X_train, X_test, y_train, y_test


def train_classifier(X_train, y_train):
    """Trains the linear regression model.

    Args:
        X_train: Training data of independent features.
        y_train: Training data for price.

    Returns:
        Trained model.
    """
    xgbc = xgb.XGBClassifier()
    param_grid={"colsample_bylevel":[1,0.7],"colsample_bytree":[1,0.8,0.7],"subsample":[1,0.8,0.7,0.5],"learning_rate":[0.01,0.02,0.05,0.1],"gamma":[0,1,10],"reg_lambda":[1,4,10],
            "max_delta_step":[0,1,10],"max_depth":[6,8,10,12],"min_child_weight":[1,3,5],"n_estimators":[10,50,100]}
    random_search = RandomizedSearchCV(estimator = xgbc, param_distributions = param_grid, n_iter = 2, cv = 5, verbose=2, random_state=34, n_jobs = -1,scoring="accuracy")
    random_search.fit(X_train, y_train)
    xgbc.set_params(**random_search.best_params_)
    xgbc.fit(X_train, y_train)
    return xgbc

def train_regressor(X_train, y_train):
    """Trains the linear regression model.

    Args:
        X_train: Training data of independent features.
        y_train: Training data for price.

    Returns:
        Trained model.
    """
    xgbr = xgb.XGBRegressor()
    param_grid={"colsample_bylevel":[1,0.7],"colsample_bytree":[1,0.8,0.7],"subsample":[1,0.8,0.7,0.5],"learning_rate":[0.01,0.02,0.05,0.1],"gamma":[0,1,10],"reg_lambda":[1,4,10],
           "max_delta_step":[0,1,10],"max_depth":[6,8,10,12],"scale_pos_weight":[1,3,5,10],"min_child_weight":[1,3,5],"n_estimators":[50,200,100]}
    random_search = RandomizedSearchCV(estimator = xgbr, param_distributions = param_grid, n_iter = 2, cv = 5, verbose=2, random_state=34, n_jobs = -1,scoring="neg_mean_squared_error")
    random_search.fit(X_train, y_train)
    xgbr.set_params(**random_search.best_params_)
    xgbr.fit(X_train, y_train)
    return xgbr

def confussion_matrix(pred_te, y_test):
    A = pd.DataFrame(pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test.values.squeeze()}).groupby("Predicted_values").Real_values.value_counts())
    A.columns = ["values"]
    A.reset_index(inplace=True)
    return A.pivot(index="Predicted_values",columns="Real_values",values="values").fillna(0)

def evaluate_model(
    model, X_test, y_test
):
    """Calculates and logs the coefficient of determination.

    Args:
        regressor: Trained model.
        X_test: Testing data of independent features.
        y_test: Testing data for price.
    """
    global y_pred, number_levels
    y_pred = model.predict(X_test)
    mode = "class:"
    if type(y_pred[0]) != str:
        dicti = {0:"A1",1:"A2",2:"B1",3:"B2",4:"C1",5:"C2"}
        y_pred_corr = []
        mode = "false_reg:"
        for i in y_pred:
            y_pred_corr.append(dicti[int(round(i,0))])
        y_pred = y_pred_corr
    cmatrix = confussion_matrix(y_pred, y_test.applymap(lambda x: dicti[x]))
    number_levels = len(cmatrix)
    sumdiagonal = 0
    sumnextdiag = 0

    for i in range(number_levels):
        sumdiagonal += cmatrix.iloc[i,i]
        if i<number_levels-1:
            sumnextdiag += cmatrix.iloc[i,i+1]+cmatrix.iloc[i+1,i]

    num_records = len(y_pred)
    acc = sumdiagonal/num_records
    acc_relax= (sumdiagonal+sumnextdiag) / num_records

    print("Model has an accuracy of %.3f on test data.", acc)
    print("Model has a relaxed accuracy of %.3f on test data.", acc_relax)
    display(cmatrix)
    return cmatrix

X_train, X_test, y_num_train, y_num_test = split_data(X, Y_num, train_index, test_index)
#X_cat_train, X_cat_test, y_cat_train, y_cat_test = split_data(X, Y_cat, train_index, test_index)
#xgbc = train_classifier(X_cat_train, y_cat_train)
xgbr = train_regressor(X_train, y_num_train)
#ccat = evaluate_model(xgbc, X_cat_test, y_cat_test)
cnum = evaluate_model(xgbr, X_test, y_num_test)

X_train, X_test, y_cat_train, y_cat_test = split_data(X, Y_num, train_index, test_index)
xgbc = train_classifier(X_train, y_cat_train)
ccat = evaluate_model(xgbc, X_test, y_cat_test)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
false_reg:Model has an accuracy of %.3f on test data. 0.5317725752508361
false_reg:Model has a relaxed accuracy of %.3f on test data. 0.959866220735786


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,16.0,0.0,0.0,0.0,0.0,0.0
A2,46.0,44.0,8.0,4.0,0.0,0.0
B1,2.0,5.0,19.0,7.0,2.0,1.0
B2,0.0,1.0,15.0,35.0,13.0,1.0
C1,0.0,0.0,1.0,10.0,36.0,23.0
C2,0.0,0.0,0.0,0.0,1.0,9.0


Fitting 5 folds for each of 2 candidates, totalling 10 fits
false_reg:Model has an accuracy of %.3f on test data. 0.5819397993311036
false_reg:Model has a relaxed accuracy of %.3f on test data. 0.9431438127090301


Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,63.0,18.0,4.0,1.0,0.0,2.0
A2,1.0,30.0,12.0,5.0,0.0,0.0
B1,0.0,0.0,3.0,2.0,1.0,0.0
B2,0.0,1.0,24.0,33.0,14.0,1.0
C1,0.0,1.0,0.0,14.0,23.0,9.0
C2,0.0,0.0,0.0,1.0,14.0,22.0


In [8]:
xgbr.predict(X_train)

array([1.875126 , 3.5992644, 2.6668804, ..., 4.6195097, 4.224061 ,
       4.581091 ], dtype=float32)

In [7]:
xgbc.predict_proba(X_train)

array([[0.04560134, 0.14207296, 0.39194673, 0.2996362 , 0.08594877,
        0.03479397],
       [0.02148755, 0.05337627, 0.12441013, 0.3094248 , 0.40356258,
        0.08773868],
       [0.02595538, 0.18435776, 0.22235352, 0.43440247, 0.10044724,
        0.03248363],
       ...,
       [0.00891939, 0.01443953, 0.0401716 , 0.07737973, 0.15216301,
        0.70692676],
       [0.01843974, 0.03250111, 0.1049723 , 0.18703653, 0.3554739 ,
        0.30157647],
       [0.00829383, 0.01342683, 0.04415679, 0.07195279, 0.17318016,
        0.68898964]], dtype=float32)

In [None]:
xgbr.predict(X_test)

In [None]:
false_reg:Model has an accuracy of %.3f on test data. 0.5953177257525084
false_reg:Model has a relaxed accuracy of %.3f on test data. 0.9732441471571907


In [6]:
cnum

Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,53.0,25.0,1.0,0.0,0.0,0.0
A2,2.0,23.0,3.0,1.0,0.0,0.0
B1,1.0,10.0,18.0,10.0,2.0,0.0
B2,0.0,0.0,12.0,30.0,17.0,1.0
C1,0.0,0.0,2.0,10.0,38.0,23.0
C2,0.0,0.0,0.0,0.0,1.0,16.0


In [49]:
y_pred = xgbr.predict(X_num_test)
mode = "class:"
if type(y_pred[0]) != str:
    dicti = {1:"A1",2:"A2",3:"B1",4:"B2",5:"C1",6:"C2"}
    y_pred_corr = []
    mode = "false_reg:"
    for i in y_pred:
        y_pred_corr.append(dicti[int(round(i,0))])
    y_pred = y_pred_corr

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [50]:
y_pred

['B2',
 'B2',
 'C1',
 'B2',
 'B2',
 'B2',
 'B1',
 'B1',
 'B2',
 'B2',
 'B2',
 'B2',
 'B1',
 'B2',
 'C1',
 'B1',
 'B2',
 'B2',
 'B1',
 'B1',
 'B2',
 'B2',
 'C1',
 'B2',
 'B2',
 'B2',
 'B1',
 'B2',
 'C1',
 'C1',
 'B1',
 'B2',
 'C1',
 'B2',
 'C1',
 'B2',
 'C1',
 'B2',
 'B1',
 'C1',
 'B2',
 'B2',
 'C1',
 'B2',
 'B2',
 'B2',
 'B2',
 'B1',
 'C1',
 'B2',
 'B2',
 'B1',
 'B2',
 'B2',
 'B1',
 'B2',
 'B2',
 'B2',
 'C1',
 'B2',
 'C1',
 'B1',
 'C1',
 'A2',
 'A1',
 'A1',
 'A2',
 'A2',
 'B1',
 'B1',
 'A1',
 'A1',
 'A2',
 'B1',
 'A2',
 'B1',
 'A2',
 'A2',
 'A2',
 'A2',
 'B1',
 'B1',
 'A2',
 'A2',
 'A2',
 'A1',
 'A2',
 'A2',
 'A2',
 'A1',
 'A1',
 'A2',
 'A2',
 'A1',
 'A1',
 'A2',
 'B1',
 'B1',
 'A2',
 'B1',
 'A2',
 'A2',
 'A2',
 'B1',
 'A1',
 'A1',
 'A2',
 'A2',
 'A1',
 'B1',
 'A2',
 'B1',
 'A2',
 'A1',
 'A2',
 'A2',
 'C1',
 'C1',
 'C1',
 'B2',
 'C1',
 'B2',
 'C1',
 'C1',
 'C1',
 'C1',
 'C1',
 'C1',
 'C1',
 'C1',
 'C1',
 'C1',
 'C1',
 'B2',
 'C1',
 'C1',
 'C1',
 'C1',
 'B2',
 'C1',
 'C1',
 'B2',
 'B1',

In [47]:
ccat

Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,64.0,16.0,6.0,0.0,0.0,1.0
A2,1.0,36.0,7.0,6.0,0.0,0.0
B1,0.0,1.0,6.0,2.0,1.0,0.0
B2,0.0,0.0,14.0,35.0,11.0,0.0
C1,0.0,0.0,2.0,18.0,23.0,8.0
C2,0.0,0.0,0.0,2.0,13.0,26.0


In [43]:
cnum

Real_values,1,2,3,4,5,6
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,65.0,13.0,5.0,0.0,0.0,0.0
A2,0.0,28.0,4.0,0.0,0.0,0.0
B1,0.0,12.0,18.0,13.0,1.0,1.0
B2,0.0,0.0,7.0,36.0,9.0,1.0
C1,0.0,0.0,1.0,14.0,38.0,18.0
C2,0.0,0.0,0.0,0.0,0.0,15.0


In [45]:
65+28+18+36+38+15

200

In [46]:
65+28+18+36+38+15+18+9+13+4+13+0+18+7+14+0

296

In [41]:
xgbr.predict(X_num_test)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


array([4.019762 , 3.5520754, 4.8929687, 3.8410656, 3.8854365, 4.174892 ,
       3.073006 , 3.3436015, 3.7850728, 4.4400806, 4.05219  , 4.1647086,
       2.637621 , 3.6347573, 4.5147133, 3.1968532, 3.6442583, 4.128808 ,
       2.567329 , 2.8312657, 3.789647 , 4.3066998, 4.9055367, 4.1161075,
       4.407031 , 4.485958 , 3.2608342, 3.943362 , 5.0399075, 4.736501 ,
       3.480221 , 3.5151286, 5.2103925, 3.7027793, 4.5905123, 4.193363 ,
       4.7588835, 4.1631837, 2.8569012, 4.5859923, 3.6192968, 3.669119 ,
       4.793107 , 3.63866  , 3.6331265, 3.7982419, 3.6451964, 2.9319057,
       4.9782357, 3.90806  , 3.7199447, 2.6305895, 3.7799737, 4.0859795,
       2.85639  , 3.6605186, 4.176469 , 3.7923641, 4.566941 , 4.3464584,
       4.7250867, 3.4875956, 5.1383204, 1.7814556, 1.070656 , 1.0882567,
       2.3638237, 1.8670553, 2.7185795, 2.6288896, 1.2364964, 1.1282915,
       2.201621 , 2.974628 , 2.4302669, 2.9647653, 2.1627738, 2.1443772,
       1.5442822, 1.8875397, 2.6068308, 2.797445 , 

In [34]:
ccat

Real_values,A1,A2,B1,B2,C1,C2
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1,64.0,16.0,6.0,0.0,0.0,1.0
A2,1.0,36.0,7.0,6.0,0.0,0.0
B1,0.0,1.0,6.0,2.0,1.0,0.0
B2,0.0,0.0,14.0,35.0,11.0,0.0
C1,0.0,0.0,2.0,18.0,23.0,8.0
C2,0.0,0.0,0.0,2.0,13.0,26.0


In [33]:
cnum

Real_values,1,2,3,4,5,6
Predicted_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,64.0,16.0,6.0,0.0,0.0,1.0
2,1.0,36.0,7.0,6.0,0.0,0.0
3,0.0,1.0,6.0,2.0,1.0,0.0
4,0.0,0.0,14.0,35.0,11.0,0.0
5,0.0,0.0,2.0,18.0,23.0,8.0
6,0.0,0.0,0.0,2.0,13.0,26.0


In [None]:
pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test}).groupby("Predicted_values").Real_values.value_counts()
pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test}).groupby("Predicted_values").Real_values.value_counts()

In [15]:
y_pred

array(['B2', 'B2', 'C1', 'B2', 'B2', 'C1', 'A2', 'B2', 'B2', 'B2', 'B2',
       'B2', 'A2', 'B2', 'C1', 'C1', 'B2', 'B2', 'A2', 'A2', 'B2', 'B2',
       'C1', 'B2', 'C1', 'C1', 'B1', 'B2', 'C2', 'C1', 'B2', 'B2', 'C1',
       'B2', 'C1', 'B2', 'C2', 'B2', 'A2', 'C1', 'C1', 'B2', 'C1', 'B2',
       'B2', 'B2', 'B2', 'B1', 'C1', 'C1', 'B2', 'C1', 'B2', 'B2', 'A2',
       'B2', 'B2', 'B2', 'B2', 'B2', 'C1', 'B2', 'C1', 'A2', 'A1', 'A1',
       'A2', 'A1', 'A2', 'A2', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2', 'A2',
       'A2', 'A1', 'A2', 'B1', 'A2', 'A2', 'A2', 'A2', 'A1', 'A2', 'A2',
       'A2', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2',
       'A2', 'A2', 'A2', 'A2', 'A2', 'A1', 'A1', 'A2', 'A2', 'A1', 'A2',
       'A2', 'A2', 'A2', 'A1', 'A2', 'A2', 'B2', 'B2', 'C1', 'B2', 'C1',
       'B2', 'C2', 'C1', 'C2', 'C1', 'B2', 'C2', 'C1', 'C1', 'C1', 'C1',
       'C1', 'C1', 'B2', 'C2', 'C1', 'C1', 'B2', 'C1', 'C2', 'B2', 'B1',
       'B2', 'C2', 'C1', 'C2', 'C1', 'C1', 'C1', 'C

In [18]:
len(y_cat_test)

299

In [21]:
y_cat_test.values

array([['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['B2'],
       ['A2'],
       ['A2'],
       ['A2'],
       ['A

In [23]:
pd.DataFrame({"Predicted_values":y_pred,"Real_values":y_cat_test.values.squeeze()})

Unnamed: 0,Predicted_values,Real_values
0,B2,B2
1,B2,B2
2,C1,B2
3,B2,B2
4,B2,B2
...,...,...
294,C1,C2
295,C1,C2
296,C2,C2
297,C1,C2


In [16]:
confussion_matrix(y_pred, list(y_cat_test))

ValueError: All arrays must be of the same length

In [12]:
y_cat_test

Unnamed: 0,label
4,B2
16,B2
26,B2
28,B2
37,B2
...,...
1457,C2
1480,C2
1483,C2
1488,C2


In [9]:
y_pred

array(['B2', 'B2', 'C1', 'B2', 'B2', 'C1', 'A2', 'B2', 'B2', 'B2', 'B2',
       'B2', 'A2', 'B2', 'C1', 'C1', 'B2', 'B2', 'A2', 'A2', 'B2', 'B2',
       'C1', 'B2', 'C1', 'C1', 'B1', 'B2', 'C2', 'C1', 'B2', 'B2', 'C1',
       'B2', 'C1', 'B2', 'C2', 'B2', 'A2', 'C1', 'C1', 'B2', 'C1', 'B2',
       'B2', 'B2', 'B2', 'B1', 'C1', 'C1', 'B2', 'C1', 'B2', 'B2', 'A2',
       'B2', 'B2', 'B2', 'B2', 'B2', 'C1', 'B2', 'C1', 'A2', 'A1', 'A1',
       'A2', 'A1', 'A2', 'A2', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2', 'A2',
       'A2', 'A1', 'A2', 'B1', 'A2', 'A2', 'A2', 'A2', 'A1', 'A2', 'A2',
       'A2', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2',
       'A2', 'A2', 'A2', 'A2', 'A2', 'A1', 'A1', 'A2', 'A2', 'A1', 'A2',
       'A2', 'A2', 'A2', 'A1', 'A2', 'A2', 'B2', 'B2', 'C1', 'B2', 'C1',
       'B2', 'C2', 'C1', 'C2', 'C1', 'B2', 'C2', 'C1', 'C1', 'C1', 'C1',
       'C1', 'C1', 'B2', 'C2', 'C1', 'C1', 'B2', 'C1', 'C2', 'B2', 'B1',
       'B2', 'C2', 'C1', 'C2', 'C1', 'C1', 'C1', 'C

In [None]:
order = ["A1","A2","B1","B2","C1","C2"]

In [None]:
ax = sns.countplot(x="label", data=texts, order = order)

In [None]:
texts.loc[7,"text"]

In [None]:
texts["num_characters"] = texts["text"].apply(len)

In [None]:
sns.boxplot(x="label",y="num_characters",data=texts,order=order)

In [None]:
nlp = spacy.load("en_core_web_lg")

In [None]:
nlp = spacy.load("en_core_web_lg")
texts["text_nlp"] = texts["text"].apply(nlp)

# One text: analysis

In [None]:
def falasomma(x,y):
    return x+otraope(y)

def otraope(x):
    return x*x
abi = falasomma(1,2)

abi

In [None]:
leveldict0 = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}
leveldict1 = {"A1": [0, 0, 0, 0, 0], "A2": [1, 0, 0, 0, 0], "B1": [1, 1, 0, 0, 0], "B2": [1, 1, 1, 0, 0], "C1": [1, 1, 1, 1, 0], "C2": [1, 1, 1, 1, 1]}
Y_cat = texts["label"]
Y_number=texts["label"].apply(lambda x: leveldict0[x])
Y_smcat = texts["label"].apply(lambda x: leveldict1[x]).apply(pd.Series)
Y_smcat.columns = [">=A2",">=B1",">=B2",">=C1","C2"]
Y_number.to_pickle("../data/05_model_input/Y_number.pkl")
Y_smcat.to_pickle("../data/05_model_input/Y_smcat.pkl")

In [None]:
#feature and label engineering

def freqfin(word, lang):
    result = -np.log(word_frequency(word, lang))
    if str(result)=="inf":
        return 0
    return result

leaveout = []#["X","SPACE", "SYM", "PUNCT"]
def wordrarity(doc):
    firsttext = pd.Series(doc)
    wordpos = pd.DataFrame({'word':firsttext,'pos':firsttext.apply(lambda x: x.pos_)})
    wordpos = wordpos[wordpos["pos"].isin(leaveout)==False]
    wordpos["word"] = wordpos["word"].apply(lambda x: x.text.lower())
    wordpos["freq"] = wordpos["word"].apply(lambda x: freqfin(x,"en"))
    summary = wordpos.groupby("pos").agg({"freq":[lambda x: np.percentile(x,q=60),lambda x: np.percentile(x,q=85)]}).T.reset_index(drop=True)
    features = {}
    for i in summary.index:
        dictrow = summary.loc[i].to_dict()
        processed = {entry+"_"+str(i):dictrow[entry] for entry in dictrow}
        features.update(processed)
    return features

def aux_sentence_complexity(sentence):
    dicti = {'LEN':0,'CCONJ':0, 'SCONJ':0, 'AUX':0, 'VERB':0, 'VARIETY':0}

    poses = []
    for token in sentence:
        if token.pos_ not in leaveout: #added later
            poses.append(token.pos_)
            if token.pos_ in dicti:
                dicti[token.pos_] += 1
    dicti["LEN"] = len(sentence)
    dicti["VARIETY"] = len(set(poses))
    return dicti

def sentence_complexity(doc):
    series_sentences = pd.Series(doc.sents)
    summarysentences = series_sentences.apply(aux_sentence_complexity).apply(pd.Series).sort_values(["LEN","SCONJ","AUX","CCONJ","VERB"])
    reduced = summarysentences[summarysentences['LEN']>=5]
    features = reduced.iloc[-len(reduced)//5:].mean().to_dict()
    features["NR_SENT"] = len(series_sentences)
    return features

def extract_features(doc):
    f = wordrarity(doc)
    f.update(sentence_complexity(doc))
    f.update({"NR_WORDS":len(doc)})
    return f

processed_docs=[]
i=0
for doc in list(texts["text_nlp"]):
    i+=1
    if i%100 == 0:
        print(i)
    processed_docs.append(extract_features(doc))
X = pd.DataFrame(processed_docs)
Y_cat = texts["label"]


Y_cat.to_pickle("../data/05_model_input/Y_cat.pkl")

X.to_pickle("../data/05_model_input/X.pkl")
X_train, X_test, y_train, y_test = train_test_split(X, Y_cat, test_size=0.2)
X_train.to_pickle("../data/05_model_input/X_tr.pkl")
X_test.to_pickle("../data/05_model_input/X_te.pkl")
y_train.to_pickle("../data/05_model_input/Y_cat_tr.pkl")
y_test.to_pickle("../data/05_model_input/Y_cat_te.pkl")

In [None]:
def train_test_index_split(df, frac=0.8):
    train_index = list(df.sample(frac=frac).index))
    test_index = list(set(df.index)-set(train_index))
    return train_index, test_index

In [None]:
#traininig
xgbc = xgb.XGBClassifier()
param_grid={"colsample_bylevel":[1,0.7],"colsample_bytree":[1,0.8,0.7],"subsample":[1,0.8,0.7,0.5],"learning_rate":[0.01,0.02,0.05,0.1],"gamma":[0,1,10],"reg_lambda":[1,4,10],
           "max_delta_step":[0,1,10],"max_depth":[6,8,10,12],"min_child_weight":[1,3,5],"n_estimators":[10,50,100]}
random_search = RandomizedSearchCV(estimator = xgbc, param_distributions = param_grid, n_iter = 100, cv = 5, verbose=2, random_state=34, n_jobs = -1,scoring="accuracy")
random_search.fit(X_train, y_train)
xgbc.set_params(**random_search.best_params_)
xgbc.fit(X_train, y_train)

In [None]:
#predicting
predictions_tr = xgbc.predict(X_train)
predictions_te = xgbc.predict(X_test)
pred_tr = pd.Series(predictions_tr, index=X_train.index)
pred_tr.to_pickle("../data/07_model_output/P_cat_tr.pkl")
pred_te = pd.Series(predictions_te, index=X_test.index)
pred_te.to_pickle("../data/07_model_output/P_cat_te.pkl")

In [None]:
hello = sum
hello([1,2])

In [None]:
#reporting
A = pd.DataFrame(pd.DataFrame({"Predicted_values":pred_te,"Real_values":y_test}).groupby("Predicted_values").Real_values.value_counts())
A.columns = ["values"]
A.reset_index(inplace=True)
display(A.pivot(index="Predicted_values",columns="Real_values",values="values").fillna(0))

In [None]:
def confusion_matrix_df(predictions, y_test):
    A = pd.DataFrame(pd.DataFrame({"Predicted_values":predictions,"Real_values":y_test}).groupby("Predicted_values").Real_values.value_counts())
    A.columns = ["values"]
    A.reset_index(inplace=True)
    display(A.pivot(index="Predicted_values",columns="Real_values",values="values"))
    
def train_test_class(model,X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    # Use the model on the train data
    predictions_tr = model.predict(X_train)
    # Calculate the accuracy
    errors_tr = (predictions_tr == y_train)

    # Print out the mean squared error
    print('Training Accuracy:', round(np.mean(errors_tr), 2))

    # Use the model on the test data
    predictions = model.predict(X_test)
    # Calculate the accuracy
    errors = (predictions == y_test)
    print('Test Accuracy:', round(np.mean(errors), 2))
    
    confusion_matrix_df(predictions, y_test)
    

In [None]:
y_train.columns

In [None]:
train_test_class(xgbc, X_train, y_train['>=A2'], X_test, y_test['>=A2'])


In [None]:
train_test_class(xgbc, X_train, y_train['>=B1'], X_test, y_test['>=B1'])


In [None]:
train_test_class(xgbc, X_train, y_train['>=B2'], X_test, y_test['>=B2'])

In [None]:
train_test_class(xgbc, X_train, y_train['>=C1'], X_test, y_test['>=C1'])

In [None]:
train_test_class(xgbc, X_train, y_train['C2'], X_test, y_test['C2'])

In [None]:
xgbA2 = xgb.XGBClassifier()
xgbA2.set_params(**random_search.best_params_)

xgbB1 = xgb.XGBClassifier()
xgbB1.set_params(**random_search.best_params_)

xgbB2 = xgb.XGBClassifier()
xgbB2.set_params(**random_search.best_params_)

xgbC1 = xgb.XGBClassifier()
xgbC1.set_params(**random_search.best_params_)

xgbC2 = xgb.XGBClassifier()
xgbC2.set_params(**random_search.best_params_)

models = [xgbA2, xgbB1, xgbB2, xgbC1, xgbC2]
predictions = []
for i in range(len(models)):
    model = models[i]
    model.set_params(**random_search.best_params_)
    model.fit(X_train,y_train.iloc[:,i])
    predictions.append(model.predict(X_test))

In [None]:
real = y_test.to_numpy()

In [None]:
predicted = pd.DataFrame(predictions).T.to_numpy()

In [None]:
real

In [None]:
sum(sum(abs(real-predicted)))

In [None]:
109 livelli sbagliati

In [None]:
pd.Series((real-predicted).sum(axis=1)).value_counts()

In [None]:
pd.DataFrame(predicted).T.diff().T.describe()

In [None]:
from matplotlib import pyplot

In [None]:
pd.DataFrame({"features":X.columns,"importance":xgbc.feature_importances_}).sort_values("importance")

In [None]:
pyplot.bar(X.columns, xgbc.feature_importances_)

In [None]:
texts

In [None]:
extract_features(doc)

In [None]:
for i in doc.sents:

In [None]:
3/(1/8+1/9+1/1000)

In [None]:
np.sqrt((8**2+9**2+1000**2)/3)

In [None]:
word_frequency("'d","en")

In [None]:
'CCONJ', 'SCONJ', 'AUX', 'VERB'

In [None]:
spacy.explain('PROPN')

In [None]:
for i in doc.sents:
    print("------")
    print("->"+i.text)

In [None]:
firsttext = pd.Series(texts["text_nlp"].iloc[0])

In [None]:
wordpos[wordpos["pos"]=="PRON"].drop_duplicates(subset=["word"])

In [None]:
firsttext

In [None]:
len(firsttext)

In [None]:
total

In [None]:
summary.loc[i].to_dict()

In [None]:
for pos in wordpos["pos"].unique():
    aux = wordpos[wordpos["pos"]==pos]
    aux["word"] = aux["word"].apply(lambda x: x.text.lower())
    display(aux.drop_duplicates().iloc[:5])

In [None]:
freqfin("aòlkejòlqjroqiejr","en")

In [None]:
import spacy
from sense2vec import Sense2VecComponent

nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")

In [None]:
def extractfeatures_text(text):
    