In [1]:
import pandas as pd 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from imblearn.over_sampling import SMOTE
import os 

os.chdir('C:/Users/fib0/Desktop/Experiments/DataFrames')

In [2]:
# Importing Data (4 datasets : 2 embeddings 2 layers)
data1 = pd.read_csv('95% pseudolabels - fasttext - layer1 - cleaned.csv')
data2 = pd.read_csv('95% pseudolabels - fasttext - layer2 - cleaned.csv')
data3 = pd.read_csv('95% pseudolabels w2v - layer1 - cleaned.csv')
data4 = pd.read_csv('95% pseudolabels w2v - layer2 - cleaned.csv')
data5 = pd.read_csv('95% pseudolabels - bert embeddings - layer1 - cleaned.csv')
data6 = pd.read_csv('95% pseudolabels - bert embeddings - layer2 - cleaned.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
datasets = [(data1, '95% pseudolabels - fasttext - layer1 - cleaned.csv', "custom fasttext", "Layer1"),
            (data2, '95% pseudolabels - fasttext - layer2 - cleaned.csv', "custom fasttext", "Layer2"),
            (data3, '95% pseudolabels w2v - layer1 - cleaned.csv', "custom w2v", "Layer1"),
            (data4, '95% pseudolabels w2v - layer2 - cleaned.csv', "custom w2v", "Layer2"),
            (data5, '95% pseudolabels - bert embeddings - layer1 - cleaned.csv', "bert", "Layer1"),
            (data6, '95% pseudolabels - bert embeddings - layer2 - cleaned.csv', "bert", "Layer2")]

In [4]:
classifiers = [(SVC(), "Support Vector Machine"),
               (LogisticRegression(), "Logistic Regression"),
               (RandomForestClassifier(), "Random Forest")]

In [5]:
def exclude_low_freq_classes(data, threshold=10):
    
    df = data.copy()
    
    freq_table = df["pseudoLabels"].value_counts()
    low_freq_classes = []
    for i in range(len(freq_table)):
        if freq_table[i] < threshold:
            low_freq_classes.append(freq_table.index[i])
    
    for i in range(len(low_freq_classes)):
        df = df[df["pseudoLabels"] != low_freq_classes[i]]
        
    df.reset_index(drop=True, inplace=True)
    
    return df

In [6]:
def experiments(dataset, classifier, oversampling=True, contextual=True, write_to_txt=False, save_model=False):
    
    # Excluding low frequency classes
    data = exclude_low_freq_classes(dataset[0], threshold=20)
    
    # Defining the input X
    if contextual == True:
        X = data.iloc[:, 8:-1]
    else:
        X = data.iloc[:, 108:208]
        
    # Defining the output y
    y = data["pseudoLabels"]
    
    # Train - test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # If selected, perform oversampling to the training data
    if oversampling == True:
        smote = SMOTE(random_state = 42)
        X_train, y_train = smote.fit_resample(X_train, y_train)  
        
    # Fit the model to the training data
    clf = classifier[0].fit(X_train, y_train)
    
    # Predict the test set labels
    y_pred = clf.predict(X_test)
    
    # Report results
    print("-----------------------------------------")
    print(f"Dataset : {dataset[1]}")
    print(f"Embedding : {dataset[2]}")
    print(f"Labels : {dataset[3]}")
    print(f"Contextual : {contextual}")
    print(f"Oversampling : {oversampling}")
    print(f"Classifier : {classifier[1]}")
    print(f"Number of training instances : {len(y_train)}")
    print(classification_report(y_test, y_pred))
    #print(f"Balanced Accuracy : {balanced_accuracy_score(y_test, y_pred)}")
    #print(f"Average Precision : {average_precision_score(y_test, y_pred)}")
    
    # Write to text
    if write_to_txt == True:
        if os.path.exists("C:/Users/fib0/Desktop/Experiments/" + classifier[1] + " " + dataset[2] + dataset[3] + " " + str(contextual) + " " + str(oversampling) + ".txt"):
            os.remove("C:/Users/fib0/Desktop/Experiments/" + classifier[1] + " " + dataset[2] + dataset[3] + " " + str(contextual) + " " + str(oversampling) + ".txt")
        with open("C:/Users/fib0/Desktop/Experiments/" + classifier[1] + " " + dataset[2] + dataset[3] + " " + str(contextual) + " " + str(oversampling) + ".txt", 'w') as f:
            f.write(f"Dataset : {dataset[1]}\n")
            f.write(f"Embedding : {dataset[2]}\n")
            f.write(f"Labels : {dataset[3]}\n")
            f.write(f"Contextual : {contextual}\n")
            f.write(f"Oversampling : {oversampling}\n")
            f.write(f"Classifier : {classifier[1]}\n")
            f.write(f"Number of training instances : {len(y_train)}\n")
            f.write(classification_report(y_test, y_pred))
            
    # Save the model
    if save_model==True:
        model_name = classifier[1] + " " + dataset[2] + dataset[3] + " " + str(contextual) + " " + str(oversampling) 
        pickle.dump(clf, open(model_name, 'wb'))

In [8]:
# Layer1 experiments
for dataset in [datasets[1]]:
    for classifier in classifiers:
        for ovs in [True, False]:
            for cont in [True, False]:
                experiments(dataset, classifier, oversampling=ovs, contextual=cont, write_to_txt=True)

-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : True
Oversampling : True
Classifier : Support Vector Machine
Number of training instances : 453544
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.86      0.92         7
                                Active listening       0.67      0.33      0.44         6
                                        Adaptive       0.86      0.74      0.79        34
                                   Argumentation       0.80      0.67      0.73         6
                                        Coaching       0.60      0.60      0.60         5
                                   Communication       0.78      0.91      0.84        35
                             Conflict management       0.86      0.78      0.82        46
                    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : False
Oversampling : True
Classifier : Support Vector Machine
Number of training instances : 453544
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.86      0.92         7
                                Active listening       0.83      0.83      0.83         6
                                        Adaptive       0.37      0.91      0.53        34
                                   Argumentation       1.00      0.83      0.91         6
                                        Coaching       0.71      1.00      0.83         5
                                   Communication       0.94      0.97      0.96        35
                             Conflict management       0.98      0.93      0.96        46
                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : False
Oversampling : False
Classifier : Support Vector Machine
Number of training instances : 19576
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.86      0.92         7
                                Active listening       0.50      0.17      0.25         6
                                        Adaptive       0.80      0.59      0.68        34
                                   Argumentation       1.00      0.17      0.29         6
                                        Coaching       0.50      0.20      0.29         5
                                   Communication       0.85      0.94      0.89        35
                             Conflict management       0.76      0.83      0.79        46
                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average,

-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : True
Oversampling : True
Classifier : Logistic Regression
Number of training instances : 453544
                                                  precision    recall  f1-score   support

                                  Accountability       0.86      0.86      0.86         7
                                Active listening       0.33      0.17      0.22         6
                                        Adaptive       0.67      0.76      0.71        34
                                   Argumentation       0.67      0.33      0.44         6
                                        Coaching       0.50      0.40      0.44         5
                                   Communication       0.75      0.86      0.80        35
                             Conflict management       0.80      0.80      0.80        46
                       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : False
Oversampling : True
Classifier : Logistic Regression
Number of training instances : 453544
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.86      0.92         7
                                Active listening       0.83      0.83      0.83         6
                                        Adaptive       0.38      0.91      0.53        34
                                   Argumentation       1.00      0.83      0.91         6
                                        Coaching       0.67      0.80      0.73         5
                                   Communication       0.94      0.94      0.94        35
                             Conflict management       0.93      0.91      0.92        46
                      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : True
Oversampling : False
Classifier : Logistic Regression
Number of training instances : 19576
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.86      0.92         7
                                Active listening       0.33      0.17      0.22         6
                                        Adaptive       0.65      0.71      0.68        34
                                   Argumentation       0.50      0.33      0.40         6
                                        Coaching       0.50      0.20      0.29         5
                                   Communication       0.79      0.86      0.82        35
                             Conflict management       0.83      0.85      0.84        46
                       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : False
Oversampling : False
Classifier : Logistic Regression
Number of training instances : 19576
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.86      0.92         7
                                Active listening       0.67      0.33      0.44         6
                                        Adaptive       0.87      0.76      0.81        34
                                   Argumentation       1.00      0.67      0.80         6
                                        Coaching       0.60      0.60      0.60         5
                                   Communication       0.85      0.97      0.91        35
                             Conflict management       0.91      0.85      0.88        46
                      

-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : False
Oversampling : True
Classifier : Random Forest
Number of training instances : 453544
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.86      0.92         7
                                Active listening       0.83      0.83      0.83         6
                                        Adaptive       0.37      0.91      0.53        34
                                   Argumentation       1.00      0.83      0.91         6
                                        Coaching       0.60      0.60      0.60         5
                                   Communication       0.92      0.97      0.94        35
                             Conflict management       0.93      0.89      0.91        46
                            

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------------------
Dataset : 95% pseudolabels - fasttext - layer2 - cleaned.csv
Embedding : custom fasttext
Labels : Layer2
Contextual : False
Oversampling : False
Classifier : Random Forest
Number of training instances : 19576
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.86      0.92         7
                                Active listening       0.80      0.67      0.73         6
                                        Adaptive       0.90      0.76      0.83        34
                                   Argumentation       1.00      0.83      0.91         6
                                        Coaching       0.60      0.60      0.60         5
                                   Communication       0.94      0.97      0.96        35
                             Conflict management       0.97      0.85      0.91        46
                            

In [7]:
# Layer1 experiments
for dataset in [datasets[4], datasets[5]]:
    for classifier in classifiers:
        for ovs in [True, False]:
            experiments(dataset, classifier, oversampling=ovs, contextual=True, write_to_txt=True)

-----------------------------------------
Dataset : 95% pseudolabels - bert embeddings - layer1 - cleaned.csv
Embedding : bert
Labels : Layer1
Contextual : True
Oversampling : True
Classifier : Support Vector Machine
Number of training instances : 361960
                                         precision    recall  f1-score   support

                         Accountability       0.60      1.00      0.75         3
                       Active listening       0.88      0.78      0.82         9
                               Adaptive       0.78      0.80      0.79        45
                          Argumentation       0.83      0.56      0.67         9
                               Coaching       0.50      0.50      0.50         8
                          Communication       0.89      0.95      0.92        44
                    Conflict management       0.73      0.83      0.78        53
                             Creativity       0.87      0.87      0.87        46
               

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------------------------------
Dataset : 95% pseudolabels - bert embeddings - layer1 - cleaned.csv
Embedding : bert
Labels : Layer1
Contextual : True
Oversampling : True
Classifier : Logistic Regression
Number of training instances : 361960
                                         precision    recall  f1-score   support

                         Accountability       0.60      1.00      0.75         3
                       Active listening       1.00      0.78      0.88         9
                               Adaptive       0.68      0.76      0.72        45
                          Argumentation       0.75      0.67      0.71         9
                               Coaching       0.33      0.38      0.35         8
                          Communication       0.98      0.91      0.94        44
                    Conflict management       0.77      0.83      0.80        53
                             Creativity       0.88      0.76      0.81        46
                  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------------------------------
Dataset : 95% pseudolabels - bert embeddings - layer1 - cleaned.csv
Embedding : bert
Labels : Layer1
Contextual : True
Oversampling : False
Classifier : Logistic Regression
Number of training instances : 18482
                                         precision    recall  f1-score   support

                         Accountability       0.75      1.00      0.86         3
                       Active listening       0.86      0.67      0.75         9
                               Adaptive       0.82      0.69      0.75        45
                          Argumentation       0.75      0.33      0.46         9
                               Coaching       0.57      0.50      0.53         8
                          Communication       0.93      0.91      0.92        44
                    Conflict management       0.76      0.79      0.78        53
                             Creativity       0.88      0.76      0.81        46
                  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------------------
Dataset : 95% pseudolabels - bert embeddings - layer2 - cleaned.csv
Embedding : bert
Labels : Layer2
Contextual : True
Oversampling : True
Classifier : Support Vector Machine
Number of training instances : 348264
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.83      0.91         6
                                Active listening       1.00      0.29      0.44         7
                                        Adaptive       0.68      0.81      0.74        26
                                   Argumentation       1.00      0.71      0.83         7
                                        Coaching       0.80      0.50      0.62         8
                                   Communication       0.71      0.83      0.77        36
                             Conflict management       0.74      0.85      0.79        33
                        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


-----------------------------------------
Dataset : 95% pseudolabels - bert embeddings - layer2 - cleaned.csv
Embedding : bert
Labels : Layer2
Contextual : True
Oversampling : True
Classifier : Logistic Regression
Number of training instances : 348264
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.83      0.91         6
                                Active listening       0.67      0.29      0.40         7
                                        Adaptive       0.79      0.85      0.81        26
                                   Argumentation       1.00      0.71      0.83         7
                                        Coaching       1.00      0.38      0.55         8
                                   Communication       0.74      0.89      0.81        36
                             Conflict management       0.84      0.94      0.89        33
                           

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------------------
Dataset : 95% pseudolabels - bert embeddings - layer2 - cleaned.csv
Embedding : bert
Labels : Layer2
Contextual : True
Oversampling : False
Classifier : Logistic Regression
Number of training instances : 15293
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.83      0.91         6
                                Active listening       1.00      0.29      0.44         7
                                        Adaptive       0.78      0.81      0.79        26
                                   Argumentation       0.80      0.57      0.67         7
                                        Coaching       1.00      0.50      0.67         8
                                   Communication       0.72      0.81      0.76        36
                             Conflict management       0.71      0.88      0.78        33
                           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------------------
Dataset : 95% pseudolabels - bert embeddings - layer2 - cleaned.csv
Embedding : bert
Labels : Layer2
Contextual : True
Oversampling : False
Classifier : Random Forest
Number of training instances : 15293
                                                  precision    recall  f1-score   support

                                  Accountability       1.00      0.67      0.80         6
                                Active listening       1.00      0.29      0.44         7
                                        Adaptive       0.82      0.69      0.75        26
                                   Argumentation       1.00      0.29      0.44         7
                                        Coaching       0.00      0.00      0.00         8
                                   Communication       0.59      0.67      0.62        36
                             Conflict management       0.51      0.76      0.61        33
                                 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
