In [None]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.11-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.6.0 (from simpletransformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from simpletransformers)
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Pr

In [None]:
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

# Load the segment-level classifications sheet.
df = pd.read_excel('Classifications_Output.xlsx', sheet_name="Classifications_Segment_Level")

# Keep only the rows whose IS_GOLD_STANDARD value is "NO".
# The explicit .copy() makes df an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df = df[df['IS_GOLD_STANDARD'] == "NO"].copy()

# Add "text" and "labels" columns as copies of the source columns
# (the original SNIPPET / s_manual_trinary columns are kept as well).
df["text"] = df["SNIPPET"]
df["labels"] = df["s_manual_trinary"]

# Function to replace the token before the sentiment word with the Q1 info
def replace_token_with_number(row):
    """Return the row's snippet with every "[[" replaced by "xxproj <Q1> ".

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'SNIPPET' (a string) and 'Q1' (any value; it is
        converted with str()).

    Returns
    -------
    str
        The snippet text in which each "[[" marker has been substituted.
        Snippets without the marker are returned unchanged.
    """
    snippet, q1_value = row['SNIPPET'], row['Q1']
    marker_replacement = "xxproj {} ".format(str(q1_value))
    return snippet.replace('[[', marker_replacement)

# Build the model input text in one chain:
#   1. row-wise, every "[[" becomes "xxproj <Q1> " (replace_token_with_number)
#   2. "[+]" becomes " xxpositive" and "[-]" becomes " xxnegative"
#   3. any remaining "[[" and all "]]" markers are dropped
# All patterns are matched literally (regex=False).
df["text"] = (
    df.apply(replace_token_with_number, axis=1)
    .str.replace("[+]", " xxpositive", regex=False)
    .str.replace("[-]", " xxnegative", regex=False)
    .str.replace("[[", "", regex=False)
    .str.replace("]]", "", regex=False)
)


In [None]:
# distinct label values present in the training data
set(df["labels"])

{-1, 0, 1}

In [None]:
# number of rows per label value (class balance)
df['labels'].value_counts()

 0    1537
 1     170
-1     168
Name: labels, dtype: int64

In [None]:
# The label -1 cannot be used as a class for training, so it is re-coded
# as class 2; labels 0 and 1 stay as they are.
df['labels'] = df['labels'].replace({-1: 2})
# class sizes after re-coding, smallest class first
print(df['labels'].value_counts(ascending=True))

2     168
1     170
0    1537
Name: labels, dtype: int64


In [None]:
# take a look at the text in the second row (position 1) after the above changes
df["text"].iloc[1]

'Patel encourages Americas citizens to highlight the national heroes, stories, and values of American tolerance found throughout the nations history. >>> By raising up these examples, he believes, Americans will start to develop new civic habits of hospitality and a new spirit of tolerance. <<< In the end, he argues, it is in our collective interest as citizens to work for the renewal of the civic, spiritual, and cultural landscape of American tolerance and interfaith generosity. Before I mention my cautions about Patels work, I want to offer one final note of xxproj 1 commendation xxpositive.'

In [None]:
# Reduced dataset for the k-fold cross validation: only the model input
# text and its label are kept.
df_new = df.loc[:, ["text", "labels"]]
df_new

Unnamed: 0,text,labels
0,In addition to vowing that more bombs would be...,2
1,Patel encourages Americas citizens to highligh...,1
2,WASHINGTON AP The political suspense isn t ove...,1
3,A generation away from her family first White ...,0
4,McSally Defends Calling CNN Reporter Liberal H...,0
...,...,...
1870,Bush had a tough night at the debate. >>> Now ...,0
1871,>>> Joe will take on changing the culture on o...,2
1872,">>> As someone who enjoys eating out, trying n...",1
1873,>>> Most of us will be watching the debate. <<...,0


In [None]:
#run 10-fold cross validation
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd

# Prepare the cross-validation splitter.
# A fixed random_state makes the shuffled fold assignment reproducible across
# kernel restarts; without it every run trains and evaluates on different folds.
n=10
kf = KFold(n_splits=n, shuffle=True, random_state=42)

# Collects one classification report per fold.
results = []

# For every fold a fresh roberta-base model with 3 output classes is created.
# The class weights are computed inside the loop from that fold's training
# split only.
# use_cuda=True for running on GPU
# training for 10 epochs - try other values too!

for train_index, test_index in kf.split(df_new):
    # split the dataframe by position into this fold's train and test parts
    train_df = df_new.iloc[train_index]
    test_df = df_new.iloc[test_index]
    # compute balanced class weights from the training part of the fold
    class_weights = compute_class_weight('balanced', classes=np.unique(train_df["labels"]), y=train_df['labels'])
    print(class_weights)
    # Defining Model
    model = ClassificationModel('roberta', 'roberta-base', num_labels=3, weight=class_weights.tolist(),
                            use_cuda=True, args={'reprocess_input_data': True, 'overwrite_output_dir': True,
                                                  "num_train_epochs": 10})

    # train the model
    model.train_model(train_df)
    # validate the model; the entry read back as result['f1'] is produced by
    # sklearn's classification_report and result['conf'] by confusion_matrix
    result, model_outputs, wrong_predictions = model.eval_model(test_df, f1=classification_report, conf=confusion_matrix)
    print(result['f1'])
    print(result['conf'])
    # append this fold's classification report
    results.append(result['f1'])

[0.40774587 3.5093633  3.80894309]


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should pr

  0%|          | 0/937 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/118 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         0.0       0.94      0.91      0.92       771
         1.0       0.49      0.52      0.51        81
         2.0       0.46      0.60      0.52        86

    accuracy                           0.84       938
   macro avg       0.63      0.68      0.65       938
weighted avg       0.86      0.84      0.85       938

[[698  29  44]
 [ 22  42  17]
 [ 20  14  52]]
[0.40553394 3.8600823  3.63565891]


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should pr

  0%|          | 0/938 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/937 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/118 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93       766
         1.0       0.63      0.57      0.60        89
         2.0       0.51      0.54      0.52        82

    accuracy                           0.86       937
   macro avg       0.69      0.68      0.68       937
weighted avg       0.86      0.86      0.86       937

[[714  22  30]
 [ 26  51  12]
 [ 30   8  44]]


In [None]:
# Final model: trained on the entire dataset (without the gold standard data).

# Balanced class weights, this time computed from all labels in df.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df["labels"]),
    y=df['labels'],
)
class_weights

array([0.4066363 , 3.67647059, 3.7202381 ])

In [None]:
# Create the roberta-base classifier for the 3 label classes, using the
# class weights held in class_weights.
# use_cuda=True runs it on the GPU; training is configured for 10 epochs.

model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=3,
    weight=class_weights.tolist(),
    use_cuda=True,
    args={
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        "num_train_epochs": 10,
    },
)


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should pr

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Train the model on the full non-gold-standard dataframe.
# NOTE(review): df still carries every spreadsheet column in addition to
# "text" and "labels", whereas the cross-validation cell passed only those two
# columns (df_new) - confirm that train_model ignores the extra columns.
model.train_model(df)

  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/235 [00:00<?, ?it/s]

(2350, 0.5755550782917999)

In [None]:
#import gold_standard dataset

from pandas.core.groupby import DataFrameGroupBy
# Re-read the sheet and keep only the rows whose IS_GOLD_STANDARD value is "YES".
# The explicit .copy() makes dfgold an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
dfgold = pd.read_excel('Classifications_Output.xlsx', sheet_name="Classifications_Segment_Level")
dfgold = dfgold[dfgold['IS_GOLD_STANDARD'] == "YES"].copy()

# Add "text" and "labels" columns as copies of the source columns
# (the original SNIPPET / s_manual_trinary columns are kept as well).
dfgold["text"] = dfgold["SNIPPET"]
dfgold["labels"] = dfgold["s_manual_trinary"]

# Function to replace the token before the sentiment word with the Q1 info
def replace_token_with_number(row):
    """Return the row's snippet with every "[[" replaced by "xxproj <Q1> ".

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'SNIPPET' (a string) and 'Q1' (any value; it is
        converted with str()).

    Returns
    -------
    str
        The snippet text in which each "[[" marker has been substituted.
        Snippets without the marker are returned unchanged.
    """
    snippet, q1_value = row['SNIPPET'], row['Q1']
    marker_replacement = "xxproj {} ".format(str(q1_value))
    return snippet.replace('[[', marker_replacement)

# Build the gold-standard input text in one chain:
#   1. row-wise, every "[[" becomes "xxproj <Q1> " (replace_token_with_number)
#   2. "[+]" becomes " xxpositive" and "[-]" becomes " xxnegative"
#   3. any remaining "[[" and all "]]" markers are dropped
# All patterns are matched literally (regex=False).
dfgold["text"] = (
    dfgold.apply(replace_token_with_number, axis=1)
    .str.replace("[+]", " xxpositive", regex=False)
    .str.replace("[-]", " xxnegative", regex=False)
    .str.replace("[[", "", regex=False)
    .str.replace("]]", "", regex=False)
)


In [None]:
# Re-code the label -1 as class 2 in the gold-standard data as well,
# then show the class sizes, smallest class first.
dfgold['labels'] = dfgold['labels'].replace({-1: 2})
print(dfgold['labels'].value_counts(ascending=True))

2     29
1     40
0    229
Name: labels, dtype: int64


In [None]:
# distinct label values in the gold-standard data after the re-coding
set(dfgold["labels"])

{0, 1, 2}

In [None]:
# Imports repeated from earlier cells.
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd

# Validate the trained model on the gold standard dataset.
# result['f1'] is produced by sklearn's classification_report and
# result['conf'] by confusion_matrix.
results = []
result, model_outputs, wrong_predictions = model.eval_model(
    dfgold,
    f1=classification_report,
    conf=confusion_matrix,
)
print(result['f1'])
print(result['conf'])

# keep the classification report as the model score
results.append(result['f1'])

  0%|          | 0/298 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         0.0       0.92      0.83      0.87       229
         1.0       0.63      0.55      0.59        40
         2.0       0.33      0.66      0.44        29

    accuracy                           0.77       298
   macro avg       0.63      0.68      0.63       298
weighted avg       0.82      0.77      0.79       298

[[189   9  31]
 [ 11  22   7]
 [  6   4  19]]


In [None]:
# Predict labels for the gold-standard texts for the subsequent data export.
# Element 0 of the value returned by model.predict is what gets printed and
# stored as the predicted label.
predictiongold = model.predict(list(dfgold["text"]))

print(predictiongold[0])

# store the roBERTa predictions in a new dataframe column
dfgold['roBERTa_segment'] = predictiongold[0]

print(dfgold['roBERTa_segment'])

# write the gold-standard rows together with the predictions to Excel
dfgold.to_excel("Classifications_Output_goldstandard.xlsx")

  0%|          | 0/298 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

[0 0 2 0 2 0 0 0 0 2 2 2 0 1 1 1 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0 1 2 2 2 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 2 0 0 0 1 0 0 0 0 0 0 0 2 2 2 2 0 0 0
 2 0 0 0 1 0 0 0 1 1 2 1 2 0 0 1 2 1 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 1 1 1 2
 0 1 1 0 2 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1
 0 0 1 0 0 0 0 2 2 2 2 2 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 2 0 0 0 0 0 0 0 0 0 2 2 1 2 2 1 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0
 2 0 0 0 0 2 0 0 2 1 0 0 2 0 0 0 0 2 0 0 1 0 0 1 1 0 0 0 0 0 2 0 1 1 1 1 2
 2 0 0 0 0 0 2 0 0 2 2 2 0 2 2 0 1 0 0 0 0 0 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0
 0 0]
1875    0
1876    0
1877    2
1878    0
1879    2
       ..
2168    0
2169    0
2170    0
2171    0
2172    0
Name: roBERTa_segment, Length: 298, dtype: int64
