###Read in data and format
For my dataset, 0 is a nugget and 1 is noise

In [7]:
import csv
import sklearn
import torch

In [2]:
def read_split_data(fn):
    labels = []
    utterances = []
    with open(fn, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # This skips the first row of the CSV file.
        next(reader)
        for row in reader: 
            labels.append(int(row[0]))
            utterances.append(row[1])
    print('Number of rows: {}'.format(len(labels)))
    data_dict = {
        'labels': labels,
        'utterances': utterances
                }
    return data_dict

data_dict = read_split_data('utterances.csv')

Number of rows: 1098


In [3]:
#Peek at our dataset
for i in range(0,5):
    print(data_dict['labels'][i], data_dict['utterances'][i])

1 "Every robot has a big red button," says Dolgov.
1 [Page Six] Guess who's putting sluts and hussies ON BLAST?
1 A quiet house is nice until you are ordered to stay in it for months.
1 A song can make or ruin a person’s day if they let it get to them.
1 Abraham's specialty is population and developmental economics.


In [4]:
#Split data into train, test, and validation sets

#requires sklearn
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(data_dict['utterances'], data_dict['labels'], test_size=.2)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)


In [5]:
#Alright, we’ve read in our dataset. Now let’s tackle tokenization. We’ll eventually train a classifier using pre-trained DistilBert, so let’s use the DistilBert tokenizer.

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
#Creat a dataset object

#requires torch
class utterances_dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = utterances_dataset(train_encodings, train_labels)
val_dataset = utterances_dataset(val_encodings, val_labels)
test_dataset = utterances_dataset(test_encodings, test_labels)

In [11]:
#Fine-tuning with Trainer

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi



Step,Training Loss
10,0.6594
20,0.6517
30,0.6129
40,0.5485
50,0.4983
60,0.4123
70,0.2733
80,0.1971
90,0.1477
100,0.1271


TrainOutput(global_step=132, training_loss=0.3556366400285201, metrics={'train_runtime': 1153.8311, 'train_samples_per_second': 0.114, 'total_flos': 114215873358600, 'epoch': 3.0})

In [12]:
model.save_pretrained("./yacov-athena-DistilBertSC")

In [13]:
tokenizer.save_pretrained("./yacov-athena-DistilBertSC")

('./yacov-athena-DistilBertSC/tokenizer_config.json',
 './yacov-athena-DistilBertSC/special_tokens_map.json',
 './yacov-athena-DistilBertSC/vocab.txt',
 './yacov-athena-DistilBertSC/added_tokens.json')

In [26]:
results = trainer.predict(test_dataset)

PredictionOutput(predictions=array([[-2.2027936 ,  1.9565878 ],
       [-1.996772  ,  1.9063733 ],
       [-1.9398148 ,  1.7807553 ],
       [-1.9722581 ,  1.7864017 ],
       [-1.8825649 ,  1.7188511 ],
       [-2.013989  ,  1.7472883 ],
       [-2.059013  ,  1.9368087 ],
       [-2.119093  ,  1.9426961 ],
       [-2.0638952 ,  1.8670772 ],
       [-1.9669638 ,  1.8365777 ],
       [-2.084086  ,  1.9246064 ],
       [-1.8047495 ,  1.659288  ],
       [-2.1248512 ,  1.9054465 ],
       [-1.9307948 ,  1.7703309 ],
       [-2.1210876 ,  1.9037105 ],
       [-1.9130121 ,  1.7200853 ],
       [-1.9571277 ,  1.7856723 ],
       [-2.05442   ,  1.9085035 ],
       [-2.0392017 ,  1.8717418 ],
       [-1.9413962 ,  1.7846944 ],
       [-2.24923   ,  2.0357218 ],
       [-2.0544672 ,  1.8760622 ],
       [-1.93567   ,  1.7495906 ],
       [-2.026557  ,  1.8760918 ],
       [-2.072688  ,  1.871838  ],
       [-2.0285501 ,  1.8075365 ],
       [-2.005712  ,  1.8565831 ],
       [-1.8836186 ,  1.77

In [29]:
#Review last n results
n = 10
for i in range(n):
    print(test_texts[((len(test_texts)-n -1) + i)], test_labels[((len(test_texts)-n -1) + i)])

But CENTCOM and the CIA had decided instead to use the untested Predator. 1
The average American woman weighs 166.2 pounds.  And its unclear whether women in the U.S. 1
Hurt me 0
Schneider is currently a partner in the environment, land, and resources practice at law firm Latham & 1
I purchased a baby clown from the Russian terrorist black market. 1
Who can help me 0
The complicated school homework left the parents trying to help their kids quite confused. 1
What can I possible do to survive? 0
Amjad is himself only just back in the city, having months ago fled into exile as a result of his association with another Western journalist. 1
He decided water-skiing on a frozen lake wasn’t a good idea. 1


In [159]:
f1 = open('therapy_transcript.txt', 'r')
therapy_utterances = f1.read().splitlines()

f1 = open('therapy_transcript.txt', 'r')
therapy_utterances = f1.read()
print(therapy_utterances)
therapy_utterances = therapy_utterances.split('.')
print(therapy_utterances)
live_encodings = tokenizer(therapy_utterances, truncation=True, padding=True)

Counseling Session Transcription
Suzie: So, yesterday I get home after a long day at work and I check my email….. and there is an email from this guy I hooked up with like….17 years ago….something like that.
	•	Counselor: Wow.
Suzie: That’s kinda like….. what I was like, I was like Wow! I was like really excited!
	•	Counselor: Wow. Okay… so you felt excited?
Suzie: I did. I felt really exited and I was like…. I felt sort of guilty about feeling excited because I’m like….. hello I’m married almost 10 Years. But, somebody was obviously paying some kind of attention to me. I have not talked to this guy. I haven’t seen this guy…. this guy is really like Joe random out of the blue…. like nothing from him for 17 Years ago.
	•	Counselor: Okay.
Suzie: And I’m so flattered. I mean it wasn’t that big of a deal ….the email… whatever but, I’m so flattered that I almost wish that… that random email….. that act would have been from my husband but it wasn’t. And I got off on the fact that it was from

In [None]:
model.eval()

inputs = tokenizer(therapy_utterances[0], return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs, labels=labels)
print(outputs)


In [139]:
def getSoftmaxScores(inputs, dimen):
	''' Get the softmax scores '''
	print('---Softmax---')
	print('---Dim = ' + str(dimen) + '---')
	softmaxFunc = torch.nn.Softmax(dim = dimen)
	softmaxScores = softmaxFunc(inputs)
	print('Softmax Scores: \n', softmaxScores)
	sums_0 = torch.sum(softmaxScores, dim=0)
	sums_1 = torch.sum(softmaxScores, dim=1)
	print('Sum over dimension 0: \n', sums_0)
	print('Sum over dimension 1: \n', sums_1)

def getSigmoidScores(inputs):
	''' Get the sigmoid scores: they are element-wise '''
	#print('---Sigmoid---')
	sigmoidScores = torch.sigmoid(inputs)
	#print('Sigmoid Scores: \n', sigmoidScores)
	return sigmoidScores

logits = torch.randn(2, 3)*10 - 5
print('Logits: ', logits)
getSigmoidScores(logits)

Logits:  tensor([[  2.3559,   9.1067, -11.3278],
        [ -6.0727, -19.6020,  -1.2912]])


tensor([[9.1340e-01, 9.9989e-01, 1.2034e-05],
        [2.2996e-03, 3.0687e-09, 2.1566e-01]])

In [167]:

for i in range(len(therapy_utterances)):
    inputs = tokenizer(therapy_utterances[i], return_tensors="pt")
    labels = torch.tensor([1]).unsqueeze(0)
    outputs = model(**inputs, labels=labels)
    probablities = getSigmoidScores(outputs.logits)
    #print(therapy_utterances[i], probablities)
    if probablities[0,0] > .47:
        print(therapy_utterances[i],'\n\n\n\n Here is a high risk sentence!!! \n\n\n\n\n')

 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 I want to kill him 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 Like I want to kill myself 



 Here is a high risk sentence!!! 





 You’re romantic 



 Here is a high risk sentence!!! 





 I just want everything to end 



 Here is a high risk sentence!!! 





 Who can help me 



 Here is a high risk sentence!!! 





 Hurt me 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 



 Here is a high risk sentence!!! 





 Lik