In [None]:
# NLP analysis of RiPS transcripts using T5 (run on Google Colab)
# Requirements: python3 -m pip install "transformers[sentencepiece]" pytorch_lightning
# Reads each transcript CSV file and writes one sentiment label per sentence to an output CSV file.
# Model card: https://huggingface.co/mrm8488/t5-base-finetuned-imdb-sentiment
# See also: https://github.com/google-research/text-to-text-transfer-transformer/issues/109

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Shell escape: run nvidia-smi to report the NVIDIA GPU (if any) visible to this runtime.
!nvidia-smi

In [None]:
!pip install transformers[sentencepiece]
!pip install pytorch_lightning

In [4]:
import csv

import numpy as np
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

In [5]:
# Directory prefix for the transcript CSV files (ends with '/').
transd = '/content/drive/My Drive/Colab/RiPSws3transcript/'
# Directory prefix for the sentiment CSV files (ends with '/').
sentd = '/content/drive/My Drive/Colab/t5sent/'
# Transcript column name(s); presumably the columns to load from each CSV — confirm against the reader.
trans = ['comment']

In [None]:
# Transcript file stems (without the '.csv' extension).
# Every base stem is listed three times: as-is, with a '_simu' suffix and with a
# '_robo' suffix. The order is all plain stems first, then all '_simu', then all '_robo'.
_base_stems = [
	'2020-11-16fetch_AWS',
	'2020-11-17fetch_AWS_v1',
	'2020-11-18fetch_AWS_v1',
	'2020-11-23fetch_AWS_v1',
	'pepper_1',
	'pepper_2',
	'pepper_3',
	'pepper_4',
	'WS3_Fetch',
	'WS3_Pepper',
	'WS3_all',
]
transf_list = [
	stem + suffix
	for suffix in ('', '_simu', '_robo')
	for stem in _base_stems
]

In [None]:
# Load the tokenizer and the T5 checkpoint fine-tuned for sentiment on the IMDB database.
# T5 is an encoder-decoder (sequence-to-sequence) model, so it is loaded through
# AutoModelForSeq2SeqLM; the generic AutoModelWithLMHead class is deprecated in
# transformers in favour of the task-specific auto classes.
t5_checkpoint = "mrm8488/t5-base-finetuned-imdb-sentiment"
tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint)

In [8]:
# sentiment analysis with T5
def get_sentiment(text):
	"""Return the sentiment label that the fine-tuned T5 model generates for `text`.

	Relies on the module-level `tokenizer` and `model` objects.

	Args:
		text: the sentence to classify, as a str.

	Returns:
		The decoded label as a plain string with special tokens removed
		(the model card for this checkpoint shows 'positive' / 'negative').
	"""
	# The T5 tokenizer appends the end-of-sequence token on its own, so the text
	# is passed as-is; adding '</s>' by hand can encode the EOS token twice.
	input_ids = tokenizer.encode(text, return_tensors='pt')
	# max_length=2: the decoder start token plus a single generated label token.
	output = model.generate(input_ids=input_ids, max_length=2)
	# Only one sequence is generated. skip_special_tokens drops the leading
	# decoder start token, which would otherwise show up as '<pad>' in the label.
	return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# process all transcripts: each '<name>.csv' in `transd` produces
# '<name>_sentscore.csv' in `sentd`
for transf in transf_list:
	# read in transcripts, keeping only the column(s) listed in `trans`
	df = pd.read_csv((transd+transf+'.csv'),usecols=trans)

	# apply T5 pretrained sentiment analysis tools; exactly one label is stored
	# per row so the output stays line-aligned with the transcript
	sentscores = []
	for sentence in df.comment:
		if isinstance(sentence, str):
			sentscores.append(get_sentiment(sentence))
		else:
			# pandas reads an empty cell as float NaN, which get_sentiment cannot
			# handle; record a blank label instead of aborting the whole batch
			sentscores.append('')

	# print to file, one label per line
	# newline='' hands line-ending control to the csv module (as its docs require),
	# and lineterminator='\n' gives every line the same ending. Each label is
	# written as its own single-column row rather than abusing '\n' as a delimiter.
	with open((sentd+transf+'_sentscore.csv'),'w',newline='') as outf:
		wr = csv.writer(outf,lineterminator='\n')
		wr.writerows([score] for score in sentscores)

	print('Completed:',transf)

print('All files processed!')