In [1]:
import pandas as pd
import numpy as np

In [63]:
from datasets import load_dataset

# Hugging Face account and project naming used to derive the repo ids below.
HF_USERNAME = "LukeGPT88"
PROJECT_NAME = "patient-doctor-text-classifier"
SUB_PROJECT_NAME = "eng"
# Dataset repo id: "<user>/<project>-<sub-project>-dataset".
DATASET_NAME = f"{HF_USERNAME}/{PROJECT_NAME}-{SUB_PROJECT_NAME}-dataset"
# Task name; later combined with HF_USERNAME to form the model id for the pipeline.
TASK = f"{PROJECT_NAME}-{SUB_PROJECT_NAME}"

# Load the dataset from the Hub; later cells index it by 'train' / 'validation' / 'test'.
dataset = load_dataset(DATASET_NAME)

In [None]:
# Display the loaded dataset object as the cell output.
dataset

# FOR OLD DATASET ONLY (HF repo patient-doctor-text-classifier-eng-dataset-old)

## Convert short texts into neutral conversations

In [3]:
# Numeric class id -> human-readable label.
id2label = {0: "PATIENT", 1: "DOCTOR", 2: "NEUTRAL"}


def mapping(idx):
    """Return the label name for class id `idx`, or None when the id is unknown."""
    return id2label.get(idx)

In [12]:
# label_train_text = list(map(mapping, dataset['train']['Label']))
# label_val_text = list(map(mapping, dataset['validation']['Label']))
# label_test_text = list(map(mapping, dataset['test']['Label']))

In [64]:
# Copy the three columns of every split into its own pandas DataFrame
# (column order: Text, Label, Encoding).
df_train, df_val, df_test = (
    pd.DataFrame({column: dataset[split][column] for column in ('Text', 'Label', 'Encoding')})
    for split in ('train', 'validation', 'test')
)

In [None]:
# df_train.to_csv('df_train_old.csv')
# df_val.to_csv('df_val_old.csv')
# df_test.to_csv('df_test_old.csv')

In [None]:
# Texts with fewer characters than this threshold are flagged as "short".
SHORT_TEXT_LENGTH = 10
for frame in (df_train, df_val, df_test):
    # The comparison already yields a bool, one flag per row.
    frame['Short Text'] = [len(text) < SHORT_TEXT_LENGTH for text in frame['Text'].values]

In [None]:
# df_val['Text'].loc[df_val['Short Text'] == True].to_csv('df_val')

### Train set

In [None]:
# Short expressions that are mere fillers/acknowledgements must be relabelled as
# neutral conversations. The pattern is case-insensitive and lets the filler sit
# inside a longer token, so look-alike spellings of each word are matched too.
import re

short_mask = df_train['Short Text'] == True
texts = df_train['Text'].loc[short_mask]
indexes = df_train['Text'].loc[short_mask].index

words = ['uhm', 'ok', 'no', 'uh', 'sure', 'mmm']
text_to_convert = []
rows_to_convert = []
for text, index in zip(texts, indexes.values):
    for word in words:
        regex_pattern = r"\W*\w*" + re.escape(word) + r'\W*\w*'
        matches = re.findall(regex_pattern, text, re.IGNORECASE)
        if not matches:
            continue
        # A row is appended once per matching word; duplicates are removed via set() later.
        rows_to_convert.append(index)
        text_to_convert.append([matches[0], index])

unique_rows = set(rows_to_convert)
print(len(rows_to_convert))
print(len(unique_rows))
print(text_to_convert)


In [None]:
# Preview the matched rows that are currently labelled DOCTOR.
# rows_to_convert holds index labels (taken from .index), so select with .loc,
# and build the boolean mask on the selected subset so it is aligned with it.
matched_rows = df_train.loc[sorted(set(rows_to_convert))]
matched_rows[matched_rows['Label'].isin(['DOCTOR'])]

In [None]:
# Relabel every matched row as NEUTRAL (encoding 2).
# rows_to_convert contains index labels, so use label-based .loc instead of
# positional .iloc; the two only coincide for a default RangeIndex.
neutral_rows = sorted(set(rows_to_convert))
df_train.loc[neutral_rows, 'Label'] = 'NEUTRAL'
df_train.loc[neutral_rows, 'Encoding'] = 2

### Validation set

In [None]:
# Short expressions that are mere fillers/acknowledgements must be relabelled as
# neutral conversations. The pattern is case-insensitive and lets the filler sit
# inside a longer token, so look-alike spellings of each word are matched too.
import re

short_mask = df_val['Short Text'] == True
texts = df_val['Text'].loc[short_mask]
indexes = df_val['Text'].loc[short_mask].index

words = ['uhm', 'ok', 'no', 'uh', 'sure', 'mmm']
text_to_convert = []
rows_to_convert = []
for text, index in zip(texts, indexes.values):
    for word in words:
        regex_pattern = r"\W*\w*" + re.escape(word) + r'\W*\w*'
        matches = re.findall(regex_pattern, text, re.IGNORECASE)
        if not matches:
            continue
        # A row is appended once per matching word; duplicates are removed via set() later.
        rows_to_convert.append(index)
        text_to_convert.append([matches[0], index])

unique_rows = set(rows_to_convert)
print(len(rows_to_convert))
print(len(unique_rows))
print(text_to_convert)

In [None]:
# Preview the matched rows that are currently labelled DOCTOR.
# rows_to_convert holds index labels (taken from .index), so select with .loc,
# and build the boolean mask on the selected subset so it is aligned with it.
matched_rows = df_val.loc[sorted(set(rows_to_convert))]
matched_rows[matched_rows['Label'].isin(['DOCTOR'])]

In [None]:
# Relabel every matched row as NEUTRAL (encoding 2).
# rows_to_convert contains index labels, so use label-based .loc instead of
# positional .iloc; the two only coincide for a default RangeIndex.
neutral_rows = sorted(set(rows_to_convert))
df_val.loc[neutral_rows, 'Label'] = 'NEUTRAL'
df_val.loc[neutral_rows, 'Encoding'] = 2

### Test set

In [None]:
# Short expressions that are mere fillers/acknowledgements must be relabelled as
# neutral conversations. The pattern is case-insensitive and lets the filler sit
# inside a longer token, so look-alike spellings of each word are matched too.
import re

short_mask = df_test['Short Text'] == True
texts = df_test['Text'].loc[short_mask]
indexes = df_test['Text'].loc[short_mask].index

words = ['uhm', 'ok', 'no', 'uh', 'sure', 'mmm']
text_to_convert = []
rows_to_convert = []
for text, index in zip(texts, indexes.values):
    for word in words:
        regex_pattern = r"\W*\w*" + re.escape(word) + r'\W*\w*'
        matches = re.findall(regex_pattern, text, re.IGNORECASE)
        if not matches:
            continue
        # A row is appended once per matching word; duplicates are removed via set() later.
        rows_to_convert.append(index)
        text_to_convert.append([matches[0], index])

unique_rows = set(rows_to_convert)
print(len(rows_to_convert))
print(len(unique_rows))
print(text_to_convert)

In [None]:
# Relabel every matched row as NEUTRAL (encoding 2).
# rows_to_convert contains index labels, so use label-based .loc instead of
# positional .iloc; the two only coincide for a default RangeIndex.
neutral_rows = sorted(set(rows_to_convert))
df_test.loc[neutral_rows, 'Label'] = 'NEUTRAL'
df_test.loc[neutral_rows, 'Encoding'] = 2

In [None]:
# Persist each relabelled split next to the notebook as a CSV file.
for frame, csv_path in (
    (df_train, 'df_train.csv'),
    (df_val, 'df_val.csv'),
    (df_test, 'df_test.csv'),
):
    frame.to_csv(csv_path)

## From DataFrames to the HF dataset upload

In [None]:
# Remove the helper 'Short Text' column before building the HF dataset.
# errors='ignore' keeps the cell re-runnable: without it a second run, or a run
# where the short-text flagging cell was skipped, raises KeyError because the
# column does not exist.
for frame in (df_train, df_val, df_test):
    frame.drop(columns='Short Text', errors='ignore', inplace=True)

In [68]:
from datasets import Dataset, DatasetDict

# Wrap the three DataFrames into one DatasetDict keyed by split name.
new_ds = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'validation': Dataset.from_pandas(df_val),
    'test': Dataset.from_pandas(df_test),
})

print(new_ds)

DatasetDict({
    train: Dataset({
        features: ['Text', 'Label', 'Encoding', 'Classification Score'],
        num_rows: 24746
    })
    validation: Dataset({
        features: ['Text', 'Label', 'Encoding', 'Classification Score'],
        num_rows: 8249
    })
    test: Dataset({
        features: ['Text', 'Label', 'Encoding', 'Classification Score'],
        num_rows: 8249
    })
})


### HF login

In [39]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login (renders a widget in the notebook); it is
# presumably required before push_to_hub can write to the account — TODO confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [69]:
# Display the DatasetDict about to be uploaded (splits, features, row counts).
new_ds

DatasetDict({
    train: Dataset({
        features: ['Text', 'Label', 'Encoding', 'Classification Score'],
        num_rows: 24746
    })
    validation: Dataset({
        features: ['Text', 'Label', 'Encoding', 'Classification Score'],
        num_rows: 8249
    })
    test: Dataset({
        features: ['Text', 'Label', 'Encoding', 'Classification Score'],
        num_rows: 8249
    })
})

In [70]:
# Upload all splits to the Hub repo DATASET_NAME — the same id the notebook
# loads from, so this updates that dataset repo.
new_ds.push_to_hub(DATASET_NAME)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/LukeGPT88/patient-doctor-text-classifier-eng-dataset/commit/362a95f23b9035c84121afc22f63fcbdb927d37b', commit_message='Upload dataset', commit_description='', oid='362a95f23b9035c84121afc22f63fcbdb927d37b', pr_url=None, pr_revision=None, pr_num=None)

## MEMORY SIZE FOR EACH STRING

In [None]:
import sys

# sys.getsizeof reports the size of the Python str object; subtracting the size
# of the empty string removes the fixed per-object overhead.
empty_str_size = sys.getsizeof("")
size_in_bytes = [sys.getsizeof(text) - empty_str_size for text in new_ds['train']['Text']]
total_size = sum(size_in_bytes)

print(f"Total Size of text strings: {total_size} bytes")
print(f"Average Size for each string: {total_size/len(new_ds['train']['Text'])} bytes")

## Classification Score

In [5]:
from transformers import pipeline
# Text-classification pipeline for the fine-tuned model "<HF_USERNAME>/<TASK>".
# top_k=None is passed so that scores for all labels are returned (the recorded
# output below shows three label/score dicts per text).
classifier = pipeline("text-classification", model=f"{HF_USERNAME}/{TASK}", top_k=None)



In [8]:
# Score every test text; element [0] of each pipeline result is the list of
# per-label score dicts for that text.
pred_score_list = [classifier(text)[0] for text in dataset['test']['Text']]

print(pred_score_list)

[[{'label': 'DOCTOR', 'score': 0.9997432827949524}, {'label': 'PATIENT', 'score': 0.0001748453505570069}, {'label': 'NEUTRAL', 'score': 8.190727385226637e-05}], [{'label': 'DOCTOR', 'score': 0.9996976852416992}, {'label': 'PATIENT', 'score': 0.0002058265235973522}, {'label': 'NEUTRAL', 'score': 9.647852130001411e-05}], [{'label': 'NEUTRAL', 'score': 0.9993240833282471}, {'label': 'PATIENT', 'score': 0.00047072715824469924}, {'label': 'DOCTOR', 'score': 0.00020525652507785708}], [{'label': 'NEUTRAL', 'score': 0.9998699426651001}, {'label': 'PATIENT', 'score': 8.797729969955981e-05}, {'label': 'DOCTOR', 'score': 4.203589560347609e-05}], [{'label': 'DOCTOR', 'score': 0.9996703863143921}, {'label': 'PATIENT', 'score': 0.00024467328330501914}, {'label': 'NEUTRAL', 'score': 8.499332034261897e-05}], [{'label': 'NEUTRAL', 'score': 0.9996292591094971}, {'label': 'PATIENT', 'score': 0.00018980140157509595}, {'label': 'DOCTOR', 'score': 0.00018091539095621556}], [{'label': 'DOCTOR', 'score': 0.99

In [9]:
# Inspect the score dicts produced for the first test text.
pred_score_list[0]

[{'label': 'DOCTOR', 'score': 0.9997432827949524},
 {'label': 'PATIENT', 'score': 0.0001748453505570069},
 {'label': 'NEUTRAL', 'score': 8.190727385226637e-05}]

In [65]:
# Attach the per-text score lists to the test frame. pred_score_list was built by
# iterating dataset['test']['Text'], the same column df_test was created from, so
# rows and scores line up by position.
df_test['Classification Score'] = pred_score_list

In [50]:
# Summarise columns, non-null counts and dtypes of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8249 entries, 0 to 8248
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Text                  8249 non-null   object
 1   Label                 8249 non-null   object
 2   Encoding              8249 non-null   int64 
 3   Classification Score  8249 non-null   object
dtypes: int64(1), object(3)
memory usage: 257.9+ KB


In [66]:
def _zero_scores():
    """Return a fresh placeholder score list (0.0 for each of the three labels)."""
    return [
        {'label': 'PATIENT', 'score': float(0)},
        {'label': 'DOCTOR', 'score': float(0)},
        {'label': 'NEUTRAL', 'score': float(0)},
    ]


# Train/validation rows get placeholder scores so all splits share the same columns.
# Build one independent list per row: `[[...]] * n` would store n references to the
# same list and dicts, so mutating one row's scores would silently change every row.
df_train['Classification Score'] = [_zero_scores() for _ in range(len(df_train.index))]
df_val['Classification Score'] = [_zero_scores() for _ in range(len(df_val.index))]

In [67]:
# Preview the first rows of the training frame, including the placeholder scores.
df_train.head()

Unnamed: 0,Text,Label,Encoding,Classification Score
0,Jace Hasty!,NEUTRAL,2,"[{'label': 'PATIENT', 'score': 0.0}, {'label':..."
1,"Yeah, for sure, so right now it sounds like a ...",DOCTOR,1,"[{'label': 'PATIENT', 'score': 0.0}, {'label':..."
2,"Yeah, no problem.",DOCTOR,1,"[{'label': 'PATIENT', 'score': 0.0}, {'label':..."
3,"No, OK. I'm gonna get you to palpate your hipb...",DOCTOR,1,"[{'label': 'PATIENT', 'score': 0.0}, {'label':..."
4,🇩🇴🇲🇽; I make art sometimes. Bi/pan enby; she/t...,NEUTRAL,2,"[{'label': 'PATIENT', 'score': 0.0}, {'label':..."


## Confusion Matrix

In [None]:
from transformers import pipeline
# NOTE: this rebinds `classifier`, this time without top_k. The next cell reads
# res[0]['label'], so each result is presumably a list holding the single best
# label/score dict — TODO confirm against the transformers pipeline docs.
classifier = pipeline("text-classification", model=f"{HF_USERNAME}/{TASK}")

In [None]:
# Predicted label name for every test text (first entry of each pipeline result).
pred_list = [classifier(text)[0]['label'] for text in new_ds['test']['Text']]

# Ground-truth label names, decoded from the numeric 'Encoding' column.
true_list = [mapping(encoding) for encoding in new_ds['test']['Encoding']]

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_confusion_matrix(cm, names, title='Confusion matrix', 
                            cmap=plt.cm.Blues):
    """Draw a confusion matrix as a heat map on the current pyplot figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are true classes, columns are predicted classes.
    names : iterable of str
        Class names used as tick labels, in the same order as the rows/columns
        of `cm`. Any iterable is accepted (list, dict view, generator).
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to `plt.imshow`.
    """
    # Materialise once so len() and repeated iteration work for any iterable.
    names = list(names)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout last so it also accounts for the axis labels set above.
    plt.tight_layout()

In [None]:
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Compute confusion matrix.
# Pass the class order explicitly: without `labels`, confusion_matrix sorts the
# labels (DOCTOR, NEUTRAL, PATIENT), which does not match the id2label order
# (PATIENT, DOCTOR, NEUTRAL) used for the tick labels, so the axes would be
# mislabelled.
class_names = list(id2label.values())
cm = confusion_matrix(true_list, pred_list, labels=class_names)
np.set_printoptions(precision=2)

# Normalize the confusion matrix by row (i.e. by the number of samples
# in each class). Rows with no true samples stay at 0 instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, class_names,
        title='Normalized confusion matrix')

plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Same metric call for both averaging modes; only `average` differs.
macro, micro = (
    precision_recall_fscore_support(true_list, pred_list, average=mode)
    for mode in ('macro', 'micro')
)

print(f'Macro : {macro}\n Micro : {micro}')

## TRANSLATION TASK

In [None]:
from deep_translator import GoogleTranslator

# The dataset column is named 'Text' (capital T), as used when the DataFrames
# are built earlier in this notebook; 'text' does not exist and raises an error.
texts = dataset['train']['Text']
# Translate the whole training split to Italian in one batch call.
res = GoogleTranslator(source='auto', target='it').translate_batch(texts)

In [None]:
from deep_translator import GoogleTranslator

# The dataset column is named 'Text' (capital T), as used when the DataFrames
# are built earlier in this notebook; read it once and reuse it.
source_texts = dataset['train']['Text']

# One column per language: 'en' holds the originals, the others the translations.
data = {}
data['en'] = source_texts
for lang in ['it']:
    # Build the translator once per target language instead of once per sentence.
    translator = GoogleTranslator(source='auto', target=lang)
    examples_list = []
    for idx, text in enumerate(source_texts):
        examples_list.append(translator.translate(text))
        print(idx)  # progress indicator
    data[lang] = examples_list

df = pd.DataFrame(data)
df.to_csv('patient-doctor-text-classifier-it-dataset.csv')