## PATIENT - DOCTOR DATASET

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset

# Load the "Postzeun/Patient-Doctor" dataset through `datasets.load_dataset`.
dataset = load_dataset("Postzeun/Patient-Doctor")

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# loaded dataset object.
dataset

In [None]:
# Split the raw transcript lines into patient ("P:") and doctor ("D:")
# utterances. Each kept utterance is stored without its speaker tag in
# `data['text']`, with label 0 (patient) or 1 (doctor) in `data['label']`, and
# also in the per-speaker lists. Lines carrying neither tag are skipped.
text_lines = dataset['train']['text']

data = {'text': [], 'label': []}

patient_text = []
doctor_text = []

for line in text_lines:
    if line.startswith('P:'):
        label, bucket = 0, patient_text
    elif line.startswith('D:'):
        label, bucket = 1, doctor_text
    else:
        continue
    # Remove only the leading two-character speaker tag. A str.replace() here
    # would also delete any later "P:" / "D:" inside the utterance itself
    # (for instance the tail of "BP:"), corrupting the text.
    line = line[2:].strip()
    bucket.append(line)
    data['text'].append(line)
    data['label'].append(label)

In [None]:
# Tabulate the collected utterances: one row per utterance, with the columns
# `text` and `label` taken from the `data` dict, then print the frame summary.
df1 = pd.DataFrame(data, columns=['text', 'label'])
df1.info()

In [None]:
# Number of rows per label in df1 (0 = patient, 1 = doctor).
df1['label'].value_counts()

In [None]:
import os

# Read the CSV stored under ./dataset relative to the current working
# directory, expose its `bio` column as `text`, tag every row with label 2 and
# keep only the two columns shared with df1.
PROJECT_PATH = os.getcwd()
df2 = (
    pd.read_csv(os.path.join(PROJECT_PATH, 'dataset/pulsar_contents.csv'))
    .rename(columns={"bio": "text"})
    .assign(label=2)
    .loc[:, ['text', 'label']]
)
df2.info()

In [None]:
# With three labels P - D - N: stack the dialogue utterances (labels 0 and 1)
# on top of the label-2 rows from df2.
frames = [df1, df2]

# ignore_index=True renumbers the rows 0..n-1. Without it each input frame
# keeps its own 0-based labels, so the combined frame has duplicate index
# values and label-based lookups such as df.loc[0] return several rows.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Optional check of how many `text` values are missing:
# pd.isna(df['text']).value_counts()
# Drop, in place, every row that has a missing value in any column, then print
# the frame summary.
df.dropna(inplace=True)
df.info()

#### Check non-latin characters

In [None]:
# Mojibake repair table. Each key is the character sequence that appears when
# the UTF-8 bytes of a typographic character (curly quote, dash, ellipsis,
# prime, superscript sign, ...) are mis-decoded as ISO-8859-1; each value is
# the plain-ASCII stand-in written in its place.
LATIN_1_CHARS = (
    ('\xe2\x80\x99', "'"),
    ('\xc3\xa9', 'e'),
    ('\xe2\x80\x90', '-'),
    ('\xe2\x80\x91', '-'),
    ('\xe2\x80\x92', '-'),
    ('\xe2\x80\x93', '-'),
    ('\xe2\x80\x94', '-'),
    ('\xe2\x80\x98', "'"),
    ('\xe2\x80\x9b', "'"),
    ('\xe2\x80\x9c', '"'),
    ('\xe2\x80\x9d', '"'),
    ('\xe2\x80\x9e', '"'),
    ('\xe2\x80\x9f', '"'),
    ('\xe2\x80\xa6', '...'),
    ('\xe2\x80\xb2', "'"),
    ('\xe2\x80\xb3', "'"),
    ('\xe2\x80\xb4', "'"),
    ('\xe2\x80\xb5', "'"),
    ('\xe2\x80\xb6', "'"),
    ('\xe2\x80\xb7', "'"),
    ('\xe2\x81\xba', "+"),
    ('\xe2\x81\xbb', "-"),
    ('\xe2\x81\xbc', "="),
    ('\xe2\x81\xbd', "("),
    ('\xe2\x81\xbe', ")"),
)


def clean_latin1(data):
    """Return *data* as UTF-8 bytes with known mojibake sequences repaired.

    Args:
        data: Text to clean. A ``str`` is used as is; ``bytes``/``bytearray``
            are first decoded as ISO-8859-1, which accepts every byte value.

    Returns:
        bytes: The text, with every sequence listed in ``LATIN_1_CHARS``
        replaced by its ASCII stand-in, encoded as UTF-8.

    Raises:
        AttributeError: If *data* is neither text nor bytes (e.g. a float NaN).
        UnicodeEncodeError: If the text cannot be encoded as UTF-8 (lone
            surrogates).
    """
    # On Python 3 str.encode('utf-8') never raises UnicodeDecodeError, so a
    # try/except around it can never reach the repair step. The replacement is
    # therefore applied unconditionally; it is a no-op for text that contains
    # none of the listed sequences.
    if isinstance(data, (bytes, bytearray)):
        data = bytes(data).decode('iso-8859-1')
    for _hex, _char in LATIN_1_CHARS:
        data = data.replace(_hex, _char)
    return data.encode('utf-8')

In [None]:
# Run every text through clean_latin1 and collect the ones it cannot process.
error = []
for text in df['text'].values:
    try:
        clean_latin1(text)
    # Only the failures clean_latin1 can produce are caught: encoding errors,
    # or a non-string value with no str methods. A bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    except (UnicodeError, AttributeError, TypeError):
        print(f'error for {text}')
        error.append(text)

print(len(error))

In [None]:
# Bar chart of the number of rows per label in the combined frame.
df['label'].value_counts().plot(kind='bar')

## Adding the Classification Score Field

In [None]:
from transformers import pipeline
# Text-classification pipeline for the model "<HF_USERNAME>/<TASK>".
# NOTE(review): HF_USERNAME and TASK are assigned in a later cell of this file
# (under "Hugging Face login"); that cell has to be executed before this one,
# otherwise this line raises NameError.
# NOTE(review): top_k=None presumably makes the pipeline return a score for
# every label rather than only the best one; confirm against the installed
# transformers version.
classifier = pipeline("text-classification", model=f"{HF_USERNAME}/{TASK}", top_k=None)

In [None]:
# Classify every text of the test split, keep the first element of each
# pipeline result, and attach the collected scores to df_test as a new column.
pred_score_list = [classifier(text)[0] for text in dataset['test']['Text']]

df_test['Classification Score'] = pred_score_list

In [None]:
# Placeholder (all-zero) scores for the train and validation frames: every row
# gets one {'label', 'score'} entry per label.
def _zero_scores():
    """Return a fresh list of zero-score entries, one per label."""
    return [
        {'label': name, 'score': float(0)}
        for name in ('PATIENT', 'DOCTOR', 'NEUTRAL')
    ]


# Build one independent list per row. A `[[...]] * n` literal would store the
# very same list (and dict) objects in every row, so changing the scores of
# one row would silently change them for all rows.
df_train['Classification Score'] = [_zero_scores() for _ in df_train.index]
df_val['Classification Score'] = [_zero_scores() for _ in df_val.index]

## Hugging Face login

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook (huggingface_hub helper).
notebook_login()

In [None]:
# Naming constants for the Hugging Face Hub.
HF_USERNAME = "LukeGPT88"
PROJECT_NAME = "patient-doctor-text-classifier"
SUB_PROJECT_NAME = "eng"
# Dataset repository id: "<user>/<project>-<sub-project>-dataset-0528".
DATASET_NAME = f"{HF_USERNAME}/{PROJECT_NAME}-{SUB_PROJECT_NAME}-dataset-0528"
# Model name "<project>-<sub-project>"; combined with HF_USERNAME where the
# classification pipeline is created.
TASK = f"{PROJECT_NAME}-{SUB_PROJECT_NAME}"

In [None]:
# NOTE(review): absolute, machine-specific folder; adjust it when running on a
# different machine.
folder_path = '/home/luca/Documents/Extendi/ML/Extendi/AIProjects/TextClassification/PatientDoctorTextClassifier/Dataframes/20240528'
# Rebinds `df` to the contents of df_tot.csv read from that folder.
df = pd.read_csv(f'{folder_path}/df_tot.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Shuffle the rows reproducibly (fixed seed) and renumber them, then cut the
# shuffled frame into 60% train / 20% validation / 20% test.
shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_end = int(.6 * len(df))
validate_end = int(.8 * len(df))

# Positional slicing yields the same three pieces as
# np.split(shuffled, [train_end, validate_end]) but avoids calling np.split on
# a DataFrame, which relies on the deprecated DataFrame.swapaxes and emits a
# FutureWarning on recent pandas releases.
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

In [None]:
# Wrap each pandas split in a `datasets.Dataset`.
train_ds, validation_ds, test_ds = map(
    Dataset.from_pandas, (train, validate, test)
)

# Bundle the three splits under their split names and push the bundle to the
# Hub repository named by DATASET_NAME.
ddict = DatasetDict(
    {"train": train_ds, "validation": validation_ds, "test": test_ds}
)
ddict.push_to_hub(DATASET_NAME)