In [None]:
import numpy as np
import pandas as pd
import re
from transformers import pipeline
import csv
import os

In [None]:
# Hugging Face account and model naming. HF_USERNAME and TASK are joined as
# "<HF_USERNAME>/<TASK>" to form the model id passed to pipeline() below.
HF_USERNAME = "LukeGPT88"
PROJECT_NAME = "patient-doctor-text-classifier"
SUB_PROJECT_NAME = "eng"
TASK = f"{PROJECT_NAME}-{SUB_PROJECT_NAME}"

# Location of the exported search data: input files are read from, and results
# written under, FOLDERPATH/TOPIC_CC. TOPIC_SC prefixes the generated CSV names.
FOLDERPATH = 'PulsarSearchesExport'
TOPIC_CC = 'HavasGlobal-Lupus'
TOPIC_SC = 'havas_global-lupus'

# From Pulsar Searches 

In [None]:
# Load the exported posts from the Excel file (path relative to the working
# directory) and print a summary of the resulting columns.
df = pd.read_excel(f'{FOLDERPATH}/{TOPIC_CC}/107503-all-2024-05-17-07-39-05-1163478.xlsx')
df.info()

In [None]:
# List the column names available in the export.
df.columns

In [None]:
# Inspect the two author columns; they are passed along as account/user in the inference loop below.
df[['user name', 'user screen name']]

In [None]:
# Keep one row per distinct post text, then discard rows that have no text at all.
df = df.drop_duplicates(subset=['content']).dropna(subset=['content'])

In [None]:
# Re-check the frame summary after duplicates and empty posts were removed.
df.info()

In [None]:
# Extract the post text and the author bio columns as arrays, and bundle them
# under the keys the inference cells read ('contents' and 'bios').
contents, bios = (df[column].values for column in ('content', 'bio'))
data = dict(contents=contents, bios=bios)

## Improve bio inference by adding medical bios

In [None]:
# Load the saved table of medical bios; its 'bio' column is cleaned in the next cell.
df_medical_bios = pd.read_csv(f'{FOLDERPATH}/{TOPIC_CC}/output/medical_bios_df.csv')

In [None]:
# Clean every medical bio and write the cleaned text back into the 'bio' column.
# NOTE(review): get_meaningful_text is defined in a later cell of this notebook,
# so that cell has to be executed before this one on a fresh kernel.
meaningful_medical_bio_array = [
  get_meaningful_text(medical_bio) for medical_bio in df_medical_bios['bio']
]
df_medical_bios['bio'] = meaningful_medical_bio_array

In [None]:
# Display the cleaned medical bios.
df_medical_bios['bio']

In [None]:
# Keep one row per distinct cleaned bio, then discard rows whose bio is missing.
df_medical_bios = df_medical_bios.drop_duplicates(subset=['bio']).dropna(subset=['bio'])

In [None]:
# Show the bios whose text contains 'Assistant' (case-sensitive match).
assistant_mask = df_medical_bios['bio'].str.contains('Assistant')
df_medical_bios.loc[assistant_mask, 'bio']

In [None]:
# Look at a single cleaned bio (the second remaining row).
df_medical_bios['bio'].values[1]

## Endpoint Inference using HF Endpoint

In [None]:
# Size check of the inference inputs. Note that len(data) is the number of keys
# in the dict (always 2 here), not a row count.
print(len(contents), len(bios), len(data))

In [None]:
# Preview the first ten post texts.
data['contents'][:10]

In [None]:
def get_csv_file(data, split=0):
  """
    Write the classification results in ``data`` to a CSV file under
    FOLDERPATH/TOPIC_CC/output.

    Args:
      data: sequence of result dicts with the keys 'account', 'user',
        'content' and 'bio'; 'content' and 'bio' are themselves dicts with
        'text', 'label' and 'score'.
      split: 0 (default) writes the final '<TOPIC_SC>_export_inference.csv';
        any other value writes a partial '..._p<int(split)>.csv' file.

    When ``split`` is greater than 1, the partial file written two splits
    earlier is deleted if it exists, so only the two newest partial files
    remain on disk.
  """
  columns = {
    "Account": [],
    "User": [],
    "Content": [],
    "Predicted Content Type (Label / Score)": [],
    "Bio": [],
    "Predicted Bio Type (Label / Score)": [],
  }
  for res in data:
    content_part = res['content']
    bio_part = res['bio']
    columns["Account"].append(res['account'])
    columns["User"].append(res['user'])
    columns["Content"].append(content_part['text'])
    columns["Predicted Content Type (Label / Score)"].append(f"{content_part['label']} / {content_part['score']}")
    columns["Bio"].append(bio_part['text'])
    columns["Predicted Bio Type (Label / Score)"].append(f"{bio_part['label']} / {bio_part['score']}")
  out_df = pd.DataFrame(columns)

  if split != 0:
    filename = f'{FOLDERPATH}/{TOPIC_CC}/output/{TOPIC_SC}_export_inference_p{int(split)}.csv'
  else:
    filename = f'{FOLDERPATH}/{TOPIC_CC}/output/{TOPIC_SC}_export_inference.csv'
  out_df.to_csv(filename)

  # Nothing older to clean up for the final file or the first partial file.
  if not split > 1:
    return
  filename_old = f'{FOLDERPATH}/{TOPIC_CC}/output/{TOPIC_SC}_export_inference_p{int(split - 2)}.csv'
  # isfile() is False for a missing path, so it also covers the existence check.
  if os.path.isfile(filename_old):
    os.remove(filename_old)


In [None]:
# Load the text-classification pipeline for the model id "<HF_USERNAME>/<TASK>".
classifier = pipeline("text-classification", model=f"{HF_USERNAME}/{TASK}")

In [None]:
# Number of (content, bio) pairs that can be formed from the two arrays.
len(list(zip(data['contents'], data['bios'])))

In [None]:
def get_meaningful_text(text):
  """
    Strip social-media noise from ``text`` and normalise its whitespace.

    Removed:
      * @mentions: an '@' followed by word characters, but only when the '@'
        is not directly preceded by a word character, so the '@' inside an
        e-mail address such as ``name@example.com`` is left alone;
      * #hashtags: a '#' followed by word characters;
      * links: everything from an occurrence of 'http' up to the next
        whitespace.

    Args:
      text: the raw post or bio. Anything that is not a ``str`` (for example
        the ``NaN`` pandas uses for a missing cell, or ``None``) is treated
        as empty text instead of raising ``TypeError``.

    Returns:
      The cleaned string, with runs of whitespace collapsed to single spaces
      and leading/trailing whitespace removed; ``''`` for non-string input.
  """
  # Missing values read by pandas arrive as float NaN; re.sub would reject them.
  if not isinstance(text, str):
    return ''

  # \B applies to the '@' alternative only: it requires that no word character
  # precedes the '@'.
  pattern = r'\B@\w+|#\w+|http\S*'
  cleaned_text = re.sub(pattern, '', text)

  # The removals leave gaps behind; collapse them and trim the ends.
  cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

  return cleaned_text

In [None]:
# Manual check of get_meaningful_text on a sample post.
# `text` is assigned but not used by the call below, which cleans a different literal.
text = "@YodaKnows3 @AP Well he had prostate cancer, so my guess is he died of lupus"
res = get_meaningful_text("✅ It is my IMMENSE PLEASURE to share with you our SEMINAR on systemic #Lupus just published in... The LANCET!!!!! Amazing work driven by @alberta_hoi 👍LINK: https://t.co/iayAks7zWC https://t.co/9Q99vJghT6")
print(res)

In [None]:
classifier = pipeline("text-classification", model=f"{HF_USERNAME}/{TASK}")
outcomes = []
# One tuple per post: (account, user, content, bio). The trailing [:2] limits
# the run to the first two rows.
tot_rows = list(zip(df['user name'].values, df['user screen name'].values, data['contents'], data['bios']))[:2]
last_index = len(tot_rows) - 1
for i, (account, user, content, bio) in enumerate(tot_rows):
  meaningful_content = get_meaningful_text(content)
  print(meaningful_content)
  # Posts whose cleaned text is longer than 500 characters are not classified.
  if len(meaningful_content) <= 500:
    # A bio that is not a plain str (e.g. a missing value) is classified as empty text.
    meaningful_bio = get_meaningful_text(bio) if type(bio) == str else ''
    content_prediction = classifier(meaningful_content)[0]
    bio_prediction = classifier(meaningful_bio)[0]
    # The original (uncleaned) text is stored next to each prediction.
    outcomes.append({
      "account": account,
      "user": user,
      "content": {"text": content, **content_prediction},
      "bio": {"text": bio, **bio_prediction},
    })
  # Every 1000th row (but not row 0) writes a numbered partial export.
  if i != 0 and i % 1000 == 0:
    split = i/1000
    get_csv_file(outcomes, split)
  # The last row writes the complete export.
  if i == last_index:
    get_csv_file(outcomes)

# outcomes = []
# for i, bio in enumerate(bios[:110]):
#   if type(bio) == float:
#     bio = None
#     outcomes.append({ **{"bio": bio}, **{"label": None, "score": None}})
#     continue
#   if len(bio) <= 500:
#     outcomes.append({ **{"bio": bio}, **classifier(bio)[0]})
#   if i%1000 == 0 and i != 0:
#     split = i/1000
#     get_csv_file(outcomes, 'bio', split)
#   if i == len(bios[:110]) - 1:
#     get_csv_file(outcomes, 'bio')

In [None]:
# fields = ['Content']
# with open(f'{topics_sc}_export_inference.csv', 'w') as f:
#   # using csv.writer method from CSV package
#   write = csv.writer(f)
    
#   write.writerow(fields)
#   write.writerows(outcomes)

In [None]:
# Reload the inference export saved under the dated output folder. The path is
# relative to the working directory, like the other reads in this notebook,
# rather than an absolute path that exists on one machine only.
# From here on `df` refers to this export, not to the raw posts loaded earlier.
df = pd.read_csv(f'{FOLDERPATH}/{TOPIC_CC}/output/20240524/{TOPIC_SC}_export_inference.csv')
# df['Score'].to_csv(f'{TOPIC_SC}_score.csv', index=False)
df.info()

In [None]:
# Print every bio that contains the (case-sensitive) word 'patient'.
# dropna() returns a new Series, so missing bios are skipped here without
# attempting an in-place change on a column selected from df.
for bio in df['Bio'].dropna():
  # Guard against any remaining non-text values in the column.
  if isinstance(bio, str) and 'patient' in bio:
    print(bio)

In [None]:
# The column holds "<label> / <score>" strings; keep only the label part
# (the text before the first '/'), without surrounding spaces.
bio_label = [
  label_and_score.split('/')[0].strip()
  for label_and_score in df['Predicted Bio Type (Label / Score)']
]

df['bio_label'] = bio_label

In [None]:
# Count how many bios received each predicted label.
df['bio_label'].value_counts()

#### Invoking a SageMaker endpoint for inference (optional)

In [None]:
import json

# boto3 provides the client used below; without this import the cell fails
# with a NameError on a fresh kernel.
import boto3
from sagemaker.huggingface.model import HuggingFacePredictor

# Runtime client used to call an already deployed SageMaker endpoint.
runtime = boto3.client("sagemaker-runtime")
# The request body is produced with json.dumps, so it is declared as JSON.
content_type = "application/json"
event = "I'm going to the lupus summit in Washington this weekend. And the lupus foundation itself is asking for certain reforms regarding your care . One is so your health insurance respects the decision of your dr prescribing med and they can't make you try another med first"
payload = {"inputs": event}
endpoint_name = 'patient-doctor-text-classifier-endpoint'

# Send the text to the endpoint and print the raw response returned by boto3.
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=json.dumps(payload)
)
print(response)