<a href="https://colab.research.google.com/github/petersetiabudi4/RekmedVSM/blob/main/Rekmed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Preprocessing

In [None]:
!pip install nltk



In [None]:
import nltk
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases load
# it from the 'punkt' package, while NLTK >= 3.8.2 looks for 'punkt_tab' instead,
# so both are fetched to keep the notebook runnable with an unpinned install.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Read the raw doctor / patient / medicine records from Drive
df = pd.read_csv('/content/drive/MyDrive/Information Retrieval/dokter pasien obat_new.csv')

# Tokenize and lowercase every text column in a single pass per column.
# Missing cells (NaN) are passed through unchanged instead of being tokenized.
for column in ('doctor', 'patient', 'medicine'):
    df[column] = df[column].apply(
        lambda cell: [token.lower() for token in word_tokenize(str(cell))]
        if pd.notnull(cell) else cell
    )

# Persist the token lists; to_csv writes each list as its string representation
df.to_csv('preprocessed_rekmed.csv', index=False)

print(df.shape)
print(df.head())

(876, 3)
                    doctor patient medicine
0  [[, isi, nama, anda, ]]     NaN      NaN
1              [andrianto]     NaN      NaN
2  [[, isi, nama, anda, ]]     NaN      NaN
3  [[, isi, nama, anda, ]]     NaN      NaN
4  [[, isi, nama, anda, ]]     NaN      NaN


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reload the preprocessed records. The path is relative so that it points at the
# same location the preprocessing cell writes to with to_csv('preprocessed_rekmed.csv');
# an absolute '/content/...' path only matches when the working directory is /content.
# Missing cells become empty strings so they concatenate cleanly and support
# string methods such as .lower() later on.
data = pd.read_csv('preprocessed_rekmed.csv').fillna('')

# Build one text document per row: patient, doctor and medicine joined by single spaces
data['combination'] = data['patient'].astype(str) + ' ' + data['doctor'].astype(str) + ' ' + data['medicine'].astype(str)

# Fit a TF-IDF vocabulary on the combined documents and keep the document-term
# matrix; both are reused by search() to score queries against every row.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combination'])

# Tokenization and Lowercasing for Query Preprocessing
def preprocess_query(query):
    """Normalise a free-text query for TF-IDF lookup.

    The query is split on runs of whitespace, each piece is lowercased, and the
    pieces are joined back together with single spaces.

    Args:
        query: Raw query string.

    Returns:
        The lowercased query with whitespace collapsed to single spaces and
        leading/trailing whitespace removed.
    """
    return ' '.join(piece.lower() for piece in query.split())

def search(query, threshold=0.3, relevant_keywords=None):
    """Retrieve rows whose TF-IDF cosine similarity to the query meets a threshold.

    Uses the module-level ``data``, ``tfidf_vectorizer`` and ``tfidf_matrix``.

    Args:
        query: Free-text query. If it contains one of the phrases
            "doctor for patient", "patient for doctor", "medicine for patient"
            or "medicine by doctor" (matched case-insensitively), the matching
            query type is reported; otherwise the type is 'unrecognized'.
        threshold: Minimum cosine similarity (inclusive) for a row to be returned.
        relevant_keywords: Optional keywords used to count relevant hits. A
            returned row is counted when every keyword is a substring of the
            lowercased patient or doctor field, depending on the query type.
            Ignored when the query type is 'unrecognized'.

    Returns:
        A 4-tuple ``(query_type, results, result_count, relevant_count)`` where
        ``results`` is a list of rows from ``data``. When no row reaches the
        threshold the tuple is ``('No results found', [], 0, 0)``.
    """
    # (trigger phrase, reported query type, column the relevance keywords are checked against)
    # Order matters: the first phrase found in the query wins.
    query_patterns = (
        ("doctor for patient", 'find doctor with which patient', 'patient'),
        ("patient for doctor", 'find patient with which doctor', 'doctor'),
        ("medicine for patient", 'find medicine prescribed for which patient', 'patient'),
        ("medicine by doctor", 'find medicine prescribed by which doctor', 'doctor'),
    )

    preprocessed_query = preprocess_query(query)
    query_vector = tfidf_vectorizer.transform([preprocessed_query])
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # enumerate() yields row positions in tfidf_matrix, so select positionally with iloc
    results = [data.iloc[i] for i, score in enumerate(scores) if score >= threshold]

    if not results:
        # Same arity as the normal return so callers can always unpack four values
        return 'No results found', [], 0, 0

    # Detect the query type on the normalised query so detection is
    # case-insensitive, consistent with how the query is vectorised.
    query_type, keyword_field = 'unrecognized', None
    for phrase, label, field in query_patterns:
        if phrase in preprocessed_query:
            query_type, keyword_field = label, field
            break

    relevant_count = 0
    if relevant_keywords and keyword_field is not None:
        # Field text is lowercased below, so lowercase the keywords as well
        keywords = [str(keyword).lower() for keyword in relevant_keywords]
        relevant_count = sum(
            all(keyword in str(row[keyword_field]).lower() for keyword in keywords)
            for row in results
        )

    return query_type, results, len(results), relevant_count

# Example queries
query_1 = "find doctor for patient noah hidayat"
query_2 = "find patient for doctor anita"
query_3 = "find medicine for patient Guntur"
query_4 = "find medicine by doctor yudhisindo"

# Lowercase keywords that a retrieved row must contain to count as relevant
relevant_keywords_1 = ['noah', 'hidayat']
relevant_keywords_2 = ['anita']
relevant_keywords_3 = ['guntur']
relevant_keywords_4 = ['yudhisindo']

# The numbered names are kept because the precision/recall cell reads count_N and relevant_count_N
query_type_1, results_1, count_1, relevant_count_1 = search(query_1, relevant_keywords=relevant_keywords_1)
query_type_2, results_2, count_2, relevant_count_2 = search(query_2, relevant_keywords=relevant_keywords_2)
query_type_3, results_3, count_3, relevant_count_3 = search(query_3, relevant_keywords=relevant_keywords_3)
query_type_4, results_4, count_4, relevant_count_4 = search(query_4, relevant_keywords=relevant_keywords_4)

# One entry per query: the search outcome plus the two fields shown for each hit.
# Each label doubles as the column name once lowercased (e.g. 'Patient' -> 'patient').
reports = [
    (query_1, query_type_1, results_1, count_1, relevant_count_1, ('Patient', 'Doctor')),
    (query_2, query_type_2, results_2, count_2, relevant_count_2, ('Doctor', 'Patient')),
    (query_3, query_type_3, results_3, count_3, relevant_count_3, ('Patient', 'Medicine')),
    (query_4, query_type_4, results_4, count_4, relevant_count_4, ('Doctor', 'Medicine')),
]

for position, (shown_query, shown_type, hits, hit_count, relevant_hits, (first_label, second_label)) in enumerate(reports):
    # Every report after the first is preceded by a blank line
    lead = "\n" if position else ""
    print(f"{lead}Query: {shown_query}, Query Type: {shown_type}, Count: {hit_count}, Relevant Document Count: {relevant_hits}")
    for result in hits:
        print(f"Result: {first_label}={result[first_label.lower()]}, {second_label}={result[second_label.lower()]}")


Query: find doctor for patient noah hidayat, Query Type: find doctor with which patient, Count: 19, Relevant Document Count: 8
Result: Patient=['arif', 'hidayat'], Doctor=['dr', 'hendry']
Result: Patient=['noah', 'hidayat'], Doctor=['supardi']
Result: Patient=['noah', 'hidayat'], Doctor=['supardi']
Result: Patient=['noah', 'hidayat'], Doctor=['supardi']
Result: Patient=['noah', 'hidayat'], Doctor=['supardi']
Result: Patient=['arif', 'hidayat'], Doctor=
Result: Patient=['noah', 'santoso'], Doctor=
Result: Patient=['scarlett', 'hidayat'], Doctor=
Result: Patient=['scarlett', 'hidayat'], Doctor=
Result: Patient=['noah', 'lee'], Doctor=
Result: Patient=['noah', 'lee'], Doctor=
Result: Patient=['noah', 'setiawan'], Doctor=
Result: Patient=['noah', 'hidayat'], Doctor=['supardi']
Result: Patient=['noah', 'hidayat'], Doctor=['supardi']
Result: Patient=['noah', 'hidayat'], Doctor=['supardi']
Result: Patient=['noah', 'hidayat'], Doctor=['supardi']
Result: Patient=['noah', 'prasetyo'], Doctor=
Re

In [None]:
# Ground-truth count for each query; used below as the recall denominator
ground_truth1 = 8
ground_truth2 = 14
ground_truth3 = 8
ground_truth4 = 16


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# precision = relevant retrieved / retrieved
# recall    = relevant retrieved / ground truth
# The guarded ratio avoids a ZeroDivisionError when a query retrieves nothing
# (count is 0) or a ground-truth count is 0.
precision1 = _safe_ratio(relevant_count_1, count_1)
recall1 = _safe_ratio(relevant_count_1, ground_truth1)

precision2 = _safe_ratio(relevant_count_2, count_2)
recall2 = _safe_ratio(relevant_count_2, ground_truth2)

precision3 = _safe_ratio(relevant_count_3, count_3)
recall3 = _safe_ratio(relevant_count_3, ground_truth3)

precision4 = _safe_ratio(relevant_count_4, count_4)
recall4 = _safe_ratio(relevant_count_4, ground_truth4)

# (heading, precision, recall) for each query, printed in order
metric_reports = (
    ("Query 1 : find doctor for patient noah hidayat", precision1, recall1),
    ("Query 2 : find patient for doctor anita", precision2, recall2),
    ("Query 3 : find medicine for patient guntur", precision3, recall3),
    ("Query 4 : find medicine by doctor yudhisindo", precision4, recall4),
)

for heading, precision, recall in metric_reports:
    print(f"\n{heading}")
    print(f"Precision : {precision}")
    print(f"Recall : {recall}")


Query 1 : find doctor for patient noah hidayat
Precision : 0.42105263157894735
Recall : 1.0

Query 2 : find patient for doctor anita
Precision : 0.875
Recall : 1.0

Query 3 : find medicine for patient guntur
Precision : 1.0
Recall : 1.0

Query 4 : find medicine by doctor yudhisindo
Precision : 1.0
Recall : 1.0
