### In this notebook we will implement a classification model and compute cosine similarity

In [83]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
from numpy import dot
from numpy.linalg import norm

In [84]:
# Load the pickled domain stop-word collection that the text cleaner filters against.
# NOTE(review): pickle.load can execute arbitrary code - only use it on a trusted file.
with open('stop_words.ob', mode='rb') as stop_words_file:
    domain_stop_word = pickle.load(stop_words_file)

In [85]:
# Location of the CSV that holds the descriptions and their labels.
file_path = 'diseases_with_description.csv'
# Parse the file into a DataFrame and preview its first rows as plain text.
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

                                         Description           D_Name
0  bone, muscle, ear, otitis, hearing, airway, me...  musculoskeletal
1  ear, otitis, hearing, bleeding, sinusitis, nos...         ear_nose
2  ventilation, oxygen, airway, dyspnea, copd, br...      respiratory


In [86]:
def clean_text_func(text, stop_words=None):
    """Normalise a free-text description and drop stop words.

    The text is lower-cased, every character other than ``a``-``z``, ``/`` and
    ``^`` is treated as a separator, and the remaining whitespace-delimited
    tokens that are not stop words are joined with single spaces.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()``. ``None`` and float
        ``NaN`` are treated as empty text instead of being turned into the
        literal tokens ``"none"`` / ``"nan"``.
    stop_words : collection of str, optional
        Tokens to remove. When omitted, the module-level ``domain_stop_word``
        is used.

    Returns
    -------
    str
        The kept tokens separated by single spaces, without leading or
        trailing whitespace; ``""`` when nothing is left.
    """
    # A missing value must not leak into the corpus as the word "nan"/"none".
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = domain_stop_word

    lowered = str(text).lower()
    # A single pass replaces the former chain of substitutions: punctuation,
    # digits and every other symbol become separators. "/" and "^" are left in
    # place, exactly as the previous implementation left them.
    # NOTE(review): a token such as "and/or" therefore bypasses the stop-word
    # check - confirm whether "/" and "^" should be stripped as well.
    separated = re.sub(r"[^a-z/^]", " ", lowered)
    return " ".join(token for token in separated.split() if token not in stop_words)

# Run every description through the cleaner; the column is overwritten with the result.
df['Description'] = df['Description'].apply(clean_text_func)
# Display the first rows of the cleaned frame.
df.head()

Unnamed: 0,Description,D_Name
0,bone muscle ear otitis hearing airway membrane...,musculoskeletal
1,ear otitis hearing bleeding sinusitis nose ext...,ear_nose
2,ventilation oxygen airway dyspnea copd breathi...,respiratory


# Text Vectorization (Bag-of-Words and TF-IDF):

In [87]:
# Term-count representation of the descriptions, using scikit-learn's
# built-in English stop-word list.
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(df['Description'].tolist())

# TF-IDF representation of the same descriptions with the same stop-word setting.
cv_tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = cv_tfidf.fit_transform(df['Description'].tolist())

In [88]:
# Dense, labelled copies of the two document-term matrices:
# one row per description, one column per vocabulary term.
df_cv = pd.DataFrame(
    data=X.toarray(),
    columns=cv.get_feature_names_out(),
)
df_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=cv_tfidf.get_feature_names_out(),
)

In [89]:
# Report the (rows, columns) shape of the count-based document-term frame.
print(df_cv.shape)
def cosine(v1, v2):
    """Return the cosine similarity of two numeric vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors accepted by ``numpy.dot`` and ``numpy.linalg.norm``; they are
        used here with 1-D count / TF-IDF rows.

    Returns
    -------
    float
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either vector
        has zero norm (for example a text with no in-vocabulary words), so the
        caller gets "no similarity" instead of ``nan`` and a RuntimeWarning.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

(3, 46)


### Cosine Similarity 

In [90]:
# Free-text query that is compared against every stored description.
new_text = ["dizziness loss of balance  vomiting tinnitus of hearing in the high frequency range in one ear difficulty focusing your eyes "]
# Project the query onto the vocabularies learned by the two fitted vectorisers.
new_text_cv = cv.transform(new_text).toarray()[0]
new_text_tfidf = cv_tfidf.transform(new_text).toarray()[0]

# Score the query against each row of the count frame and the TF-IDF frame.
for row_idx in range(len(df)):
    print(f"This is chpter number : {row_idx} ")
    count_score = cosine(df_cv.iloc[row_idx], new_text_cv)
    print(f"Cosin cv :    {count_score} ")
    tfidf_score = cosine(df_tfidf.iloc[row_idx], new_text_tfidf)
    print(f"Cosin TFIDF : {tfidf_score} ")

This is chpter number : 0 
Cosin cv :    0.31622776601683794 
Cosin TFIDF : 0.2786047033951415 
This is chpter number : 1 
Cosin cv :    0.31622776601683794 
Cosin TFIDF : 0.2710426238493392 
This is chpter number : 2 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 


### Implementing the classification model: LogisticRegression

In [91]:
# Show the column labels of the frame before picking the feature and target columns.
df.columns

Index(['Description', 'D_Name'], dtype='object')

In [92]:
# Features are the cleaned descriptions; the target is the D_Name label column.
X_train = df['Description']
y_train = df['D_Name']

In [93]:
# Fit a separate count vectoriser (no built-in stop-word list) on the training text.
cv1 = CountVectorizer()
X_train_cv1 = cv1.fit_transform(raw_documents=X_train)
# Dense, labelled copy of the training matrix, kept for inspection.
pd_cv1 = pd.DataFrame(
    data=X_train_cv1.toarray(),
    columns=cv1.get_feature_names_out(),
)

In [94]:
# Train a logistic-regression classifier, with scikit-learn's default
# settings, on the count features and the D_Name labels.
lr = LogisticRegression()
lr.fit(X_train_cv1, y_train)

In [95]:
# Free-text example; the printed prediction for it is 'ear_nose'.
X_test = "Difficulty sleeping or staying asleep Fever Fluid draining from ear  Loss of balance. Hearing difficulties. Ear pain"
# Apply the same cleaning that was applied to the training descriptions.
cleaned_text = clean_text_func(X_test)
# Encode with the vocabulary learned by cv1 (transform only, no re-fit).
X_test_cv3  = cv1.transform([cleaned_text])
# Predict the label for this single document and print it.
y_pred_cv3 = lr.predict(X_test_cv3)
print(y_pred_cv3)

['ear_nose']


In [96]:
# Free-text example; the printed prediction for it is 'respiratory'.
# NOTE(review): "Ventialtion" looks like a typo of "Ventilation"; as written it
# presumably does not match the training vocabulary - confirm whether intended.
X_test = "Uneasy breathing. No proper Ventialtion. Decrease in the Oxygen Levels in blood."
# Apply the same cleaning that was applied to the training descriptions.
cleaned_text = clean_text_func(X_test)
# Encode with the vocabulary learned by cv1 (transform only, no re-fit).
X_test_cv3  = cv1.transform([cleaned_text])
# Predict the label for this single document and print it.
y_pred_cv3 = lr.predict(X_test_cv3)
print(y_pred_cv3)

['respiratory']


In [97]:
# Free-text example; the printed prediction for it is 'musculoskeletal'.
X_test = "Severe pain and swelling in the hip area after a fall. Difficulty in moving the hip joint. Bruising and tenderness around the hip. Limited range of motion. Seek medical attention immediately."

# Apply the same cleaning that was applied to the training descriptions.
cleaned_text = clean_text_func(X_test)
# Encode with the vocabulary learned by cv1 (transform only, no re-fit).
X_test_cv3  = cv1.transform([cleaned_text])
# Predict the label for this single document and print it.
y_pred_cv3 = lr.predict(X_test_cv3)
print(y_pred_cv3)

['musculoskeletal']


### In conclusion, our model is able to predict the disease type from a description given by the user

In [104]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

# Define features (X) and target (y).
# NOTE: these are the same rows of `df` that the model was fitted on, so the
# scores below describe fit on the training data, not performance on unseen
# descriptions. A held-out set is needed for a real test score.
X_test = df['Description']
y_test = df['D_Name']

# Vectorize with the vocabulary learned by cv1 (transform only, no re-fit)
X_test_cv = cv1.transform(X_test)

# Predict using the logistic regression model
y_pred = lr.predict(X_test_cv)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision.
# zero_division=0 scores a class that is never predicted as 0 - the same value
# the default setting produces - without emitting an UndefinedMetricWarning
# for each such class.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
print("Precision:", precision)

# Print classification report (same zero_division handling as above)
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.6666666666666666
Precision: 0.5
Classification Report:
                 precision    recall  f1-score   support

       ear_nose       0.50      1.00      0.67         1
musculoskeletal       0.00      0.00      0.00         1
    respiratory       1.00      1.00      1.00         1

       accuracy                           0.67         3
      macro avg       0.50      0.67      0.56         3
   weighted avg       0.50      0.67      0.56         3

Confusion Matrix:
[[1 0 0]
 [1 0 0]
 [0 0 1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
