### In this notebook we implement a text classification model and compute cosine similarity

In [23]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
from numpy import dot
from numpy.linalg import norm
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nmman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# Load NLTK's English stop-word list (the 'stopwords' corpus is downloaded
# in the imports cell). The list is loaded once and reused for the preview.
domain_stop_word = stopwords.words('english')
print(domain_stop_word)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [25]:
# Load the labelled disease descriptions from the CSV file and preview
# the first five rows.
file_path = 'diseases.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

                                         Description           D_Name
0  bone, muscle, ear, otitis, hearing, membrane, ...  musculoskeletal
1  ear, otitis, hearing, throat, sinusitis, bleed...         ear_nose
2  ventilation, oxygen, airway, copd, breathing, ...      respiratory
3  fever, fatigue, rash, joint pain, headache, mo...        infection
4  insulin, glucose, pancreas, sugar, hyperglycem...         diabetes


In [26]:
def clean_text_func(text, stop_words=None):
    """Normalise free text into lowercase, stop-word-free tokens.

    The input is lowercased, every character that is not an ASCII letter
    (digits, punctuation, '/', '^', whitespace, non-ASCII symbols) is treated
    as a token separator, and tokens found in the stop-word collection are
    dropped.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()``. ``None`` and float
        NaN (a missing cell) yield an empty string rather than the literal
        tokens "none" / "nan".
    stop_words : iterable of str, optional
        Tokens to remove. Defaults to the notebook-level ``domain_stop_word``.

    Returns
    -------
    str
        The kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space); ``""`` when nothing is kept.
    """
    # A missing description must not turn into a spurious "nan"/"none" token.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if stop_words is None:
        stop_words = domain_stop_word
    # Hash-based membership instead of scanning a list for every token.
    stop_words = frozenset(stop_words)

    # One pass replaces every run of non-letters with a space. This also
    # splits on '/' and '^', so stop words glued together by them
    # (e.g. "and/or") are seen as separate tokens and filtered below.
    text = re.sub(r"[^a-z]+", " ", str(text).lower())

    kept_tokens = [token for token in text.split() if token not in stop_words]
    return "".join(token + " " for token in kept_tokens)

# Replace every raw description with its cleaned form, then preview the frame.
df['Description'] = df['Description'].apply(clean_text_func)
df.head()

Unnamed: 0,Description,D_Name
0,bone muscle ear otitis hearing membrane tinnit...,musculoskeletal
1,ear otitis hearing throat sinusitis bleeding c...,ear_nose
2,ventilation oxygen airway copd breathing asthm...,respiratory
3,fever fatigue rash joint pain headache mosquit...,infection
4,insulin glucose pancreas sugar hyperglycemia h...,diabetes


# Text Vectorization (Bag-of-Words and TF-IDF):

In [27]:
# Both vectorisers are fitted on the same list of cleaned descriptions.
descriptions = list(df['Description'])

# Raw term counts.
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(descriptions)

# TF-IDF weighted counts.
cv_tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = cv_tfidf.fit_transform(descriptions)

In [28]:
# Dense document-term tables: one row per description, one column per
# vocabulary term of the corresponding vectoriser.
df_cv = pd.DataFrame(
    data=X.toarray(),
    columns=cv.get_feature_names_out(),
)
df_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=cv_tfidf.get_feature_names_out(),
)

In [29]:
# Report the count table's dimensions as (rows, columns).
n_docs, n_terms = df_cv.shape
print((n_docs, n_terms))
def cosine(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Computed as ``dot(v1, v2) / (norm(v1) * norm(v2))``.

    If either vector has zero norm (for example a query that contains no
    vocabulary term), the similarity is undefined; ``0.0`` is returned
    instead of dividing by zero and producing NaN with a RuntimeWarning.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

(30, 202)


### Cosine Similarity 

In [30]:
# Query symptoms. The vectorisers were fitted on descriptions that went
# through clean_text_func, so the query is cleaned the same way before it
# is transformed; otherwise corpus and query are tokenised differently.
new_text = [clean_text_func("dizziness loss of balance  vomiting tinnitus of hearing in the high frequency range in one ear difficulty focusing your eyes ")]
new_text_cv = cv.transform(new_text).toarray()[0]
new_text_tfidf = cv_tfidf.transform(new_text).toarray()[0]

# Compare the query against every description row, under both weightings.
for chpter_number in range(len(df)):
    score_cv = cosine(df_cv.iloc[chpter_number], new_text_cv)
    score_tfidf = cosine(df_tfidf.iloc[chpter_number], new_text_tfidf)
    print(f"This is chpter number : {chpter_number} ")
    print(f"Cosin cv :    {score_cv} ")
    print(f"Cosin TFIDF : {score_tfidf} ")

This is chpter number : 0 
Cosin cv :    0.42426406871192845 
Cosin TFIDF : 0.36119374754246125 
This is chpter number : 1 
Cosin cv :    0.29814239699997197 
Cosin TFIDF : 0.22940948009137957 
This is chpter number : 2 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 3 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 4 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 5 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 6 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 7 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 8 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 9 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 10 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 11 
Cosin cv :    0.15811388300841897 
Cosin TFIDF : 0.18551027641711712 
This is chpter number : 12 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 
This is chpter number : 13 
Cosin cv :    0.0 
Cosin TFI

### Implementing the classification model: LogisticRegression

In [31]:
# Display the DataFrame's column labels.
df.columns

Index(['Description', 'D_Name'], dtype='object')

In [32]:
# Inputs are the description texts; targets are the disease names.
X_train, y_train = df['Description'], df['D_Name']

In [33]:
# A separate count vectoriser (default settings) fitted on the training text.
cv1 = CountVectorizer()
X_train_cv1 = cv1.fit_transform(X_train)

# Dense view of the training matrix, one column per vocabulary term.
pd_cv1 = pd.DataFrame(
    data=X_train_cv1.toarray(),
    columns=cv1.get_feature_names_out(),
)

In [34]:
# Fit a logistic-regression classifier with default hyper-parameters on the
# count features.
lr = LogisticRegression()
lr.fit(X=X_train_cv1, y=y_train)

In [35]:
# Free-text symptom report used as the single prediction example.
X_test = (
    "I've been having a lot of trouble breathing lately. "
    "It's especially bad when I'm trying to do something active, like walking up stairs. "
    "I've also been coughing a lot and sometimes it's hard to catch my breath. "
    "My chest feels tight and I'm wheezing a lot, particularly at night. "
)
# Apply the same cleaning that was applied to the training descriptions.
cleaned_text = clean_text_func(X_test)

In [36]:
# Vectorise the cleaned report with the training vocabulary, then print the
# predicted disease label.
X_test_cv3 = cv1.transform([cleaned_text])
y_pred_cv3 = lr.predict(X=X_test_cv3)
print(y_pred_cv3)

['pulmonary']


### In conclusion, our model is able to predict a disease from the symptom description given by the user