In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import pickle
from numpy import dot
from numpy.linalg import norm

In [2]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the stop-word pickle and the CSV read
# in the following cells are both loaded from paths under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Load the domain-specific stop-word collection that clean_text_func filters against.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this file must come from a trusted source.
with open('/content/gdrive/My Drive/stop_words.ob', 'rb') as stop_words_file:
    domain_stop_word = pickle.load(stop_words_file)


In [4]:
# Load the CSV of disease descriptions from the mounted Drive into `df`
# and print its first rows as a quick sanity check.
file_path = '/content/gdrive/My Drive/diseases_with_description.csv'
df = pd.read_csv(file_path)
print(df.head())

                                         Description           D_Name
0  bone, ear, muscle, otitis, hearing, membrane, ...  musculoskeletal
1  ear, otitis, hearing, externa, membrane, silic...         ear_nose
2  airway, ventilation, obstruction, oxygen, brea...      respiratory


In [5]:
def clean_text_func(text, stop_words=None):
    """Normalise a free-text description into space-separated lowercase words.

    The text is lower-cased, every character that is not an ASCII letter
    (digits, punctuation, symbols, non-ASCII characters) is treated as a word
    separator, and words found in the stop-word collection are dropped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as empty text instead of being
            turned into the words "none" / "nan".
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the notebook-level ``domain_stop_word`` is used.

    Returns:
        The remaining words joined by single spaces, with no leading or
        trailing whitespace; an empty string if nothing remains.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = domain_stop_word
    # One substitution replaces the former chain of rules, which let "^" and
    # "/" through (so tokens such as "and/or" escaped the stop-word filter)
    # and contained a ":" rule that could never match.
    words = re.sub(r"[^a-z]+", " ", str(text).lower()).split()
    # join() avoids repeated string concatenation and the trailing space it left.
    return " ".join(word for word in words if word not in stop_words)

# Replace every raw description with its cleaned form, then preview the frame
df['Description'] = df['Description'].map(clean_text_func)
df.head()

Unnamed: 0,Description,D_Name
0,bone ear muscle otitis hearing membrane airway...,musculoskeletal
1,ear otitis hearing externa membrane silicosis ...,ear_nose
2,airway ventilation obstruction oxygen breathin...,respiratory


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The cleaned descriptions as a NumPy unicode array, shared by both vectorizers
descriptions = df['Description'].values.astype('U')

# Raw term counts, with scikit-learn's built-in English stop words removed
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(descriptions)

# TF-IDF weights over the same text with the same stop-word setting
cv_tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = cv_tfidf.fit_transform(descriptions)

In [7]:
import pandas as pd

# Vocabulary learned by each vectorizer, in matrix-column order
feature_names_cv = cv.get_feature_names_out()
feature_names_tfidf = cv_tfidf.get_feature_names_out()

# Dense, labelled copies of the two matrices: one column per vocabulary term
df_cv = pd.DataFrame(data=X.toarray(), columns=feature_names_cv)
df_tfidf = pd.DataFrame(data=X_tfidf.toarray(), columns=feature_names_tfidf)

In [8]:
import numpy as np

# Print (rows, columns) of the dense count matrix; the column count is the
# size of the vocabulary learned by `cv`.
print(df_cv.shape)

# Cosine similarity between two vectors
def cosine(v1, v2):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        v1: Array-like of numbers (list, NumPy array, pandas Series, ...).
        v2: Array-like of numbers with the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``. When either vector has zero
        norm the similarity is undefined; 0.0 is returned in that case rather
        than dividing by zero (which would yield NaN and a RuntimeWarning).
    """
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    # A query containing no in-vocabulary words transforms to an all-zero vector.
    if denominator == 0:
        return np.float64(0.0)
    return np.dot(v1, v2) / denominator

(3, 42)


In [9]:
# Query text to compare against every row of the corpus
new_text = ["dizziness loss of balance vomiting tinnitus of hearing in the high frequency range in one ear difficulty focusing your eyes"]

# Project the query onto the vocabularies already learned by the two vectorizers
new_text_cv = cv.transform(new_text).toarray()[0]
new_text_tfidf = cv_tfidf.transform(new_text).toarray()[0]

# Report both similarity scores for each row of the DataFrame
for chapter_number in range(len(df)):
    count_row = df_cv.iloc[chapter_number]
    tfidf_row = df_tfidf.iloc[chapter_number]
    print(f"This is chapter number: {chapter_number}")
    # Similarity in raw term-count space
    print(f"Cosine similarity (CountVectorizer): {cosine(count_row, new_text_cv)}")
    # Similarity in TF-IDF space
    print(f"Cosine similarity (TfidfVectorizer): {cosine(tfidf_row, new_text_tfidf)}")

This is chapter number: 0
Cosine similarity (CountVectorizer): 0.31622776601683794
Cosine similarity (TfidfVectorizer): 0.28722096078741605
This is chapter number: 1
Cosine similarity (CountVectorizer): 0.31622776601683794
Cosine similarity (TfidfVectorizer): 0.2829981561238491
This is chapter number: 2
Cosine similarity (CountVectorizer): 0.0
Cosine similarity (TfidfVectorizer): 0.0


In [10]:
# Show the available column names before selecting the model inputs and labels
print(df.columns)

Index(['Description', 'D_Name'], dtype='object')


In [11]:
# Model inputs are the cleaned 'Description' texts; targets are the 'D_Name' labels
X_train, y_train = df['Description'], df['D_Name']

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Learn the vocabulary from the training descriptions and count term occurrences
cv1 = CountVectorizer()
X_train_cv1 = cv1.fit_transform(X_train)

# Dense, labelled copy of the count matrix: one column per vocabulary term
feature_names_cv1 = cv1.get_feature_names_out()
pd_cv1 = pd.DataFrame(data=X_train_cv1.toarray(), columns=feature_names_cv1)

In [13]:
from sklearn.linear_model import LogisticRegression

# Classifier with scikit-learn's default hyper-parameters
lr = LogisticRegression()

# Train on the term-count matrix built by `cv1`, with the 'D_Name' values as labels
lr.fit(X_train_cv1, y_train)

In [14]:
# Free-text symptom description to classify
X_test = "Difficulty sleeping or staying asleep Fever Fluid draining from ear Loss of balance. Hearing difficulties. Ear pain"

# Apply the same cleaning that was applied to the training descriptions,
# so the query text is tokenised consistently with them
cleaned_text = clean_text_func(X_test)

In [15]:
# Vectorise the cleaned text with the already-fitted `cv1` (transform only, so
# the vocabulary matches the one the model was trained on)
X_test_cv3 = cv1.transform([cleaned_text])

# Predict the label with the trained logistic regression model
y_pred_cv3 = lr.predict(X_test_cv3)

# Print the predicted label (an array with one entry for the single input text)
print(y_pred_cv3)

['ear_nose']
