In [2]:
import spacy
import sqlite3
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import os
from google.cloud import vision
import io
import string
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from textblob import TextBlob
from fuzzywuzzy import process, fuzz


2023-05-09 22:38:17.023050: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-09 22:38:17.728376: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-09 22:38:17.732983: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
NUM_CORES = os.cpu_count()
NUM_CORES

8

In [4]:
# read database vectors
nlp = spacy.load("en_core_web_sm")
conn = sqlite3.connect('removedwords.db')
c = conn.cursor()
c.execute("SELECT TEXT FROM text;")
db_names = [row[0].rstrip() for row in c.fetchall() if row[0].strip()]
db_names

['augmentin 625 duo',
 'azithral 500',
 'allegra 120mg',
 'avil 25',
 'allegram',
 'amoxyclav 625',
 'azee 500',
 'atarax 25mg',
 'aciloc 150',
 'arkamin',
 'avomine',
 'allegra 180mg',
 'albendazole 400mg',
 'alprax 025',
 'altraday capsule sr',
 'ativan 2mg',
 'almox 500',
 'atarax 10mg',
 'aciloc rd 20',
 'aldactone',
 'amlokindat',
 'axcer 90mg',
 'ativan 1mg',
 'aldigesic p 100mg325mg',
 'alfoo 10mg tablet pr',
 'alprax 05mg',
 'anafortan 25 mg300 mg',
 'ab phylline',
 'althrocin 500',
 'azicip 500',
 'aldigesicsp',
 'amoxycillin 500mg',
 'acemiz plus',
 'aceclo plus',
 'alex cough lozenges lemon ginger',
 'atorva',
 'azmarda 50mg',
 'amixideh',
 'abflon',
 'af kit',
 'amlokind 5',
 'amlong',
 'amitone 10mg',
 'aulin 100mg',
 'ab phylline sr 200',
 'azoran',
 'amaryl 1mg',
 'aztor 10',
 'atorva 40',
 'azax 500',
 'anxit 05',
 'anxit 025mg',
 'acitrom 2',
 'acemiz mr',
 'akurit 4',
 'acivir 400 dt',
 'augmentin 1000 duo',
 'acogut',
 'amlip 5',
 'aldactone 50',
 'ampoxin 500',
 'az

In [5]:
def generate_vecs(name):
    return nlp(name).vector

In [6]:
delayed_func = [delayed(generate_vecs)(name) for name in db_names]
parallel_pool = Parallel(n_jobs=NUM_CORES//2, backend = 'multiprocessing', verbose = 10)
db_vecs = parallel_pool(delayed_func)
db_vecs = np.array(db_vecs)

[Parallel(n_jobs=4)]: Using backend MultiprocessingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0536s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0818s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1832s.) Setting batch_size=8.
[Parallel(n_jobs=4)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 200 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done 288 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 480 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done 584 tasks      | elapsed:    6.2s
[Parallel(n_jobs=4)]: Done 704 tasks      | elapsed:    7.

KeyboardInterrupt: 

In [111]:
db_names

['augmentin 625 duo',
 'azithral 500',
 'allegra 120mg',
 'avil 25',
 'allegram',
 'amoxyclav 625',
 'azee 500',
 'atarax 25mg',
 'aciloc 150',
 'arkamin',
 'avomine',
 'allegra 180mg',
 'albendazole 400mg',
 'alprax 025',
 'altraday capsule sr',
 'ativan 2mg',
 'almox 500',
 'atarax 10mg',
 'aciloc rd 20',
 'aldactone',
 'amlokindat',
 'axcer 90mg',
 'ativan 1mg',
 'aldigesic p 100mg325mg',
 'alfoo 10mg tablet pr',
 'alprax 05mg',
 'anafortan 25 mg300 mg',
 'ab phylline',
 'althrocin 500',
 'azicip 500',
 'aldigesicsp',
 'amoxycillin 500mg',
 'acemiz plus',
 'aceclo plus',
 'alex cough lozenges lemon ginger',
 'atorva',
 'azmarda 50mg',
 'amixideh',
 'abflon',
 'af kit',
 'amlokind 5',
 'amlong',
 'amitone 10mg',
 'aulin 100mg',
 'ab phylline sr 200',
 'azoran',
 'amaryl 1mg',
 'aztor 10',
 'atorva 40',
 'azax 500',
 'anxit 05',
 'anxit 025mg',
 'acitrom 2',
 'acemiz mr',
 'akurit 4',
 'acivir 400 dt',
 'augmentin 1000 duo',
 'acogut',
 'amlip 5',
 'aldactone 50',
 'ampoxin 500',
 'az

In [None]:
np.save("db_vecs_final.npy", db_vecs)

In [10]:
def lower_case(df):
    df['text'] = df['text'].apply(str.lower)
    return df

def remove_punctuations(df):
    cleaned_text = []
    for index in tqdm(range(df.shape[0])):
        text = df['text'].iloc[index]

        word_tokens = text.split()
        
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in word_tokens]

        filtered_sentence = " ".join(stripped).strip()
        cleaned_text.append(filtered_sentence)
    df['text'] = np.array(cleaned_text)
    return df

def remove_null(df):
    if df['text'].isnull().sum() > 0:
        df.dropna(inplace = True)
    return df

def detect_text(path):
    """Detects text in the file."""
    
    client = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    response = client.text_detection(image=image)
    texts = response.text_annotations
 
    text_list = []

    for text in texts:
        text_list.append('\n"{}"'.format(text.description))
        vertices = [(vertex.x, vertex.y) for vertex in text.bounding_poly.vertices]

        # vertices = (['({},{})'.format(vertex.x, vertex.y)
        #             for vertex in text.bounding_poly.vertices])
        
        #Case 1: Area using the vertices (Highest area might be the case) - WORKING
        # area = 0
        # for i in range(len(vertices)):
        #     x1, y1 = vertices[i]
        #     x2, y2 = vertices[(i + 1) % len(vertices)]
        #     area += x1 * y2 - x2 * y1
        # area /= 2
        # if area > max_area:
        #     max_area = area

    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))
    
    return text_list

In [8]:
db_df = pd.DataFrame(db_names, columns = ['text'])
db_df

Unnamed: 0,text
0,augmentin 625 duo
1,azithral 500
2,allegra 120mg
3,avil 25
4,allegram
...,...
173424,ziyapod 200mg tablet dt
173425,zogrell a 75mg75mg
173426,zef cv 200mg125mg
173427,zemhart 30mg


In [11]:
image_text = detect_text("Accept-SP_Tablet2 (1).jpg")
temp_df = pd.DataFrame(columns=['text'])
temp_df['text'] = image_text[0].replace("\n"," ").split()
temp_df = remove_punctuations(temp_df)
temp_df = lower_case(temp_df)
temp_df = remove_null(temp_df)
temp_df

100%|██████████| 84/84 [00:00<00:00, 37592.99it/s]


Unnamed: 0,text
0,acceptsp
1,composition
2,each
3,film
4,coated
...,...
79,may
80,be
81,injurious
82,to


In [12]:
# def remove_non_alphanumeric(df, columns):
#     """Remove all non-alphanumeric characters from the specified column(s) of the DataFrame."""
#     for col in columns:
#         df[col] = df[col].apply(lambda x: ''.join(e for e in x if e.isalnum()))
#     return df
def is_english(word):
    lang = process.extractOne(word, choices=['eng'], scorer=fuzz.token_sort_ratio, score_cutoff=95)
    if lang:
        return True
    else:
        return False

def is_alnum(string):
    return string.isalpha() or string.isdigit()

In [255]:
def filter_english(text):
    blob = TextBlob(text)
    english_words = [word for word in blob.words if TextBlob(word).detect_language() == 'en']
    return ' '.join(english_words)

In [13]:
# temp_df['text_without_stop'] = temp_df['text']
from nltk.corpus import stopwords
stop = stopwords.words('english')
with open('selected_words.txt', 'r') as f:
    stop_words = [word.strip().replace('"', '') for line in f.readlines() for word in line.split(',')]

stop_words.extend(['composition','tablet','capsule','capsules','tablets','warning','dosage','direction','directions','use','uses', 'physician','coated','film'])
stop.extend(stop_words)

# temp_df['text'] = temp_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in set(stop_words)]))

# new_stopwords = ['composition','tablet','capsule','capsules','tablets','warning','dosage','direction','directions','use','uses', 'physician','coated','film']
# stop.extend(new_stopwords)
temp_df['text'] = temp_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# temp_df['text_without_stop'] = temp_df['text_without_stop'].apply(filter_english)
temp_df


Unnamed: 0,text
0,acceptsp
1,
2,
3,
4,
...,...
79,
80,
81,
82,


In [266]:
temp_df['text'] = temp_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
temp_df

Unnamed: 0,text
0,abpas
1,n
2,एवी
3,पास
4,एन
...,...
68,7980
69,sec6a
70,e
71,sidcul


In [14]:
def cleanDataframeEnglish(df,column_name):
  cleanedDf = pd.DataFrame(columns=['words'])
  for index, row in df.iterrows():
      word = row[column_name]
      if word.isalnum() and not word.isspace() and word.isascii():
        new_row = pd.DataFrame.from_records([{'words':word}])
        # print("New Row",new_row)
        cleanedDf = pd.concat([cleanedDf,new_row],ignore_index=True)
        # print("Iteration",cleanedDf)

  return cleanedDf

In [182]:
# stop_words = set(stopwords.words('english'))
# filtered_words = []

# for word in temp_df['text'].str.cat():
#     if is_english(word) and len([char for char in word if char.isalnum()]) > 0:
#         filtered_words.append(word)

# # Filter words
# clean_words = [word for word in filtered_words if word in stop_words]

# # Save cleaned words to new dataframe
# new_df = pd.DataFrame({'words': clean_words})
# print(new_df)

Empty DataFrame
Columns: [words]
Index: []


In [15]:
# cleanedDf = cleanDataframeEnglish(temp_df,'text')
# cleanedDf.rename(columns={0:'updated_text'}, inplace=True)
# cleanedDf
# print(temp_df)

ls = cleanDataframeEnglish(temp_df,'text')
print(ls)
# print(temp_df)

      words
0  acceptsp
1     20000


In [184]:
temp_df['text'] = remove_non_alphanumeric(temp_df, ['text'])
temp_df

Unnamed: 0,text
0,star
1,mankind
2,cimg
3,acebrophylline
4,
5,acetylcysteine
6,tablets
7,abiways
8,एबवज
9,


In [16]:
#Matching sequence
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [17]:
for i, row in ls.iterrows():
    # Find the most similar text in the db_name column of db_df
    max_similarity = 0
    for j, row2 in db_df.iterrows():
        # Check if the value is a float
        if isinstance(row['words'], float):
            continue
        similarity = similar(row['words'], row2['text'])
        if similarity > max_similarity:
            max_similarity = similarity
            max_row = row2
    # Print the results
    if not isinstance(row['words'], float):
        print(f"Text '{row['words']}' has the most similar text '{max_row['text']}' with a similarity score of {max_similarity}")



Text 'acceptsp' has the most similar text 'acceptsp' with a similarity score of 1.0
Text '20000' has the most similar text 'gon 10000' with a similarity score of 0.5714285714285714


In [None]:
row = temp_df.iloc[0]
row

text    acceptsp
Name: 0, dtype: object

In [None]:
# for index, row in temp_df.iterrows():
row = temp_df.iloc[0]
cs = cosine_similarity(nlp("acceptsp").vector.reshape(1, -1), db_vecs)
cs = cs.reshape(-1, 1)
data = {"acceptsp": cs.flatten()}
data

{'acceptsp': array([0.41199553, 0.3391754 , 0.16365804, ..., 0.22509345, 0.27878356,
        0.16074592], dtype=float32)}

In [None]:
df = pd.DataFrame(db_names, columns=['db_names'])
df.shape

(173429, 1)

In [None]:
temp_df = pd.DataFrame(data)
df = pd.concat([df, temp_df], axis=1)
df.head()

Unnamed: 0,db_names,acceptsp
0,augmentin 625 duo,0.411996
1,azithral 500,0.339175
2,allegra 120mg,0.163658
3,avil 25,0.341197
4,allegram,0.601819


In [None]:
df.sort_values(by = "acceptsp", ascending=False, inplace=True)
df.head()

Unnamed: 0,db_names,acceptsp
84348,larazolemd kid tablet orange,0.897423
87366,lebestm kid tablet md,0.865241
124058,rexipra lite,0.863248
91277,mucimega effervescent tablet orange,0.857558
37956,dynagliptm forte tablet sr,0.857369


In [None]:
df[df['db_names'] == 'acceptsp']

Unnamed: 0,db_names,acceptsp
2238,acceptsp,1.0
