In [3]:
import os
import collections
import spacy
import sqlite3
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
from google.cloud import vision
import os
import io
import string
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from textblob import TextBlob


2023-05-07 11:20:59.501031: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-07 11:20:59.542312: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-07 11:20:59.543235: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
def get_text_from_images(directory):
    """Get text from images in the given directory."""
    client = vision.ImageAnnotatorClient()
    text_list = []
    for root, dirs, files in os.walk(directory):
        print(f"Processing directory: {root}")
        for file in files:
            print(f"Processing file: {file}")
            if file.endswith('.jpg') or file.endswith('.jpeg') or file.endswith('.png'):
                path = os.path.join(root, file)
                with io.open(path, 'rb') as image_file:
                    content = image_file.read()
                image = vision.Image(content=content)
                response = client.text_detection(image=image)
                texts = response.text_annotations
                for text in texts:
                    text_list.append(text.description)
    return text_list

def get_word_frequency(text_list):
    """Get frequency of each word in the text list."""
    word_count = collections.defaultdict(int)
    for text in text_list:
        for word in text.split():
            word_count[word] += 1
    return word_count

def get_common_words(word_frequency_list):
    """Get words that are present in more than one dictionary and have a frequency greater than 10."""
    common_words = {}
    for word_frequency in word_frequency_list:
        for word, frequency in word_frequency.items():
            if frequency > 10:
                if word in common_words:
                    common_words[word] += 1
                else:
                    common_words[word] = 1
    common_words = {k: v for k, v in common_words.items() if v > 1}
    return common_words

In [6]:
def lower_case(df):
    df['text'] = df['text'].apply(str.lower)
    return df

def remove_punctuations(df):
    cleaned_text = []
    for index in tqdm(range(df.shape[0])):
        text = df['text'].iloc[index]

        word_tokens = text.split()
        
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in word_tokens]

        filtered_sentence = " ".join(stripped).strip()
        cleaned_text.append(filtered_sentence)
    df['text'] = np.array(cleaned_text)
    return df

def remove_null(df):
    if df['text'].isnull().sum() > 0:
        df.dropna(inplace = True)
    return df

In [16]:
directory_path = '../images-115-max-keys-400/images'
text_list_journal = get_text_from_images(directory_path)

Processing directory: ../images-115-max-keys-400/images
Processing directory: ../images-115-max-keys-400/images/AF_200_Tablet
Processing file: AF_200_Tablet0_gaussian_noise.jpg
Processing file: AF_200_Tablet2.jpg
Processing file: AF_200_Tablet0_rotated.jpg
Processing file: AF_200_Tablet1_blur.jpg
Processing file: AF_200_Tablet1_gaussian_noise.jpg
Processing file: AF_200_Tablet2_blur.jpg
Processing file: AF_200_Tablet1_rotated.jpg
Processing file: AF_200_Tablet0.jpg
Processing file: AF_200_Tablet0_blur.jpg
Processing file: AF_200_Tablet1.jpg
Processing file: AF_200_Tablet2_gaussian_noise.jpg
Processing file: AF_200_Tablet2_rotated.jpg
Processing directory: ../images-115-max-keys-400/images/AB-Flo_Capsule
Processing file: AB-Flo_Capsule0.jpg
Processing file: AB-Flo_Capsule0_rotated.jpg
Processing file: AB-Flo_Capsule0_gaussian_noise.jpg
Processing file: AB-Flo_Capsule1_rotated.jpg
Processing file: AB-Flo_Capsule2.jpg
Processing file: AB-Flo_Capsule2_rotated.jpg
Processing file: AB-Flo_Ca

In [17]:
text_list_journal

['Fluconazole Tablets P 200 mg\nAF-200\nPEROD\n545 TOPIC',
 'Fluconazole',
 'Tablets',
 'P',
 '200',
 'mg',
 'AF',
 '-',
 '200',
 'PEROD',
 '545',
 'TOPIC',
 'Fluconazole Tablets P 200 mg\nAF-200\nC\nd\nகரம்\nby the Pr\nMELDEN\nपन\nNA\nwhe\nMR.PE\nwww\nin\nSEKRE',
 'Fluconazole',
 'Tablets',
 'P',
 '200',
 'mg',
 'AF',
 '-',
 '200',
 'C',
 'd',
 'கரம்',
 'by',
 'the',
 'Pr',
 'MELDEN',
 'पन',
 'NA',
 'whe',
 'MR.PE',
 'www',
 'in',
 'SEKRE',
 'mg\nFluconazole Tablets IP 200 mg\nAF-200\nएएफ-२००\nzdatom\nSYSTOPIC',
 'mg',
 'Fluconazole',
 'Tablets',
 'IP',
 '200',
 'mg',
 'AF',
 '-',
 '200',
 'एएफ',
 '-२००',
 'zdatom',
 'SYSTOPIC',
 'Fluconazole Tablets IP 200 mg\nAF-200\nIstett\n1\nSYSTOPIC E',
 'Fluconazole',
 'Tablets',
 'IP',
 '200',
 'mg',
 'AF',
 '-',
 '200',
 'Istett',
 '1',
 'SYSTOPIC',
 'E',
 'Fluconazole Tablets P 200 mg\nAF-200\nE 200\n&\nSHSTONC',
 'Fluconazole',
 'Tablets',
 'P',
 '200',
 'mg',
 'AF',
 '-',
 '200',
 'E',
 '200',
 '&',
 'SHSTONC',
 'Puma TP200\nAF-200\nPP.',


In [20]:
directory_path = '../images-115-max-keys-400/images'
text_list = get_text_from_images(directory_path)
# print(text_list)
t_word_frequency(temp_df['text'])
# common_words = get_common_words([word_frequency])
# print(common_words)

Processing directory: ../images-115-max-keys-400/images
Processing directory: ../images-115-max-keys-400/images/AF_200_Tablet
Processing file: AF_200_Tablet0_gaussian_noise.jpg
Processing file: AF_200_Tablet2.jpg
Processing file: AF_200_Tablet0_rotated.jpg
Processing file: AF_200_Tablet1_blur.jpg
Processing file: AF_200_Tablet1_gaussian_noise.jpg
Processing file: AF_200_Tablet2_blur.jpg
Processing file: AF_200_Tablet1_rotated.jpg
Processing file: AF_200_Tablet0.jpg
Processing file: AF_200_Tablet0_blur.jpg
Processing file: AF_200_Tablet1.jpg
Processing file: AF_200_Tablet2_gaussian_noise.jpg
Processing file: AF_200_Tablet2_rotated.jpg
Processing directory: ../images-115-max-keys-400/images/AB-Flo_Capsule
Processing file: AB-Flo_Capsule0.jpg
Processing file: AB-Flo_Capsule0_rotated.jpg
Processing file: AB-Flo_Capsule0_gaussian_noise.jpg
Processing file: AB-Flo_Capsule1_rotated.jpg
Processing file: AB-Flo_Capsule2.jpg
Processing file: AB-Flo_Capsule2_rotated.jpg
Processing file: AB-Flo_Ca

100%|██████████| 9/9 [00:00<00:00, 9498.93it/s]


text    9
dtype: int64

In [23]:
temp_df = pd.DataFrame(text_list, columns=['text'])
temp_df.count()

text    30477
dtype: int64

In [28]:
temp_df = remove_punctuations(temp_df)
temp_df = lower_case(temp_df)
temp_df.count()
temp_df = remove_null(temp_df)
temp_df.count()
# word_frequency = ge

100%|██████████| 30477/30477 [00:00<00:00, 76489.37it/s]


text    30477
dtype: int64

In [29]:
temp_df

Unnamed: 0,text
0,fluconazole tablets p 200 mg af200 perod 545 t...
1,fluconazole
2,tablets
3,p
4,200
...,...
30472,
30473,m
30474,एवीदेन
30475,एम


In [30]:
word_frequency = get_word_frequency(temp_df['text'])
word_frequency


defaultdict(int,
            {'fluconazole': 86,
             'tablets': 1280,
             'p': 553,
             '200': 157,
             'mg': 906,
             'af200': 11,
             'perod': 2,
             '545': 2,
             'topic': 2,
             'af': 90,
             'c': 152,
             'd': 89,
             'கரம்': 2,
             'by': 814,
             'the': 849,
             'pr': 12,
             'melden': 2,
             'पन': 2,
             'na': 23,
             'whe': 2,
             'mrpe': 8,
             'www': 124,
             'in': 420,
             'sekre': 2,
             'ip': 536,
             'एएफ२००': 3,
             'zdatom': 2,
             'systopic': 60,
             'एएफ': 24,
             '२००': 9,
             'istett': 2,
             '1': 75,
             'e': 67,
             'shstonc': 2,
             'puma': 4,
             'tp200': 2,
             'pp': 8,
             'ए': 28,
             'एक': 12,
             'ielew': 2,
    

In [32]:

threshold = 10
selected_words = []

for word, frequency in word_frequency.items():
    if frequency >= threshold:
        selected_words.append(f'"{word}",')

if selected_words:
    with open('selected_words.txt', 'w') as file:
        file.write(' '.join(selected_words))

        print(f"{len(selected_words)} words written to 'selected_words.txt'.")
else:
    print("No words meet the threshold frequency.")


563 words written to 'selected_words.txt'.
