In [2]:
import pandas as pd

# Location of the JSON export that this notebook analyses
json_file_path = r"output_data.json"

# Parse the file into a DataFrame; orient='records' tells pandas to expect a
# JSON array of row objects ([{column: value, ...}, ...])
df = pd.read_json(path_or_buf=json_file_path, orient='records')

# Bare expression so the notebook renders the loaded frame
df


Unnamed: 0,file_name,content,technical_specification
0,file10.pdf,Tender For Fundus Fluorescein angiography with...,--- Page 10 ---\nTender for Fundus Fluorescein...
1,file100.pdf,Tender For Auto Transfusion Machine (Cell Save...,--- Page 11 ---\nTender for Auto Transfusion M...
2,file1000.pdf,Tender No. AIIMS/R/CS/Neuro/1173/23/ GTE 1 vf[...,--- Page 45 ---\n ...
3,file1001.pdf,GTE No.: AIIMS/ R/CS/ URO/22/17/GT Page 1 of 7...,--- Page 39 ---\n \n \nGTE No.: AIIMS/ R/CS/UR...
4,file1002.pdf,Tender No. AIIMS/R/CS/ ENT /1158 /22/ GTE /A 1...,--- Page 45 ---\n ...
...,...,...,...
755,file992.pdf,Tender No. AIIMS/R/CS/SG/Ech/1/8/2022/GTE 1 vf...,--- Page 45 ---\n ...
756,file993.pdf,Tender No. AIIMS/R/CS/ OBGY /6795/2023/ GTE 1 ...,--- Page 39 ---\n \n \nTender No. AIIMS/R/CS/O...
757,file995.pdf,Tender No. AIIMS/R/CS/ Sur_Gastro/ Laproscopic...,--- Page 45 ---\n ...
758,file997.pdf,Tender No. AIIMS/R/CS/ 4007/PHY PMR /23/ GTE /...,--- Page 62 ---\n \n \n \nSECTION – VII \nTECH...


In [4]:
# Write the whole frame to an HTML table so it can be inspected in a browser
df.to_html(buf='check.html')

In [9]:
import re

# Regex patterns for ISO and IEC standard references.
# Each one matches the prefix, at most one hyphen or space, the standard
# number, and an optional ":<digits>" suffix (e.g. "ISO 9001:2015").
# Part suffixes such as the "-1" in "IEC 60601-1" are not captured, and the
# prefix is matched case-sensitively.
iso_pattern = r'ISO[- ]?\d+(?::\d+)?'
iec_pattern = r'IEC[- ]?\d+(?::\d+)?'


def find_standards(text):
    """Return the ISO and IEC standard mentions found in ``text``.

    Parameters
    ----------
    text : str or any
        The text to scan. Any non-string value (for example ``None`` or a
        float NaN standing in for a missing cell) is treated as containing
        no mentions instead of raising ``TypeError`` inside ``re.findall``.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(iso_matches, iec_matches)``: every non-overlapping match of
        ``iso_pattern`` and ``iec_pattern`` respectively, in order of
        appearance. Duplicates are kept.
    """
    # Missing values must not abort a whole DataFrame.apply() run.
    if not isinstance(text, str):
        return [], []

    iso_matches = re.findall(iso_pattern, text)
    iec_matches = re.findall(iec_pattern, text)
    return iso_matches, iec_matches

# Run find_standards over both text columns and store, per row, the list of
# ISO matches and the list of IEC matches in dedicated columns.
# The lists are built explicitly instead of unpacking zip(*...), because that
# unpacking raises ValueError when the frame has no rows.
for source_col, iso_col, iec_col in (
    ('content', 'iso_in_content', 'iec_in_content'),
    ('technical_specification', 'iso_in_tech_spec', 'iec_in_tech_spec'),
):
    extracted = [find_standards(text) for text in df[source_col]]
    # dtype=object keeps each cell as a Python list; index=df.index keeps the
    # assignment positional, matching the rows the matches were computed from.
    df[iso_col] = pd.Series([iso for iso, _ in extracted], index=df.index, dtype=object)
    df[iec_col] = pd.Series([iec for _, iec in extracted], index=df.index, dtype=object)

# Narrow view holding only the file name and the four extracted columns
result_df = df[['file_name', 'iso_in_content', 'iec_in_content', 'iso_in_tech_spec', 'iec_in_tech_spec']]

# Show the first rows of the result
print(result_df.head())


      file_name iso_in_content iec_in_content iso_in_tech_spec  \
0    file10.pdf             []             []               []   
1   file100.pdf             []             []               []   
2  file1000.pdf             []             []               []   
3  file1001.pdf             []             []               []   
4  file1002.pdf             []             []               []   

  iec_in_tech_spec  
0               []  
1               []  
2               []  
3               []  
4               []  


In [15]:
import pandas as pd
from collections import Counter

# Tally how often each standard string appears among the matches extracted
# from the 'content' column of `df`. Matches are counted as exact strings, so
# spelling variants (e.g. with and without a space after the prefix) are
# tallied separately.

def _flatten_mentions(column_name):
    """Return every non-null match stored in df[column_name] as one flat list."""
    exploded = df[column_name].explode()
    return exploded.dropna().tolist()


def _to_summary(tally):
    """Turn a Counter into a Standard/Count frame sorted by descending count."""
    table = pd.DataFrame(tally.items(), columns=['Standard', 'Count'])
    return table.sort_values(by='Count', ascending=False)


# Flat lists of all ISO / IEC mentions found in the content text
iso_content_list = _flatten_mentions('iso_in_content')
iec_content_list = _flatten_mentions('iec_in_content')

# Occurrences per distinct standard string
iso_content_counts = Counter(iso_content_list)
iec_content_counts = Counter(iec_content_list)

# Summary tables, most frequent standard first
iso_content_summary = _to_summary(iso_content_counts)
iec_content_summary = _to_summary(iec_content_counts)

# Print each table under its heading with a fresh 0..n-1 index
for heading, summary in (
    ("ISO Standards Count in Content:", iso_content_summary),
    ("\nIEC Standards Count in Content:", iec_content_summary),
):
    print(heading)
    print(summary.reset_index(drop=True))


ISO Standards Count in Content:
          Standard  Count
0         ISO 9001     24
1    ISO 9001:2015     16
2         ISO 7396     15
3        ISO 15693     12
4        ISO 14443     10
5        ISO 13485      7
6        ISO 14001      7
7   ISO 14001:2015      7
8         ISO 4628      6
9    ISO 9001:2008      6
10  ISO 45000:2018      4
11       ISO 27001      4
12           ISO 5      4
13     ISO 18000:3      4
14        ISO 7153      3
15        ISO 9227      3
16  ISO 13485:2016      3
17   ISO 9001:2000      3
18       ISO 10993      2
19       ISO 19980      2
20       ISO 14971      2
21       ISO 14644      2
22       ISO 80601      2
23        ISO 1563      2
24       ISO 18308      2
25        ISO 5356      1
26          ISO 60      1
27        ISO 8655      1
28         ISO 900      1
29        ISO 4090      1
30        ISO13485      1
31         ISO7396      1
32  ISO 13485:2003      1
33       ISO 28560      1
34   ISO 7376:2009      1

IEC Standards Count in Content:

In [17]:
# Acronyms whose whole-word occurrences are totalled over the 'content' column
acronyms = ['FDA', 'CE', 'ISO', 'IEC', 'HIPAA']

# Maps each acronym to its total number of occurrences across all rows
counts = {}
content_text = df['content'].str
for acronym in acronyms:
    # \b on both sides restricts matches to the acronym as a standalone word
    hits_per_row = content_text.findall(r'\b' + acronym + r'\b')
    counts[acronym] = hits_per_row.str.len().sum()

# Tabulate the totals, one row per acronym in the order listed above
counts_df = pd.DataFrame(list(counts.items()), columns=['Acronym', 'Count'])

# Show the table
print(counts_df)

  Acronym  Count
0     FDA    988
1      CE   1336
2     ISO    662
3     IEC    497
4   HIPAA      8


In [10]:
# Export the file name together with the extracted ISO/IEC match columns as an
# HTML table
standards_columns = ['file_name', 'iso_in_content', 'iec_in_content', 'iso_in_tech_spec', 'iec_in_tech_spec']
df[standards_columns].to_html(buf='standards.html')
