In [14]:
import pandas as pd

# Input file, resolved relative to the notebook's working directory
json_file_path = r"output_data.json"

# Load the records-oriented JSON into a DataFrame
df = pd.read_json(path_or_buf=json_file_path, orient='records')




In [15]:
import re
from cleantext import clean
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import pandas as pd
from typing import List, Set, Optional

class TextPreprocessor:
    """Turn free-text DataFrame cells into space-separated lemmas for TF-IDF.

    The pipeline in ``clean_text`` is: lower-case -> ``cleantext.clean`` ->
    strip everything that is not a letter -> drop stopwords, short tokens and
    Roman numerals -> lemmatize -> drop lemmas that are stopwords.

    Attributes:
        stop_words: NLTK stopwords for the configured language.
        domain_stopwords: Extra tender-specific words to discard
            (see ``_get_domain_stopwords``).
        lemmatizer: The ``WordNetLemmatizer`` applied to surviving tokens.
    """

    # Tokens shorter than this are discarded before lemmatization.
    _MIN_TOKEN_LENGTH = 3

    # A *well-formed* lower-case Roman numeral (i ... mmmcmxcix), meant to be
    # used with ``fullmatch`` on a single non-empty token. The previous
    # character-class pattern ``[ivxlcdm]+`` also deleted ordinary words such
    # as "mild", "civil", "lid" or "vivid". Words that happen to be valid
    # numerals (e.g. "mix" = 1009) are still treated as numerals.
    _ROMAN_NUMERAL = re.compile(
        r'm{0,3}(?:cm|cd|d?c{0,3})(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})'
    )

    def __init__(self, language: str = 'english'):
        """
        Initialize the text preprocessor with necessary NLTK downloads and configurations.

        Args:
            language (str): Language for stopwords (default: 'english')
        """
        # Download required NLTK resources; a failure here is only reported,
        # because the corpora may already be present locally.
        try:
            nltk.download('stopwords', quiet=True)
            nltk.download('wordnet', quiet=True)
        except Exception as e:
            print(f"Warning: NLTK resource download failed: {e}")

        self.stop_words = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()
        self.domain_stopwords = self._get_domain_stopwords()

    def _get_domain_stopwords(self) -> Set[str]:
        """Return the domain-specific stopwords.

        The literal repeats many entries; that is harmless because the result
        is a set. Entries containing punctuation or a space ('g.e. road',
        'no.', 'a.', 'aiims.jdh') can never equal a token, since ``clean_text``
        replaces every non-letter with a space before splitting. One- and
        two-letter entries are likewise already covered by the minimum token
        length. They are kept so the returned set is unchanged.
        """
        return {
            'qty', 'nos', 'cm', 'mm', 'ps', 'set', 'technical', 'specification', 'provide', 'copy', 'right', 'total', 'rejected',
            'section', 'vii', 'fdabiseuropean', 'u', 'x', 'l', 'b', 'c', 'd',
            'e', 'f', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
            'y', 'z', 'na', 'page', 'pageof', 'hz', 'ac', 'sr', 'general', 'point', 'department',
            'neurosurgery', 'otorhinolaryngology', 'item', 'name', 'hr', 'year', 'unit', 'camc', 'item',
            'specification', 'page', 'tender', 'jdh', 'qty', 'department', 'supply',
            'installation', 'bid', 'aiims', 'jodhpur', 'rishikesh', 'delhi', 'raipur', 'one',
            'following', 'procurement', 'services', 'contract', 'document', 'date', 'number',
            'submission', 'address', 'financial', 'technical', 'criteria', 'validity', 'opening',
            'closing', 'terms', 'conditions', 'authority', 'officer', 'manager', 'section',
            'schedule', 'reference', 'project', 'quantity', 'value', 'requirement', 'agreement',
            'proposal', 'evaluation', 'process', 'deadline', 'signature', 'quotation', 'form',
            'office', 'contact', 'details', 'phone', 'email', 'fax', 'cost', 'price', 'information',
            'admn', 'bidder', 'within', 'days', 'length', 'working', 'eprocure', 'gov', 'bidders',
            'indian', 'institute', 'nos', 'cm', 'mm', 'ps', 'set', 'technical', 'specifications',
            'section', 'vii', 'u', 'ce', 'x', 'l', 'b', 'c', 'd', 'e', 'f', 'h', 'th', 'tatibandh', 'g.e. road',
            'cg', 'tele', 'website', 'no.', 'neuro', 'gte',
            'i', 'j', 'k', 'l', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'na', 'least',
            'page', 'pageof', 'hz', 'ac', 'sr', 'etc', 'may', 'either', 'general', 'points', 'aiims',
            'would', 'etc', 'neurosurgery', 'a.', 'hrs', 'must', 'quoted', 'separately',
            'otorhinolaryngology', 'required', 'name', 'hr', 'years', 'unit', 'camc', 'two',
            'able', 'available', 'india', 'provided', 'patna', 'weather', 'without', 'shall',
            'also', 'aiimsjdh', 'nagpur', 'bathinda', 'rish', 'inch', 'good', 'supplier',
            'purchaser', 'aiimskalyani', 'kalyani', 'list', 'require', 'document', 'measurement',
            'pagetender', 'along', 'bhubaneswar', 'bbsr', 'gem', 'per', 'time', 'odisha',
            'annexure', 'year', 'aiimsmg', 'proc', 'gte', 'mangalagiri', 'goods',
            'consignee', 'tenderer', 'period', 'bhopal', 'saket', 'nagar', 'aiims.jdh'
        }

    def _is_stopword(self, token: str) -> bool:
        """Return True if ``token`` is a general or domain-specific stopword."""
        return token in self.stop_words or token in self.domain_stopwords

    def _filter_tokens(self, tokens: List[str]) -> List[str]:
        """Lemmatize ``tokens``, dropping short words, numerals and stopwords."""
        kept = []
        for token in tokens:
            if len(token) < self._MIN_TOKEN_LENGTH:
                continue
            if self._is_stopword(token):
                continue
            if self._ROMAN_NUMERAL.fullmatch(token):
                continue
            lemma = self.lemmatizer.lemmatize(token)
            # Re-check after lemmatization: an inflected form such as
            # "tenders" is not listed itself, but its lemma "tender" is.
            if self._is_stopword(lemma):
                continue
            kept.append(lemma)
        return kept

    def clean_text(self, text: Optional[str]) -> str:
        """
        Clean and preprocess text for TF-IDF vectorization.

        Args:
            text (str): Input text to clean. ``None``, NaN, the empty string
                and any other non-string value yield ``''``.

        Returns:
            str: Space-joined lemmas, or ``''`` when there is nothing to clean
            or when cleaning fails (the failure is printed as a warning).
        """
        # Only real, non-empty strings carry text. The isinstance check also
        # covers None/NaN and avoids calling pd.isna on list-like cells, whose
        # array result cannot be used as a truth value.
        if not isinstance(text, str) or not text:
            return ''

        try:
            # Basic cleaning using cleantext
            text = clean(
                text.lower(),
                fix_unicode=True,
                to_ascii=True,
                no_line_breaks=True,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
                no_numbers=True,
                no_digits=True,
                no_currency_symbols=True,
                no_punct=False,
                replace_with_url=" ",
                replace_with_email=" ",
                replace_with_phone_number=" ",
                replace_with_number=" ",
                replace_with_digit=" ",
                replace_with_currency_symbol=" ",
                lang="en"
            )

            # Treat hyphens and slashes as word separators
            text = text.replace('-', ' ').replace('/', ' ')

            # Re-join short fragments split across a line break, then flatten
            # any remaining line breaks.
            # NOTE(review): with no_line_breaks=True, clean() has presumably
            # already removed line breaks, which would make these two
            # substitutions no-ops - confirm against the clean-text docs.
            text = re.sub(r'(\b\w{1,5})\n(\w{1,5}\b)', r'\1\2', text)
            text = re.sub(r'\n+', ' ', text)

            # Blank out everything that is not a letter or whitespace. This
            # also removes digits, so no separate digit pass is needed.
            text = re.sub(r'[^a-zA-Z\s]', ' ', text)

            return ' '.join(self._filter_tokens(text.split()))

        except Exception as e:
            # Deliberate best-effort: one bad cell must not abort a whole
            # DataFrame pass, so report the problem and return empty text.
            print(f"Warning: Error processing text: {e}")
            return ''

    def process_dataframe(self, df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """
        Process multiple columns in a DataFrame.

        Args:
            df (pd.DataFrame): Input DataFrame (not modified)
            columns (List[str]): List of column names to process

        Returns:
            pd.DataFrame: Copy of ``df`` with the listed columns cleaned.
            Names in ``columns`` that are absent from ``df`` are skipped
            with a printed warning.
        """
        df_copy = df.copy()
        for column in columns:
            if column not in df_copy.columns:
                # Make typos in column names visible instead of silently
                # returning an unprocessed frame.
                print(f"Warning: column '{column}' not found; skipping")
                continue
            df_copy[column] = df_copy[column].apply(self.clean_text)
        return df_copy

# Run the cleaning pipeline over the loaded records
if __name__ == "__main__":
    # Build the cleaner with its default English stopword list
    preprocessor = TextPreprocessor(language='english')

    # Clean the two free-text columns; the result is a new frame and `df` is left as loaded
    columns_to_process = ['content', 'technical_specification']
    processed_df = preprocessor.process_dataframe(df=df, columns=columns_to_process)

In [16]:
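# NOTE: Superseded draft of the cleaning logic, kept commented out for reference only.
# The TextPreprocessor class defined earlier replaces it. Unlike the class, this draft
# also strips the phrase "india institute medical science" and does not drop short tokens.
# import re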
# from cleantext import clean
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# import nltk
# import pandas as pd  # Ensure pandas is imported

# # Download the required resources from nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

# # Initialize stopwords and lemmatizer
# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()

# def clean_text(text):
#     # Check if the input is None
#     if text is None:
#         return ''  # Return an empty string or a placeholder

#     # Convert to lowercase and clean the text
#     text = clean(
#         text.lower(),
#         fix_unicode=True,
#         to_ascii=True,
#         no_line_breaks=True,
#         no_urls=True,
#         no_emails=True,
#         no_phone_numbers=True,
#         no_numbers=True,
#         no_digits=True,
#         no_currency_symbols=True,
#         no_punct=False,  # Keep punctuation for later processing
#         replace_with_url=" ",
#         replace_with_email=" ",
#         replace_with_phone_number=" ",
#         replace_with_number=" ",
#         replace_with_digit=" ",
#         replace_with_currency_symbol=" ",
#         lang="en"
#     )
    
#     # Replace hyphens and slashes with a space
#     text = text.replace('-', ' ').replace('/', ' ')
    
# # Replace newlines between words without adding a space if both words have 5 or fewer characters
#     text = re.sub(r'(\b\w{1,5})\n(\w{1,5}\b)', r'\1\2', text)
    
#     # Replace remaining newlines with a space
#     text = re.sub(r'\n+', ' ', text)
    
#     # Remove numbers and punctuation/non-alphabetic characters
#     text = re.sub(r'\b\d+\b', '', text)
#     text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
#     # Remove specific phrases and patterns
#     text = re.sub(r'india\s+institute\s+medical\s+science', ' ', text)
#     text = re.sub(r'\b[ivxlcdm]+\b', ' ', text)  # Remove Roman numerals
    
#     # Tokenize the text
#     words = text.split()
    
#     # Lemmatize and filter out both normal and domain-specific stopwords
#     domain_stopwords = {
#         'qty', 'nos', 'cm', 'mm', 'ps', 'set', 'technical', 'specification','provide','copy','right','total','rejected', 
#         'section', 'vii', 'fdabiseuropean', 'u', 'x', 'l', 'b', 'c', 'd',
#         'e', 'f', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
#         'y', 'z', 'na', 'page', 'pageof', 'hz', 'ac', 'sr', 'general', 'point', 'department',
#         'neurosurgery', 'otorhinolaryngology', 'item', 'name', 'hr', 'year', 'unit', 'camc', 'item', 
#         'specification', 'page', 'tender', 'jdh', 'qty', 'department', 'supply',
#         'installation', 'bid', 'aiims', 'jodhpur', 'rishikesh', 'delhi', 'raipur', 'one', 
#         'following', 'procurement', 'services', 'contract', 'document', 'date', 'number', 
#         'submission', 'address', 'financial', 'technical', 'criteria', 'validity', 'opening', 
#         'closing', 'terms', 'conditions', 'authority', 'officer', 'manager', 'section', 
#         'schedule', 'reference', 'project', 'quantity', 'value', 'requirement', 'agreement', 
#         'proposal', 'evaluation', 'process', 'deadline', 'signature', 'quotation', 'form', 
#         'office', 'contact', 'details', 'phone', 'email', 'fax', 'cost', 'price', 'information',
#         'admn', 'bidder', 'within', 'days', 'length', 'working', 'eprocure', 'gov', 'bidders', 
#         'indian', 'institute', 'nos', 'cm', 'mm', 'ps', 'set', 'technical', 'specifications', 
#         'section', 'vii', 'u', 'ce', 'x', 'l', 'b', 'c', 'd', 'e', 'f', 'h', 'th', 'tatibandh', 'g.e. road', 
#         'cg', 'tele', 'website', 'no.', 'neuro', 'gte', 
#         'i', 'j', 'k', 'l', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'na', 'least',
#         'page', 'pageof', 'hz', 'ac', 'sr', 'etc', 'may', 'either', 'general', 'points', 'aiims', 
#         'would', 'etc', 'neurosurgery', 'a.', 'hrs', 'must', 'quoted', 'separately', 
#         'otorhinolaryngology', 'required', 'name', 'hr', 'years', 'unit', 'camc', 'two', 
#         'able', 'available', 'india', 'provided', 'patna', 'weather', 'without', 'shall', 
#         'also', 'aiimsjdh', 'nagpur', 'bathinda', 'rish', 'inch', 'good', 'supplier', 
#         'purchaser', 'aiimskalyani', 'kalyani', 'list', 'require', 'document', 'measurement', 
#         'pagetender', 'along', 'bhubaneswar', 'bbsr', 'gem', 'per', 'time', 'odisha', 
#         'annexure', 'year', 'aiimsmg', 'proc', 'gte', 'mangalagiri', 'goods', 
#         'consignee', 'tenderer', 'period', 'bhopal', 'saket', 'nagar','aiims.jdh'
#     }
    
#     # Filter out normal stopwords and domain-specific stopwords
#     cleaned_words = [
#         lemmatizer.lemmatize(word) 
#         for word in words 
#         if word not in stop_words and word not in domain_stopwords
#     ]
    
#     # Join the cleaned words back into a string
#     cleaned_text = ' '.join(cleaned_words)
    
#     # Return the cleaned text
#     return cleaned_text

# # Apply the function to the 'content' and 'technical_specification' columns of the DataFrame (in place)
# df.loc[:, 'content'] = df['content'].apply(clean_text)
# df.loc[:, 'technical_specification'] = df['technical_specification'].apply(clean_text)


In [17]:
# Persist the cleaned frame as an indented, records-oriented JSON file
processed_df.to_json(
    path_or_buf='cleaned_files.json',
    orient='records',
    indent=4,
)

In [18]:
# Dump the *cleaned* frame for visual inspection. `df` still holds the raw
# text, because process_dataframe works on a copy and returns it as
# `processed_df`; exporting `df` here would show uncleaned data.
processed_df.to_html('check.html')