# In [None]:
import os
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.downloader import download

# Fetch the NLTK resources needed for stopword removal ('stopwords')
# and lemmatization ('wordnet') before any text processing starts.
for nltk_resource in ('stopwords', 'wordnet'):
    download(nltk_resource)

# Step 1: Specify the folder path on your desktop
desktop_folder = os.path.expanduser("~/Desktop/Preprocessing")  # Replace 'Preprocessing' with your folder's name


def select_input_csv_files(file_names, exclude=("Ideas_Cleaned.csv",)):
    """Return the usable input CSV names from *file_names*, sorted by name.

    Args:
        file_names: Iterable of bare file names (no directory part).
        exclude: File names that must never be treated as input. The default
            skips "Ideas_Cleaned.csv", the cleaned output this script writes
            into the same folder, so a second run does not re-process it.

    Returns:
        A sorted list of names ending in ".csv" (case-insensitive) that are
        not listed in *exclude*.
    """
    # Sorting makes "the first CSV file" deterministic: os.listdir() returns
    # entries in arbitrary order.
    return sorted(
        name for name in file_names
        if name.lower().endswith('.csv') and name not in exclude
    )


# Step 2: List all files in the folder and filter for CSV files
if os.path.isdir(desktop_folder):
    all_files = os.listdir(desktop_folder)
else:
    # Report a missing folder explicitly instead of failing with a bare
    # FileNotFoundError from os.listdir(); the script then finds no CSV files.
    print(f"Folder not found: {desktop_folder}")
    all_files = []
csv_files = select_input_csv_files(all_files)

# Columns that must be present, and non-null, for a row to be processed.
REQUIRED_FIELDS = ['Title', 'Description', 'Category']

# (context tag, source column) pairs, in the order they appear in the text columns.
TAGGED_FIELDS = [
    ('TITLE', 'Title'),
    ('DESCRIPTION', 'Description'),
    ('CATEGORY', 'Category'),
    ('DIFFERENCE', 'How is the proposed idea different from previous attempts?'),
    ('VALUE', 'What is in it for the customer, the department, the company?'),
]

# Matches the bracketed context tags ([TITLE], ..., [COMMENTS]) in a text column.
TAG_PATTERN = re.compile(r'\[[A-Z]+\]')

# Rows whose lemmatized text, tags excluded, is not longer than this are dropped.
MIN_TEXT_LENGTH = 10


def normalize_text(text):
    """Lowercase *text*, remove every non-letter character and collapse whitespace."""
    letters_only = re.sub(r'[^a-z\s]', '', str(text).lower())
    return ' '.join(letters_only.split())


def build_tagged_texts(texts_by_tag):
    """Combine per-field text lists into one '[TAG] text [TAG] text ...' string per row.

    Args:
        texts_by_tag: Mapping from each tag in TAGGED_FIELDS to a list of
            already-cleaned strings, one per row; all lists have equal length.

    Returns:
        A list with one combined string per row. A field whose text is empty
        contributes only its tag.
    """
    tags = [tag for tag, _ in TAGGED_FIELDS]
    rows = zip(*(texts_by_tag[tag] for tag in tags))
    return [
        ' '.join(f"[{tag}] {text}".strip() for tag, text in zip(tags, row))
        for row in rows
    ]


def content_length(text):
    """Return the length of *text* with context tags removed and whitespace collapsed."""
    return len(' '.join(TAG_PATTERN.sub(' ', str(text)).split()))


def final_refine_comment_text(comment):
    """Strip comment metadata and URLs from *comment* and tidy its whitespace.

    Args:
        comment: Raw comment value; it is converted with str() so non-string
            cells do not make re.sub raise TypeError.

    Returns:
        The comment text without "Comment from: ... entered on ...:" headers,
        leftover "Comment from:" markers, date-to-"@" fragments and URLs, with
        runs of whitespace collapsed to single spaces.
    """
    comment = str(comment)
    # Remove metadata such as "Comment from:" and "entered on"
    comment = re.sub(r"Comment from:.*?entered on.*?:", "", comment)
    # Remove remaining "Comment from:" markers (the former trailing lazy
    # ".*?" could only ever match the empty string, so it is dropped)
    comment = re.sub(r"Comment from:", "", comment)
    # Remove any stray metadata
    comment = re.sub(r"\d{1,2}/\d{1,2}/\d{2,4}.*?@", "", comment)
    # Remove URLs
    comment = re.sub(r"http\S+", "", comment)
    # Clean up whitespace, including gaps left behind by the removals above
    return ' '.join(comment.split())


if csv_files:
    # Step 3: Choose the first CSV file (or modify logic to select a specific file)
    file_path = os.path.join(desktop_folder, csv_files[0])
    print(f"Selected file: {file_path}")

    # Load the CSV file
    data = pd.read_csv(file_path)
    print("\nLoaded data preview:")
    print(data.head())

    # Step 4: Preprocess the text data
    # Fail early with a clear message when a required column is absent.
    missing_required = [column for column in REQUIRED_FIELDS if column not in data.columns]
    if missing_required:
        raise KeyError(f"Required column(s) missing from {file_path}: {missing_required}")

    # Drop rows with missing critical fields; copy so later column assignments
    # act on an independent frame.
    data = data.dropna(subset=REQUIRED_FIELDS).copy()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Clean every field on its own BEFORE adding the context tags. Cleaning the
    # already-tagged string would strip the brackets and lowercase the tags,
    # turning them into ordinary words in every row.
    normalized, cleaned, lemmatized = {}, {}, {}
    for tag, column in TAGGED_FIELDS:
        if column in data.columns:
            # Fill NaN in optional fields with an empty string
            data[column] = data[column].fillna('')
            raw_texts = list(data[column])
        else:
            # Optional column absent from this file: treat it as empty text.
            raw_texts = [''] * len(data)
        normalized[tag] = [normalize_text(text) for text in raw_texts]
        # Remove stopwords
        cleaned[tag] = [
            ' '.join(word for word in text.split() if word not in stop_words)
            for text in normalized[tag]
        ]
        # Lemmatize the text
        lemmatized[tag] = [
            ' '.join(lemmatizer.lemmatize(word) for word in text.split())
            for text in cleaned[tag]
        ]

    # Combine fields with context tags for better embeddings
    data['CombinedText'] = build_tagged_texts(normalized)
    data['CleanedText'] = build_tagged_texts(cleaned)
    data['LemmatizedText'] = build_tagged_texts(lemmatized)

    # Extract and clean comments for relevance
    if 'Comments' in data.columns:
        data['RefinedComments'] = data['Comments'].dropna().map(final_refine_comment_text)

        # Combine refined comments into LemmatizedText; rows without a comment,
        # or whose comment is empty after refinement, are left unchanged.
        data['LemmatizedText'] = [
            f"{text} [COMMENTS] {comment}" if isinstance(comment, str) and comment else text
            for text, comment in zip(data['LemmatizedText'], data['RefinedComments'])
        ]

    # Filter out rows with very short text (e.g., less than 10 characters).
    # Tags are excluded from the count so they cannot keep an empty row alive.
    long_enough = pd.Series(
        [content_length(text) > MIN_TEXT_LENGTH for text in data['LemmatizedText']],
        index=data.index,
        dtype=bool,
    )
    data = data[long_enough]

    # Display the cleaned data; 'RefinedComments' only exists when the input
    # has a 'Comments' column, so show just the columns that are present.
    preview_columns = [
        column
        for column in ['Title', 'Description', 'LemmatizedText', 'RefinedComments']
        if column in data.columns
    ]
    print("\nPreprocessed data preview:")
    print(data[preview_columns].head())

    # Step 5: Save preprocessed data to a new CSV file
    output_file_path = os.path.join(desktop_folder, "Ideas_Cleaned.csv")
    data.to_csv(output_file_path, index=False)
    print(f"\nPreprocessed data saved to: {output_file_path}")
else:
    print("No CSV files found in the folder.")
