In [3]:
import os
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.downloader import download

# Fetch the NLTK corpora used further down: the stopword lists and the
# WordNet data consumed by WordNetLemmatizer.
for resource in ('stopwords', 'wordnet'):
    download(resource)

# Step 1: Specify the folder on the desktop that holds the exported CSV files
desktop_folder = os.path.expanduser("~/Desktop/Preprocessing")  # Change 'Preprocessing' to point at a different folder

# Step 2: List all files in the folder and filter for CSV files
all_files = os.listdir(desktop_folder)
# The cleaned result is written back into this same folder (Step 5), so it has
# to be excluded here; otherwise a re-run could pick up its own previous output
# as the input file.
output_file_name = "Ideas_Cleaned.csv"
# Sorting makes the "first CSV" choice deterministic (os.listdir returns names
# in arbitrary order); the extension check is case-insensitive so '.CSV' works.
csv_files = sorted(
    f for f in all_files
    if f.lower().endswith('.csv') and f != output_file_name
)

if csv_files:
    # Step 3: Choose the first CSV file (or modify logic to select a specific file)
    file_path = os.path.join(desktop_folder, csv_files[0])
    print(f"Selected file: {file_path}")

    # Load the CSV file
    data = pd.read_csv(file_path)
    print("\nLoaded data preview:")
    print(data.head())

    # Step 4: Preprocess the text data
    # Drop rows with missing 'Title' or 'Description'. The explicit copy makes
    # `data` an independent frame, so the column assignments below cannot
    # trigger SettingWithCopyWarning.
    data = data.dropna(subset=['Title', 'Description']).copy()

    # Combine 'Title' and 'Description' into a new column 'CombinedText'.
    # astype(str) guards against a column that pandas parsed as numeric, where
    # `+ " "` would otherwise raise a TypeError.
    data['CombinedText'] = (
        data['Title'].astype(str) + " " + data['Description'].astype(str)
    )

    # Normalize text (convert to lowercase)
    data['CombinedText'] = data['CombinedText'].str.lower()

    # Remove special characters and numbers, then collapse runs of whitespace
    # into single spaces and trim the ends. Patterns are raw strings so that
    # `\s` is a regex class rather than an invalid string escape.
    # NOTE: characters are deleted, not replaced by a space, so tokens joined
    # by punctuation are merged (e.g. "user-friendly" -> "userfriendly").
    data['CombinedText'] = (
        data['CombinedText']
        .str.replace(r'[^a-zA-Z\s]', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Remove stopwords (a set gives O(1) membership tests per word)
    stop_words = set(stopwords.words('english'))
    data['CleanedText'] = data['CombinedText'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_words)
    )

    # Lemmatize the text word by word
    lemmatizer = WordNetLemmatizer()
    data['LemmatizedText'] = data['CleanedText'].apply(
        lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split())
    )

    # Filter out rows with very short text (10 characters or fewer)
    data = data[data['LemmatizedText'].str.len() > 10]

    # Display the cleaned data
    print("\nPreprocessed data preview:")
    print(data[['Title', 'Description', 'LemmatizedText']].head())

    # Step 5: Save preprocessed data to a new CSV file
    output_file_path = os.path.join(desktop_folder, output_file_name)
    data.to_csv(output_file_path, index=False)
    print(f"\nPreprocessed data saved to: {output_file_path}")
else:
    print("No CSV files found in the folder.")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexthorpe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexthorpe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Selected file: /Users/alexthorpe/Desktop/Preprocessing/Ideas-2025-01-06-17-01-48-1540.csv

Loaded data preview:
     Code              Category         Submitted             Submitter  \
0  D27603           Enhancement  12/20/2024 18:54               J Leone   
1  D27602           Enhancement  12/19/2024 10:25               J Leone   
2  D27600           Enhancement  12/18/2024 11:09  Brian Wermerskirchen   
3  D27599           Enhancement  12/17/2024 15:53       Emerson Lambert   
4  D27595  Reporting/Dashboards  12/13/2024 19:59         France Dreyer   

                  Submitter email Team Name Submission Team  \
0           jleone@brightidea.com       NaN             NaN   
1           jleone@brightidea.com       NaN             NaN   
2  bwermerskirchen@brightidea.com       NaN             NaN   
3         elambert@brightidea.com       NaN             NaN   
4          fdreyer@brightidea.com       NaN             NaN   

  Submission Team email                                   