1. Import Libraries

In [90]:
import pandas as pd
import os
import re
from datetime import datetime


2. Load the Excel File


In [91]:
# Define file path
# NOTE(review): this is an absolute Windows path under one user's Desktop, so the
# notebook only runs unchanged on a machine where the workbook exists at this
# exact location. Consider a path relative to a configurable data directory.
file_path = r'C:\Users\FINRISE\Desktop\Task data scie\case_study_FTE\case_study_FTE\case_study_1\data\section_one_data\Emails_CS.xlsx'


In [92]:
# Echo the configured input path so it can be checked visually before loading.
file_path

'C:\\Users\\FINRISE\\Desktop\\Task data scie\\case_study_FTE\\case_study_FTE\\case_study_1\\data\\section_one_data\\Emails_CS.xlsx'

In [93]:
# Load the workbook at `file_path` into a DataFrame, then report how many
# rows and columns were read (before any cleaning).
email_data = pd.read_excel(io=file_path)
print("Initial shape:", email_data.shape)


Initial shape: (2036, 8)


In [94]:
# Discard rows that are exact duplicates of an earlier row (all columns compared),
# keeping the first occurrence.
email_data = email_data.drop_duplicates()

In [95]:
# Keep only rows that have both a file name and a creation date; rows missing
# either of these two fields are removed.
email_data = email_data.dropna(subset=['file_name', 'file_creation_date'])


In [96]:
# Parse 'file_creation_date' into datetimes. With errors='coerce', values that
# cannot be parsed become NaT instead of raising.
email_data = email_data.assign(
    file_creation_date=pd.to_datetime(email_data['file_creation_date'], errors='coerce')
)


In [97]:
# Remove rows whose creation date is missing after the datetime conversion
# (i.e. values that were coerced to NaT).
email_data = email_data.dropna(subset=['file_creation_date'])


In [98]:
# Derive 'file_extension' from 'file_name' only when the column is absent.
# The extension is whatever os.path.splitext returns, with the dot removed;
# null file names map to an empty string.
if 'file_extension' not in email_data.columns:
    email_data['file_extension'] = email_data['file_name'].map(
        lambda name: '' if pd.isnull(name) else os.path.splitext(name)[1].replace('.', '')
    )


In [99]:
# Columns the cleaned output must contain, listed in the order they should
# appear. Any that are absent from the data are added later with placeholders.
required_cols = [
    'file_name',
    'file_content',
    'file_id',
    'folder_id',
    'file_extension',
    'file_creation_date',
    'file_subtype',
    'created_by',
]


In [100]:
# Back-fill every required column that the frame does not already have with
# the placeholder string 'N/A'; existing columns are left untouched.
for col in required_cols:
    if col in email_data.columns:
        continue
    email_data[col] = 'N/A'


In [101]:
# Reorder the columns (and drop any column not listed in `required_cols`).
# An explicit copy is taken so that the frame is independent of the one it was
# selected from: a new column is assigned to it afterwards, and assigning to a
# selection can trigger pandas' SettingWithCopyWarning.
email_data = email_data[required_cols].copy()


In [102]:
# Function to extract dollar amounts from text
def extract_dollar_amount(text):
    """Return the first dollar amount found in ``text``, or ``None`` if there is none.

    ``text`` is passed through ``str()`` before searching, so non-string
    values are accepted. A match is a ``$`` followed by one or more digits,
    optional ``,ddd`` groups, and an optional two-digit decimal part. The
    matched substring is returned as-is, including the ``$`` sign.
    """
    first_match = re.search(r'\$\d+(?:,\d{3})*(?:\.\d{2})?', str(text))
    if first_match is None:
        return None
    # The pattern has no capturing groups, so group(0) is the whole amount.
    return first_match.group(0)


In [103]:
# Run the extractor over every 'file_content' value and store the result
# (the first dollar amount as text, or None) in a new 'dollar_amount' column.
email_data['dollar_amount'] = email_data['file_content'].map(extract_dollar_amount)


In [104]:
# Define output directory
# NOTE(review): absolute, machine-specific Windows path; the outputs can only be
# written here on a machine with this directory layout. Consider deriving it
# from a configurable base directory.
output_dir = r'C:\Users\FINRISE\Desktop\Task data scie\case_study_FTE\case_study_FTE\case_study_1\data\section_one_data\Email output'


In [105]:
# Create the output directory (including any missing parents); with
# exist_ok=True an already existing directory is not an error.
os.makedirs(name=output_dir, exist_ok=True)


In [106]:
# Build the full paths of the two export files (CSV and Excel) inside
# `output_dir`.
csv_file, excel_file = (
    os.path.join(output_dir, filename)
    for filename in (
        "Cleaned_Emails_Metadata_with_Dollar_Amounts.csv",
        "Cleaned_Emails_Metadata_with_Dollar_Amounts.xlsx",
    )
)


In [107]:
# Write the cleaned frame to both export formats; index=False leaves the
# DataFrame index out of the files.
email_data.to_csv(path_or_buf=csv_file, index=False)
email_data.to_excel(excel_writer=excel_file, index=False)



In [108]:
# Report where the two export files were written, one path per line.
print("Files saved successfully to:", csv_file, excel_file, sep="\n")


Files saved successfully to:
C:\Users\FINRISE\Desktop\Task data scie\case_study_FTE\case_study_FTE\case_study_1\data\section_one_data\Email output\Cleaned_Emails_Metadata_with_Dollar_Amounts.csv
C:\Users\FINRISE\Desktop\Task data scie\case_study_FTE\case_study_FTE\case_study_1\data\section_one_data\Email output\Cleaned_Emails_Metadata_with_Dollar_Amounts.xlsx
