### 1. Data Collection

In [None]:
import pandas as ps

# Load the raw dataset. pandas is imported as `ps` in this cell, so the reader
# must be reached through that alias (a bare `read_csv` is undefined).
df = ps.read_csv('file_path')  # assuming the file is a CSV

### 2. Data Exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of the class distribution: number of non-null emails per category.
emails_per_category = df.groupby('category')['emails_data'].count()
emails_per_category.plot.bar(ylim=0)
plt.show()

# Row count per category (rendered as cell output when run in a notebook).
df['category'].value_counts()

# Drop rows whose email text is missing.
has_email_text = df['emails_data'].notna()
df = df[has_email_text]

# Drop rows whose category label is missing.
has_category = df['category'].notna()
df = df[has_category]

### 3. Data Cleaning

In [None]:
from bs4 import BeautifulSoup
from html import unescape

# function that creates a new column holding the email text with HTML entities decoded and HTML markup stripped

def remove_url(df):
    """Add a ``clean_email_data`` column containing each email as plain text.

    Despite its name, this function does not strip URLs: it decodes HTML
    entities (``&amp;`` -> ``&``) and then removes HTML markup, keeping only
    the visible text.

    Args:
        df: DataFrame with an ``emails_data`` column of strings.

    Returns:
        The same DataFrame object, modified in place, with the new
        ``clean_email_data`` column added (or overwritten).
    """

    def _strip_markup(raw_text):
        # Decode entities first so escaped markup (e.g. "&lt;b&gt;") is
        # turned into real tags and removed by the parser as well.
        decoded_text = unescape(raw_text)
        # Name the parser explicitly: relying on the default makes the
        # result depend on which parsers are installed and emits a warning.
        return BeautifulSoup(decoded_text, 'html.parser').get_text()

    # A column-wise apply avoids positional lookups through the label index
    # (which break once rows have been filtered out) and avoids chained
    # assignment, which pandas does not guarantee writes back to `df`.
    df['clean_email_data'] = df['emails_data'].apply(_strip_markup)
    return df

### 4. Text Processing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')  # corpus required by WordNetLemmatizer

# Lower-case, trim, and flatten line breaks so each email is a single line.
df['lower_case'] = df['emails_data'].apply(lambda x: x.lower().strip().replace('\n', ' ').replace('\r', ' '))

# Drop anything that looks like an http/https link.
df['without-link'] = df['lower_case'].apply(lambda x: re.sub(r'http\S+', '', x))

tokenizer = RegexpTokenizer(r'\w+')

# Tokenize the link-free text; tokenizing 'lower_case' here would silently
# discard the URL removal done in the previous step.
df['Special_word'] = df['without-link'].apply(tokenizer.tokenize)

# A set gives O(1) membership tests in the per-token filter below.
stop = set(stopwords.words('english'))

df['stop_words'] = df['Special_word'].apply(lambda x: [item for item in x if item not in stop])

# Stringify the token lists so .str.findall can re-extract the tokens,
# keeping only those that are at least two characters long.
df['stop_words'] = df['stop_words'].astype('str')

df['short_word'] = df['stop_words'].str.findall(r'\w{2,}')

df['string'] = df['short_word'].str.join(' ')

# Lemmatize every remaining token (WordNet lemmatizer, default noun POS).
lemmatizer = WordNetLemmatizer()
df['Text'] = df['string'].apply(lambda x: " ".join(lemmatizer.lemmatize(word) for word in x.split()))

# Train/test split and TF-IDF vectorization (category labels are passed to scikit-learn as-is; no numeric label encoding is performed here)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Split on the cleaned, lemmatized 'Text' column. Splitting on the raw
# 'emails_data' column would bypass every preprocessing step applied earlier.
x_train, x_test, y_train, y_test = train_test_split(
    df["Text"], df["category"], test_size=0.25, random_state=42
)

# Unigram + bigram counts, then TF-IDF weighting with sublinear term frequency
# and L2 row normalisation.
count_vect = CountVectorizer(ngram_range=(1, 2))
transformer = TfidfTransformer(norm='l2', sublinear_tf=True)

# Fit the vocabulary and IDF weights on the training split only...
x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

# ...and reuse them unchanged on the test split to avoid leaking test data.
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

### 5. Random Forest Implementation

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 300 trees, each limited to depth 15, with a fixed seed for reproducibility.
# class_weight='balanced' is requested so that classes are weighted rather
# than treated uniformly.
rfc = RandomForestClassifier(n_estimators=300, max_depth=15, random_state=42, class_weight='balanced')
rfc.fit(x_train_tfidf, y_train)

### 6. Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score, recall_score

# Predict on the held-out TF-IDF features, then report overall accuracy
# followed by the per-class precision/recall/F1 table.
y_pred = rfc.predict(x_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: " + str(accuracy))

per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

### Sample Code for Extracting price from text

In [None]:
import re

# A number with optional thousands separators and up to two decimal places,
# e.g. "150", "150.45", "1,234.56". The (?!\d) guard stops a malformed group
# such as "1,2345" from being read as "1,234".
_AMOUNT = r'((?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d{1,2})?)'

# Patterns are tried in this order; the first one that matches wins.
_CURRENCY_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\$\s?' + _AMOUNT,     # $ followed by an amount
        _AMOUNT + r'\s?USD',    # amount followed by USD
        _AMOUNT + r'\s?EUR',    # amount followed by EUR
    )
)


def extract_total_amount(text):
    """Return the first currency amount found in *text* as a float.

    Recognises ``$<amount>``, ``<amount> USD`` and ``<amount> EUR`` (checked
    in that order). Amounts may use commas as thousands separators, which are
    removed before conversion.

    Args:
        text: The text to search. ``None`` or an empty string yields ``None``.

    Returns:
        The extracted amount as a float, or ``None`` when no pattern matches.
    """
    if not text:
        return None

    for pattern in _CURRENCY_PATTERNS:
        match = pattern.search(text)
        if match:
            # Strip thousands separators so float() accepts the number.
            return float(match.group(1).replace(',', ''))

    return None

#Test the function with sample text
sample_text = "Your total payment is $150.45. Thank you!"
total_amount = extract_total_amount(sample_text)
# Compare against None explicitly: a plain truthiness check would report a
# legitimately extracted amount of 0.0 as "not found".
if total_amount is not None:
    print(f"Extracted Total Amount: ${total_amount:.2f}")
else:
    print("Total amount not found in the text.")