# WEEK 4: Feature Engineering and Model Training

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the outflow and inflow transaction tables from their parquet files.
# The generator is consumed in order, so outflows are read first, then inflows.
outflow, inflow = (
    pd.read_parquet(path)
    for path in ("ucsd-outflows.pqt", "ucsd-inflows.pqt")
)

In [None]:
# Keep only the rows whose 'memo' differs from 'category', i.e. the memos that
# still need cleaning. The explicit .copy() makes this an independent frame, so
# the assignment below modifies it directly instead of writing into a slice of
# `outflow` (which is what triggers pandas' SettingWithCopyWarning).
outflow_uncleaned = outflow[outflow['memo'] != outflow['category']].copy()

# Clean the memo text with vectorised string methods. Unlike a per-row
# `apply(lambda x: x.lower())`, these propagate missing values instead of
# raising on them.
outflow_uncleaned['memo'] = (
    outflow_uncleaned['memo']
    .str.lower()                                 # lower case all values
    .str.replace(r'[^a-z\s]', ' ', regex=True)   # remove special characters and numbers
    .str.replace(r'xxx+', ' ', regex=True)       # remove placeholders
    .str.replace(r'\s+', ' ', regex=True)        # collapse runs of whitespace
    .str.strip()                                 # drop leading/trailing spaces
)

# Observe the first 5000 rows of the cleaned 'memo' column
print(outflow_uncleaned.iloc[:5000, :]['memo'].to_string())

In [None]:
# Rebuild the full outflow table: rows whose memo already equals the category
# are taken as-is, stacked with the freshly cleaned rows, and put back in the
# original row order. 'memo_default' keeps the untouched memo from `outflow`
# (aligned on the index) next to the cleaned 'memo'.
outflow_cleaned = outflow.loc[outflow['memo'].eq(outflow['category'])]
outflow_memo = (
    pd.concat([outflow_uncleaned, outflow_cleaned])
    .sort_index()
    .assign(memo_default=outflow['memo'])
    [[
        'prism_consumer_id',
        'prism_account_id',
        'memo_default',
        'memo',
        'amount',
        'posted_date',
        'category',
    ]]
)
outflow_memo

In [None]:
# Work on a copy so that later feature columns do not leak into outflow_memo
outflow_data = outflow_memo.copy()

# Split at the customer level: every transaction of a given consumer ends up
# entirely in the train partition or entirely in the test partition.
customer_id = pd.unique(outflow_data['prism_consumer_id'])

train_id, test_id = train_test_split(
    customer_id,
    test_size=0.25,
    random_state=42,
)

train_data = outflow_data.loc[lambda df: df['prism_consumer_id'].isin(train_id)]
test_data = outflow_data.loc[lambda df: df['prism_consumer_id'].isin(test_id)]

In [None]:
# Generate TF-IDF features from the cleaned memo column.
# The vocabulary and IDF weights are learned from the training customers only,
# then applied to every row of outflow_data. Previously the TF-IDF matrix held
# only the training rows but was concatenated positionally with features of the
# whole dataset, which paired TF-IDF vectors with the wrong transactions and
# left NaN TF-IDF values for every row beyond len(train_data).
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_vectorizer.fit(train_data['memo'])
tfidf_features = tfidf_vectorizer.transform(outflow_data['memo']).toarray()

# Convert TF-IDF features to DataFrame, reusing outflow_data's index so the
# column-wise concat below lines each vector up with its own transaction
tfidf_df = pd.DataFrame(
    tfidf_features,
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=outflow_data.index,
)

# Convert 'posted_date' to datetime
date_series = pd.to_datetime(outflow_data['posted_date'])

# Create new date-based features
outflow_data['day_of_week'] = date_series.dt.dayofweek
outflow_data['day_of_month'] = date_series.dt.day

# Create an indicator if the amount is a whole dollar
outflow_data['whole_dollar'] = (outflow_data['amount'] % 1 == 0).astype(int)

# Combine TF-IDF features with date/amount-based features (one row per transaction)
features = pd.concat(
    [tfidf_df, outflow_data[['day_of_week', 'day_of_month', 'whole_dollar', 'amount']]],
    axis=1,
)
labels = outflow_data['category']

# Split data using the customer-level partition (train_id / test_id) rather
# than a fresh random row split, so no customer contributes transactions to
# both the training and the evaluation set.
is_train = outflow_data['prism_consumer_id'].isin(train_id)
is_test = outflow_data['prism_consumer_id'].isin(test_id)
X_train, y_train = features[is_train], labels[is_train]
X_test, y_test = features[is_test], labels[is_test]

# Train Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Evaluate
y_pred = logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Logistic Regression F1-Score:", f1_score(y_test, y_pred, average='weighted'))

# Train Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest F1-Score:", f1_score(y_test, y_pred_rf, average='weighted'))