In [2]:
# Required Libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_recall_fscore_support, classification_report
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import matplotlib.pyplot as plt
import pickle
import mlflow
import mlflow.sklearn
import seaborn as sns

In [None]:
# Download the NLTK resources used by the preprocessing step.
# nltk.download() leaves packages that are already installed and up to date
# untouched, so re-running this cell is safe.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# (3.9 and later) load the tokenizer tables for word_tokenize from 'punkt_tab';
# without it, tokenization raises LookupError on a fresh environment.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Preprocessing Utilities
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared by preprocess_text: one lemmatizer instance and the English stop-word
# list as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [4]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The text is lower-cased and tokenized with NLTK. Tokens that are not purely
    alphabetic, or that appear in ``stop_words``, are dropped; the remaining
    tokens are lemmatized with the module-level ``lemmatizer``.

    Returns an empty string for any non-string input (e.g. a float NaN).
    """
    if not isinstance(text, str):
        return ""

    kept = []
    for word in nltk.word_tokenize(text.lower()):
        # Skip punctuation/numbers and stop words before lemmatizing.
        if not word.isalpha() or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)

In [5]:
# Paths and Directories
# Artifacts are written to a date-stamped folder (run_YYYY-MM-DD), so several
# runs on the same day share one folder.
today = datetime.today().strftime("%Y-%m-%d")

# The artifacts root can be redirected with the TOPIC_MODELING_ARTIFACTS_DIR
# environment variable so the notebook can run on other machines; when the
# variable is unset the original location is used.
artifacts_root = os.environ.get(
    "TOPIC_MODELING_ARTIFACTS_DIR",
    "C:/Users/misty.hickman/OneDrive/Documents/dev_work/TopicModeling/model_artifacts",
)
run_dir = f"{artifacts_root}/run_{today}"
visualizations_dir = f"{run_dir}/visualizations"

# exist_ok=True makes the cell safe to re-run.
os.makedirs(run_dir, exist_ok=True)
os.makedirs(visualizations_dir, exist_ok=True)

In [6]:
# Load the data
# The CSV path is relative, so it is resolved against the kernel's current
# working directory.
data_path = "research_review_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [7]:
# Preprocess Text Columns
# Clean each free-text column with preprocess_text (non-string cells become "").
reviews_clean = df['reviews'].apply(preprocess_text)
responses_clean = df['response_reviews'].apply(preprocess_text)

df['processed_reviews'] = reviews_clean
df['processed_response_reviews'] = responses_clean
# One document per row: review text followed by the response text.
df['combined_text'] = reviews_clean + " " + responses_clean

In [8]:
# Calculate Duration Time
# Parse both date columns, then express end - start in hours.
for date_col in ('start_date', 'end_date'):
    df[date_col] = pd.to_datetime(df[date_col])

elapsed = df['end_date'] - df['start_date']
df['duration_time'] = elapsed.dt.total_seconds() / 3600  # Hours


In [9]:
# Prepare Data for LDA
# Whitespace-split each cleaned document into a token list.
texts = df['combined_text'].str.split()
# Build the gensim vocabulary from the token lists.
dictionary = Dictionary(documents=texts)
# Convert every document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Preview the first rows of the frame, including the derived columns.
df.head(n=5)

In [11]:
# Train LDA Model
# random_state is fixed so repeated runs use the same seed.
num_topics = 10
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=42,
)


In [12]:
# Save LDA Artifacts
# Model and dictionary use gensim's own save(); the bag-of-words corpus is a
# plain Python list and is pickled.
lda_model_path = os.path.join(run_dir, "lda_model")
dictionary_path = os.path.join(run_dir, "dictionary.dict")
corpus_path = os.path.join(run_dir, "corpus.pkl")

lda_model.save(lda_model_path)
dictionary.save(dictionary_path)
with open(corpus_path, "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)

In [13]:
# Load LDA Model Artifacts
# Read the three artifacts back from this run's directory.
saved_model_path = os.path.join(run_dir, "lda_model")
saved_dictionary_path = os.path.join(run_dir, "dictionary.dict")
saved_corpus_path = os.path.join(run_dir, "corpus.pkl")

logged_model = LdaModel.load(saved_model_path)
logged_dictionary = Dictionary.load(saved_dictionary_path)
# pickle.load executes arbitrary code from the file; only use it on files this
# notebook wrote itself.
with open(saved_corpus_path, "rb") as corpus_file:
    logged_corpus = pickle.load(corpus_file)

In [14]:
# TF-IDF Vectorization
# Vocabulary is capped at 5000 terms; terms in more than 95% of documents or in
# fewer than 2 documents are dropped.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    max_df=0.95,
    min_df=2,
)
X_tfidf = tfidf_vectorizer.fit_transform(df['combined_text'])

In [None]:
# Train Random Forest Regressor
# Features are the TF-IDF matrix; the target is the duration in hours.
# NOTE(review): X_tfidf was fit on every row before this split, so the test
# rows contributed to the vocabulary/IDF weights - consider fitting the
# vectorizer on the training rows only.
X, y = X_tfidf, df['duration_time']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Evaluate Model
# Score the held-out 20% split; MSE is in squared hours.
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")


In [17]:
# Feature Importance Plot
# Rank the TF-IDF terms by the forest's feature importances and keep the 20
# highest, in descending order.
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1][:20]  # Top 20 features
# Fetch the vocabulary once; calling get_feature_names_out() inside the
# comprehension rebuilt the full array for every index.
feature_names = tfidf_vectorizer.get_feature_names_out()
top_features = [feature_names[i] for i in indices]

In [None]:
# Horizontal bar chart of the top features. Both sequences are reversed because
# barh draws the first entry at the bottom, which puts the most important term
# at the top of the chart.
plt.figure(figsize=(10, 6))
plt.barh(top_features[::-1], importances[indices][::-1])
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.ylabel("Feature")
# Keep long term labels from being clipped in the saved image.
plt.tight_layout()
plt.savefig(os.path.join(visualizations_dir, "feature_importance.png"))
plt.show()

In [None]:
# Performance Metrics Plot
# Predicted vs. true durations on the test split; the dashed red diagonal marks
# where a perfect prediction would fall.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6)

lowest, highest = min(y_test), max(y_test)
ax.plot([lowest, highest], [lowest, highest], color="red", linestyle="--")

ax.set_title("Performance Metrics Plot")
ax.set_xlabel("True Values")
ax.set_ylabel("Predicted Values")

fig.savefig(os.path.join(visualizations_dir, "performance_metrics.png"))
plt.show()