In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import shap
import mlflow
import mlflow.sklearn
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources if not already available.
# 'punkt_tab' is the tokenizer resource nltk.word_tokenize looks up on
# NLTK >= 3.9 (it replaced the pickled 'punkt' models); 'punkt' is kept so
# older NLTK releases keep working. nltk.download reports an unknown resource
# instead of raising (raise_on_error defaults to False), so requesting both is safe.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Preprocessing Utilities
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Module-level singletons shared by preprocess_text / extract_text_features
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
sid = SentimentIntensityAnalyzer()

In [97]:
def preprocess_text(text):
    """Normalize raw text into a single space-joined string of lemmas.

    Anything that is not a ``str`` yields an empty string. Otherwise the text
    is lower-cased and tokenized with ``nltk.word_tokenize``; tokens that are
    not purely alphabetic or that appear in ``stop_words`` are dropped, and the
    survivors are lemmatized with the module-level ``lemmatizer``.
    """
    if isinstance(text, str):
        kept = []
        for word in nltk.word_tokenize(text.lower()):
            # Skip punctuation/numbers and stopwords before lemmatizing
            if not word.isalpha() or word in stop_words:
                continue
            kept.append(lemmatizer.lemmatize(word))
        return " ".join(kept)
    return ""

In [98]:
def extract_text_features(text):
    """Compute simple descriptive features for one piece of text.

    Returns a dict with three keys:
      - 'length': number of characters in ``text``
      - 'word_count': number of whitespace-separated chunks
      - 'sentiment_score': the 'compound' value from ``sid.polarity_scores``

    Non-string input produces the same keys, all set to 0.
    """
    if isinstance(text, str):
        return {
            'length': len(text),
            'word_count': len(text.split()),
            'sentiment_score': sid.polarity_scores(text)['compound'],
        }
    # Missing / non-text values get an all-zero feature row
    return dict.fromkeys(('length', 'word_count', 'sentiment_score'), 0)

In [99]:
def add_time_features(df):
    """Add time-based features derived from ``start_date`` to the dataframe.

    The frame is modified in place and also returned.

    Parameters:
    df (pd.DataFrame): Must contain a 'start_date' column parseable by
        ``pd.to_datetime``. An 'end_date' column is optional; when present it
        is converted to datetime as well.

    Returns:
    pd.DataFrame: The same frame with 'day_of_week', 'hour_of_day', 'month'
        and 'is_weekend' columns added.
    """
    df['start_date'] = pd.to_datetime(df['start_date'])
    # end_date is only needed for the duration target; frames built for
    # prediction carry just start_date, so do not require the column here.
    if 'end_date' in df.columns:
        df['end_date'] = pd.to_datetime(df['end_date'])

    # Extract time-based features
    start = df['start_date'].dt
    df['day_of_week'] = start.dayofweek
    df['hour_of_day'] = start.hour
    df['month'] = start.month
    # dayofweek 5 and 6 are flagged as the weekend
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    return df

In [100]:
# Paths and Directories
# The artifact root can be redirected with the MODEL_ARTIFACTS_DIR environment
# variable so the notebook runs on other machines; the original location
# remains the default when the variable is not set.
today = datetime.today().strftime("%Y-%m-%d")
artifacts_root = os.environ.get(
    "MODEL_ARTIFACTS_DIR",
    "C:/Users/misty.hickman/OneDrive/Documents/dev_work/TopicModeling/model_artifacts",
)
run_dir = f"{artifacts_root}/run_{today}"
visualizations_dir = f"{run_dir}/visualizations"
# makedirs creates intermediate directories, so this also creates run_dir
os.makedirs(visualizations_dir, exist_ok=True)

In [101]:
# Read the review dataset from a CSV in the working directory
data_path = "research_review_data.csv"
df = pd.read_csv(data_path)

# Capture the first row's raw fields before any preprocessing touches the
# frame, so they can be replayed through the prediction function later
sample_columns = {
    'review': 'reviews',
    'response': 'response_reviews',
    'start_date': 'start_date',
}
sample_data = {key: df[column].iloc[0] for key, column in sample_columns.items()}


In [102]:
# Clean each raw text column into a 'processed_<column>' counterpart
for raw_column in ('reviews', 'response_reviews'):
    df[f'processed_{raw_column}'] = df[raw_column].apply(preprocess_text)

# One combined document per row: cleaned review, a space, cleaned response
review_with_separator = df['processed_reviews'] + " "
df['combined_text'] = review_with_separator + df['processed_response_reviews']

In [103]:
# Build one feature row per document for each raw text column, with the
# column order pinned explicitly
text_feature_columns = ['length', 'word_count', 'sentiment_score']
review_features = pd.DataFrame(
    list(map(extract_text_features, df['reviews'])),
    columns=text_feature_columns,
)
response_features = pd.DataFrame(
    list(map(extract_text_features, df['response_reviews'])),
    columns=text_feature_columns,
)

In [104]:
# Namespace the two text-feature sets so their columns do not collide
review_features = review_features.add_prefix('review_')
response_features = response_features.add_prefix('response_')

# Attach the text features column-wise to the main dataframe
df = pd.concat([df, review_features, response_features], axis=1)

# Derive calendar features (this also converts the date columns to datetime)
df = add_time_features(df)

# Target: elapsed time between start and end, expressed in hours
elapsed = df['end_date'] - df['start_date']
df['duration_time'] = elapsed.dt.total_seconds() / 3600

In [105]:
# Tokenized documents -> gensim vocabulary -> bag-of-words corpus
texts = df['combined_text'].str.split()
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# Fit the LDA topic model (seeded so repeated runs use the same random state)
num_topics = 10
lda_settings = {
    'corpus': corpus,
    'num_topics': num_topics,
    'id2word': dictionary,
    'passes': 10,
    'random_state': 42,
}
lda_model = LdaModel(**lda_settings)

In [106]:
# Extract topic distributions as features
def get_topic_distribution(bow):
    """Return a document's topic probabilities as a dense list.

    The result always has ``lda_model.num_topics`` entries, indexed by topic
    id. gensim floors ``minimum_probability`` at 1e-8 even when 0 is
    requested, so ``get_document_topics`` may omit near-zero topics; placing
    each probability by its topic id (and leaving omitted topics at 0.0) keeps
    every row the same width and every value in the right column.

    Parameters:
    bow (list of (int, int)): Bag-of-words document, as produced by
        ``dictionary.doc2bow``.

    Returns:
    list: Topic probabilities, one per topic id.
    """
    distribution = [0.0] * lda_model.num_topics
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0):
        distribution[topic_id] = prob
    return distribution

topic_distributions = [get_topic_distribution(bow) for bow in corpus]
topic_features = pd.DataFrame(topic_distributions, columns=[f'topic_{i}' for i in range(num_topics)])


In [107]:
# TF-IDF over the combined text: vocabulary capped at 1000 terms, with
# scikit-learn's built-in English stop-word list applied
tfidf_settings = dict(max_features=1000, stop_words='english')
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)
X_tfidf = tfidf_vectorizer.fit_transform(df['combined_text'])

In [None]:
# Preview the first five rows of the enriched dataframe
df.head(n=5)

In [109]:
# Assemble the model matrix column-wise: TF-IDF terms | numeric columns | LDA topic weights
numeric_columns = [
    'review_length', 'review_word_count', 'review_sentiment_score',
    'response_length', 'response_word_count', 'response_sentiment_score',
    'day_of_week', 'hour_of_day', 'month', 'is_weekend',
]
numerical_features = df[numeric_columns].values

feature_parts = (X_tfidf.toarray(), numerical_features, topic_distributions)
X = np.hstack(feature_parts)

# Regression target: duration in hours
y = df['duration_time'].values

In [110]:
# Hold out 20% of the rows for evaluation (seeded split)
# NOTE(review): the TF-IDF and LDA columns in X were fit on the full dataset
# before this split, so the held-out rows influenced those features — confirm
# whether that is acceptable for the reported test metrics.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardize with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [111]:
# Hyperparameter candidates explored by the grid search
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)

# 5-fold cross-validated search over a seeded random forest, scored by
# negative mean squared error, with n_jobs=-1 for parallel fitting
base_forest = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

In [112]:
# Run the search on the scaled training data and keep the best estimator
# (fit returns the search object itself, so the attribute can be chained)
best_model = grid_search.fit(X_train_scaled, y_train).best_estimator_

# Predict durations for the held-out rows
y_pred = best_model.predict(X_test_scaled)



In [113]:
# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# MAPE is undefined where the actual duration is 0 (division by zero would
# turn the whole mean into inf/NaN), so it is computed over non-zero actuals
# only; if every actual is 0 the metric is reported as NaN.
nonzero_actual = y_test != 0
if nonzero_actual.any():
    relative_errors = (y_test[nonzero_actual] - y_pred[nonzero_actual]) / y_test[nonzero_actual]
    mape = np.mean(np.abs(relative_errors)) * 100
else:
    mape = np.nan

In [None]:
# Report the winning hyperparameters followed by the held-out metrics
metric_lines = [
    f"Best Parameters: {grid_search.best_params_}",
    f"Mean Squared Error: {mse:.2f}",
    f"Root Mean Squared Error: {rmse:.2f}",
    f"R-squared Score: {r2:.2f}",
    f"Mean Absolute Error: {mae:.2f}",
    f"Mean Absolute Percentage Error: {mape:.2f}%",
]
for line in metric_lines:
    print(line)

In [115]:
# Make sure the visualizations directory exists before any figure is written to it
os.makedirs(name=visualizations_dir, exist_ok=True)

In [116]:
# Scatter of actual vs. predicted durations with a dashed red identity line
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=2)
ax.set_xlabel('Actual Duration')
ax.set_ylabel('Predicted Duration')
ax.set_title('Actual vs Predicted Duration')
fig.tight_layout()
fig.savefig(os.path.join(visualizations_dir, 'actual_vs_predicted.png'))
# Close the figure so it is written to disk only and not kept open
plt.close(fig)

In [117]:
# Residuals (actual minus predicted) against the predictions, with a zero reference line
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Duration')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
fig.tight_layout()
fig.savefig(os.path.join(visualizations_dir, 'residuals.png'))
plt.close(fig)

In [118]:
# Label every model column in the order X was stacked:
# TF-IDF terms, then the numeric columns, then the LDA topics
numeric_feature_names = [
    'review_length', 'review_word_count', 'review_sentiment_score',
    'response_length', 'response_word_count', 'response_sentiment_score',
    'day_of_week', 'hour_of_day', 'month', 'is_weekend',
]
topic_feature_names = [f'topic_{i}' for i in range(num_topics)]
all_feature_names = (
    list(tfidf_vectorizer.get_feature_names_out())
    + numeric_feature_names
    + topic_feature_names
)

# Keep the 20 features with the largest forest importances
feature_importance = (
    pd.DataFrame({
        'feature': all_feature_names,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .head(20)
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, ax=ax)
ax.set_title('Top 20 Most Important Features')
fig.tight_layout()
fig.savefig(os.path.join(visualizations_dir, 'feature_importance.png'))
plt.close(fig)


In [119]:
# SHAP values for interpreting the forest; only the first 100 test rows are
# explained and plotted
explainer = shap.TreeExplainer(best_model)
shap_sample = X_test_scaled[:100]
shap_values = explainer.shap_values(shap_sample)

# Column labels in the order X was stacked: TF-IDF terms, numeric columns, topics
tfidf_terms = list(tfidf_vectorizer.get_feature_names_out())
numeric_labels = [
    'review_length', 'review_word_count', 'review_sentiment_score',
    'response_length', 'response_word_count', 'response_sentiment_score',
    'day_of_week', 'hour_of_day', 'month', 'is_weekend',
]
topic_labels = [f'topic_{i}' for i in range(num_topics)]
feature_names = tfidf_terms + numeric_labels + topic_labels

# If the labels do not line up with the matrix width, warn and fall back to
# generic positional names rather than mislabeling the plot
if len(feature_names) != X_test_scaled.shape[1]:
    print(f"Warning: Feature names count ({len(feature_names)}) doesn't match feature count ({X_test_scaled.shape[1]})")
    feature_names = [f"Feature {i}" for i in range(X_test_scaled.shape[1])]

plt.figure(figsize=(12, 8))
# show=False keeps the figure open so it can be saved below
shap.summary_plot(
    shap_values,
    shap_sample,
    feature_names=feature_names,
    show=False,
    plot_size=(12, 8),
)
plt.tight_layout()
plt.savefig(os.path.join(visualizations_dir, 'shap_summary.png'))
plt.close()

In [120]:

# Save model and artifacts
model_path = os.path.join(run_dir, "random_forest_model.pkl")
pd.to_pickle(best_model, model_path)

# The forest alone cannot score new text: predictions also need the fitted
# scaler, TF-IDF vectorizer, gensim dictionary and LDA model, so persist those
# alongside it in the same run directory.
pd.to_pickle(scaler, os.path.join(run_dir, "scaler.pkl"))
pd.to_pickle(tfidf_vectorizer, os.path.join(run_dir, "tfidf_vectorizer.pkl"))
dictionary.save(os.path.join(run_dir, "lda_dictionary.dict"))
lda_model.save(os.path.join(run_dir, "lda_model.gensim"))

In [None]:
# Record the chosen hyperparameters, evaluation metrics and fitted model in one MLflow run
with mlflow.start_run():
    mlflow.log_params(grid_search.best_params_)
    evaluation_metrics = {
        "mse": mse,
        "rmse": rmse,
        "r2": r2,
        "mae": mae,
        "mape": mape,
    }
    for metric_name, metric_value in evaluation_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(best_model, "model")

In [122]:
# Create a prediction function for new data
def predict_duration(review_text, response_text, start_date):
    """
    Predict duration for new review and response

    Builds the same feature layout used for training (TF-IDF terms, numeric
    text/time features, LDA topic weights) and relies on the fitted
    module-level objects ``tfidf_vectorizer``, ``dictionary``, ``scaler`` and
    ``best_model`` (plus ``lda_model`` via ``get_topic_distribution``).

    Parameters:
    review_text (str): The review text
    response_text (str): The response text
    start_date (str): The start date in format 'YYYY-MM-DD HH:MM:SS'

    Returns:
    float: Predicted duration in hours
    """
    # Preprocess text
    processed_review = preprocess_text(review_text)
    processed_response = preprocess_text(response_text)
    combined_text = processed_review + " " + processed_response

    # Extract text features
    review_feats = extract_text_features(review_text)
    response_feats = extract_text_features(response_text)

    # Create temporary dataframe for time features. add_time_features also
    # reads an 'end_date' column, which is unknown at prediction time, so a
    # NaT placeholder is supplied to avoid a KeyError; none of the time
    # features used below depend on it.
    temp_df = pd.DataFrame({
        'start_date': [pd.to_datetime(start_date)],
        'end_date': [pd.NaT],
    })
    temp_df = add_time_features(temp_df)

    # Transform text using the fitted vectorizer
    text_features = tfidf_vectorizer.transform([combined_text]).toarray()

    # Get topic distribution
    bow = dictionary.doc2bow(combined_text.split())
    topic_dist = get_topic_distribution(bow)

    # Combine all features, in the same column order as the training matrix
    numerical_feats = np.array([
        review_feats['length'], review_feats['word_count'], review_feats['sentiment_score'],
        response_feats['length'], response_feats['word_count'], response_feats['sentiment_score'],
        temp_df['day_of_week'].iloc[0], temp_df['hour_of_day'].iloc[0],
        temp_df['month'].iloc[0], temp_df['is_weekend'].iloc[0]
    ])

    X_new = np.hstack([text_features, numerical_feats.reshape(1, -1), np.array(topic_dist).reshape(1, -1)])
    X_new_scaled = scaler.transform(X_new)

    return best_model.predict(X_new_scaled)[0]

In [None]:
# Replay the sample row captured before preprocessing through the prediction function
example_prediction = predict_duration(
    review_text=sample_data['review'],
    response_text=sample_data['response'],
    start_date=sample_data['start_date'],
)
print(f"\nExample Prediction: {example_prediction:.2f} hours")

# Sanity check against the first row's actual elapsed time, in hours
first_start = pd.to_datetime(df['start_date'].iloc[0])
first_end = pd.to_datetime(df['end_date'].iloc[0])
actual_duration = (first_end - first_start).total_seconds() / 3600
print(f"Actual Duration: {actual_duration:.2f} hours")
print(f"Prediction Error: {abs(actual_duration - example_prediction):.2f} hours")