In [1]:
# Cell 1: Load and Prepare Models and Data
# Locations of the trained_models folder and the feedback CSV.
# Each path can be overridden through an environment variable so the notebook
# runs on machines other than the original author's; when the variable is
# unset the original absolute path is used unchanged.
import os

trained_models_dir = os.environ.get(
    'SENTIMENT_TRAINED_MODELS_DIR',
    '/Users/moazam_a12/Sentiment Analysis of Internship Feedback/trained models',
)
feedback_data_path = os.environ.get(
    'SENTIMENT_FEEDBACK_DATA_PATH',
    '/Users/moazam_a12/Sentiment Analysis of Internship Feedback/data/synthetic_intern_feedback.csv',
)

import os
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch
import pandas as pd

bert_model_path = os.path.join(trained_models_dir, 'bert_model')

# Load the fine-tuned DistilBERT model and tokenizer, then the feedback CSV.
# Every path through this block defines the same five names used further down:
# bert_model, tokenizer, df, feedback_column and feedback_texts.
try:
    if not os.path.exists(bert_model_path):
        raise FileNotFoundError(f"BERT model directory {bert_model_path} does not exist.")
    required_files = ['config.json', 'model.safetensors', 'tokenizer_config.json']
    missing_files = [f for f in required_files if not os.path.exists(os.path.join(bert_model_path, f))]
    if missing_files:
        raise FileNotFoundError(f"Missing files in {bert_model_path}: {missing_files}")
    bert_model = DistilBertForSequenceClassification.from_pretrained(bert_model_path)
    tokenizer = DistilBertTokenizer.from_pretrained(bert_model_path)
    print("DistilBERT model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading DistilBERT model from {bert_model_path}: {e}")
    print("Ensure 'bert_model' directory contains 'config.json', 'model.safetensors', and tokenizer files.")
    # Fallback to a sample text with a freshly loaded tokenizer AND model.
    # Without the model, `bert_model` would be undefined and the device move
    # below would fail with a NameError. Three labels match the
    # Negative/Neutral/Positive mapping used when classifying.
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    bert_model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=3
    )
    print("WARNING: using the base 'distilbert-base-uncased' checkpoint with an untrained "
          "classification head; predictions are placeholders only.")
    feedback_column = 'Feedback'
    feedback_texts = ['Sample feedback for testing.']
    df = pd.DataFrame({feedback_column: feedback_texts})
else:
    try:
        df = pd.read_csv(feedback_data_path)
        possible_columns = ['feedback', 'Feedback', 'text', 'Text', 'comment', 'Comment']
        # First candidate name that exists in the CSV, or None when none match.
        feedback_column = next((col for col in possible_columns if col in df.columns), None)
        if feedback_column is None:
            raise ValueError("No feedback column found. Available columns: " + str(df.columns.tolist()))
        # Blank out missing cells and coerce to str so the tokenizer only sees
        # strings; rows are kept so the list stays aligned with df.
        feedback_texts = df[feedback_column].fillna('').astype(str).tolist()
        print(f"Feedback data loaded successfully from column '{feedback_column}'.")
    except Exception as e:
        print(f"Error loading data from {feedback_data_path}: {e}")
        # Reset feedback_column too: it may be unset (read failure) or None
        # (no matching column), and later code indexes df by it.
        feedback_column = 'Feedback'
        feedback_texts = ['Sample feedback for testing.']
        df = pd.DataFrame({feedback_column: feedback_texts})

def preprocess_bert(texts, max_length=128):
    """Tokenize feedback texts with the module-level ``tokenizer``.

    Args:
        texts: a single string, or an iterable of feedback entries. Entries
            that are ``None`` or a float NaN become the empty string and any
            other non-string entry is converted with ``str()``, so one bad
            row cannot make the whole tokenization call fail.
        max_length: truncation length passed to the tokenizer (default 128).

    Returns:
        Whatever the tokenizer returns when called with ``padding=True``,
        ``truncation=True`` and ``return_tensors='pt'``.
    """
    if not isinstance(texts, str):
        # `t != t` is only true for NaN; it is guarded by the float check so
        # it is never evaluated on arbitrary objects.
        texts = [
            '' if t is None or (isinstance(t, float) and t != t) else str(t)
            for t in texts
        ]
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=max_length)

# Tokenize the feedback; on failure fall back to a single sample sentence.
try:
    bert_inputs = preprocess_bert(feedback_texts)
    print("Feedback texts preprocessed successfully.")
except Exception as e:
    print(f"Error preprocessing texts: {e}")
    # Swap the data together with the inputs. Keeping the original df while
    # tokenizing one sample row would leave df and bert_inputs with different
    # lengths, and assigning predictions back to df would then fail.
    feedback_column = 'Feedback'
    feedback_texts = ['Sample feedback for testing.']
    df = pd.DataFrame({feedback_column: feedback_texts})
    bert_inputs = preprocess_bert(feedback_texts)

# Prefer CUDA, then Apple MPS, then CPU. The getattr guard covers torch builds
# that do not expose `torch.backends.mps`.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
bert_model.to(device)
bert_inputs = {k: v.to(device) for k, v in bert_inputs.items()}
print(f"Model and inputs moved to {device}.")

DistilBERT model and tokenizer loaded successfully.
Feedback data loaded successfully from column 'Feedback'.
Feedback texts preprocessed successfully.
Model and inputs moved to mps.


In [3]:
# Cell 2: Classify Feedback and Show Sentiment Trends
from tqdm import tqdm

def classify_batch(inputs, batch_size=64):
    """Predict a sentiment label for every row of the tokenized inputs.

    Rows are passed to the module-level ``bert_model`` in slices of
    ``batch_size`` with gradient tracking disabled. The arg-max over the
    logits is mapped to 'Negative' (0), 'Neutral' (1) or 'Positive' (2).

    Args:
        inputs: mapping whose values can be sliced along the first axis; it
            must contain the key 'input_ids'.
        batch_size: number of rows per forward pass.

    Returns:
        A list with one label string per row. If anything raises during
        classification, the error is printed and a list of 'Neutral' of the
        same length is returned instead.
    """
    id_to_label = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    try:
        class_ids = []
        row_count = len(inputs['input_ids'])
        total_batches = (row_count + batch_size - 1) // batch_size
        batch_starts = range(0, row_count, batch_size)
        for start in tqdm(batch_starts, total=total_batches, desc="Classifying"):
            stop = start + batch_size
            chunk = {name: values[start:stop] for name, values in inputs.items()}
            with torch.no_grad():
                result = bert_model(**chunk)
            winners = torch.argmax(result.logits, dim=1)
            class_ids.extend(winners.cpu().numpy())
        return [id_to_label[class_id] for class_id in class_ids]
    except Exception as e:
        print(f"Error during classification: {e}")
        return ['Neutral'] * len(inputs['input_ids'])

# Store one predicted label per feedback row alongside the original data.
df['predicted_sentiment'] = classify_batch(bert_inputs)

# Share of each label as a percentage of all rows, most frequent first.
sentiment_dist = df['predicted_sentiment'].value_counts(normalize=True).mul(100)
print("Sentiment Trends:")
for sentiment, percentage in zip(sentiment_dist.index, sentiment_dist.values):
    print(f"{sentiment}: {percentage:.1f}%")

Classifying: 100%|██████████████████████████| 1563/1563 [14:10<00:00,  1.84it/s]

Sentiment Trends:
Neutral: 34.0%
Positive: 33.0%
Negative: 33.0%





In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

def extract_negative_themes(df, feedback_column, num_reviews=10, use_all_feedback=False, random_state=None):
    """Print a sample of negative reviews and the top TF-IDF themes among them.

    Args:
        df: DataFrame with a 'predicted_sentiment' column and the feedback
            text in ``feedback_column``.
        feedback_column: name of the column holding the feedback text.
        num_reviews: number of negative reviews to sample and print. It is
            capped at the number of negative reviews available, so asking
            for more than exist no longer raises.
        use_all_feedback: when True, themes are computed over every negative
            review; when False, only over the sampled reviews.
        random_state: optional seed forwarded to ``Series.sample`` to make
            the sampled reviews reproducible. None keeps unseeded sampling.

    Returns:
        A list of up to five ``(theme, score)`` tuples ordered by summed
        TF-IDF score. Returns ``[('no themes found', 0)]`` when there is no
        negative feedback and ``[('no themes found', 1)]`` when theme
        extraction raises.
    """
    is_negative = df['predicted_sentiment'] == 'Negative'
    # Drop missing entries and coerce the rest to str: the preview below
    # slices and measures each text, and the vectorizer expects strings.
    negative_feedback = df.loc[is_negative, feedback_column].dropna().astype(str)
    num_negative_reviews = len(negative_feedback)
    print(f"Number of negative reviews: {num_negative_reviews}")

    if num_negative_reviews == 0:
        print("No negative feedback found. Cannot extract themes.")
        return [('no themes found', 0)]

    # Sampling without replacement cannot draw more rows than exist.
    sample_size = max(0, min(int(num_reviews), num_negative_reviews))
    selected_feedback = negative_feedback.sample(n=sample_size, random_state=random_state).tolist()

    print(f"\n{sample_size} Randomly Selected Negative Feedback Entries:")
    for i, text in enumerate(selected_feedback, 1):
        print(f"Review {i}: {text[:150]}..." if len(text) > 150 else f"Review {i}: {text}")

    try:
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        custom_stop_words = list(ENGLISH_STOP_WORDS) + [
            'intern', 'internship', 'was', 'were', 'felt', 'like', 'it', 'and',
            'there', 'any', 'despite', 'hardly', 'left', 'rather', 'than', 'job',
            'sometimes', 'least', 'english'
        ]

        feedback_to_analyze = negative_feedback if use_all_feedback else selected_feedback
        vectorizer_themes = TfidfVectorizer(
            stop_words=custom_stop_words,
            ngram_range=(1, 3),
            # A higher document-frequency floor is applied only when the full
            # negative set is analysed.
            min_df=1 if not use_all_feedback else 10,
            max_df=0.8
        )
        X = vectorizer_themes.fit_transform(feedback_to_analyze)
        feature_names = vectorizer_themes.get_feature_names_out()
        # Sum each term's TF-IDF weight over all documents into a flat array.
        tfidf_scores = np.asarray(X.sum(axis=0)).ravel()
        top_themes = Counter(dict(zip(feature_names, tfidf_scores))).most_common(5)
    except Exception as e:
        print(f"Error extracting themes: {e}")
        top_themes = [('no themes found', 1)]

    scope = "All Negative Feedback" if use_all_feedback else f"Selected {sample_size} Negative Feedback"
    print(f"\nInsufficient Aspects in {scope}:")
    for theme, score in top_themes:
        formatted_theme = ' '.join(word.capitalize() for word in theme.split())
        print(f"{formatted_theme} (TF-IDF score: {score:.2f})")

    return top_themes

# num_reviews sets how many sampled negative reviews are printed.
# use_all_feedback=True derives themes from every negative review;
# False restricts the theme analysis to the sampled reviews.
top_themes = extract_negative_themes(
    df,
    feedback_column,
    num_reviews=10,
    use_all_feedback=True,
)

Number of negative reviews: 33000

10 Randomly Selected Negative Feedback Entries:
Review 1: My role was ill-defined, leading to constant confusion and frustration. I learned to manage my time effectively under pressure. I appreciated the free...
Review 2: The onboarding process was poorly structured and left me confused for weeks. At least the networking opportunities were abundant. I appreciated the fr...
Review 3: The onboarding process was poorly structured and left me confused for weeks. The office environment was vibrant and welcoming. At least the networking...
Review 4: There was a general lack of guidance, and feedback was either delayed or missing. Remote work made collaboration tricky at times. At least the network...
Review 5: There was a general lack of guidance, and feedback was either delayed or missing. I appreciated the freedom to explore new ideas, although guidance wa...
Review 6: I faced technical challenges that were ignored by my supervisors. I learned to manage m