<a href="https://colab.research.google.com/github/natanaelwgm/2025w-PromedUI-NLPCC-Ganjil20242025/blob/main/nlpcc25_livecoding_week3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from google.colab import files
import io

In [2]:


# Step 2: Upload your Excel file
print("Please upload your Excel training file (e.g., 'training_data.xlsx')")
print("The file should have two columns: 'text' and 'label'")
uploaded = files.upload()

# Check if a file was uploaded
if not uploaded:
    print("\nNo file uploaded. Please run the cell again and upload a file.")
else:
    # Get the filename of the uploaded file
    filename = next(iter(uploaded))
    print(f"\nUploaded file: '{filename}'")

    try:
        # Step 3: Load the data from the Excel file
        # We use io.BytesIO to read the uploaded file content directly
        df = pd.read_excel(io.BytesIO(uploaded[filename]))
        print("\nFirst 5 rows of your data:")
        print(df.head())

        # Ensure the required columns exist
        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError("Excel file must contain 'text' and 'label' columns.")

        # Drop rows where 'text' or 'label' is NaN, just in case
        df.dropna(subset=['text', 'label'], inplace=True)

        # Convert 'text' column to string type to avoid issues with non-string data
        df['text'] = df['text'].astype(str)


        # Step 4: Prepare the data
        X_train = df['text']  # The text data
        y_train = df['label'] # The corresponding labels

        print(f"\nFound {len(df)} training samples.")
        print(f"Unique labels found: {y_train.unique()}")

        # Step 5: Create and train the Naive Bayes model
        # We'll use a pipeline to combine CountVectorizer and MultinomialNB
        # CountVectorizer settings:
        # - lowercase=True: Converts all text to lowercase (as requested)
        # - analyzer='word': Splits text into words (as requested)
        model = make_pipeline(
            CountVectorizer(lowercase=True, analyzer='word'),
            MultinomialNB()
        )

        print("\nTraining the Naive Bayes model...")
        model.fit(X_train, y_train)
        print("Model training complete!")

        # Step 6: Prediction section
        print("\n--- Text Prediction ---")
        print("Type your text below and press Enter to get a sentiment prediction.")
        print("Type 'quit' or 'exit' to stop.")

        while True:
            new_text = input("\nEnter text to classify: ")
            if new_text.lower() in ['quit', 'exit']:
                print("Exiting prediction mode.")
                break
            if not new_text.strip():
                print("Please enter some text.")
                continue

            # The model's predict method expects a list or iterable of texts
            prediction = model.predict([new_text])
            # model.predict_proba can give you probabilities for each class
            probabilities = model.predict_proba([new_text])

            print(f"Predicted Label: {prediction[0]}")

            # Display probabilities for each class
            print("Probabilities:")
            for i, class_label in enumerate(model.classes_):
                print(f"  {class_label}: {probabilities[0][i]:.4f}")

    except FileNotFoundError:
        print(f"Error: File '{filename}' not found after upload. This shouldn't happen with google.colab.files.upload().")
    except ValueError as ve:
        print(f"ValueError: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Please ensure your Excel file is formatted correctly with 'text' and 'label' columns.")

Please upload your Excel training file (e.g., 'training_data.xlsx')
The file should have two columns: 'text' and 'label'


Saving text_training.xlsx to text_training.xlsx

Uploaded file: 'text_training.xlsx'

First 5 rows of your data:
                                             text     label
0  Bogor adalah kota hujan yang jelek ancur lebur  Negative
1                         Bajunya bagus banget :)  Positive
2                          Soto ayam enak guys…..  Positive
3         Saya pergi ke Bandung dan kureng banget  Negative
4            Batam adalah kota yang sangat sempit  Negative

Found 6 training samples.
Unique labels found: ['Negative' 'Positive']

Training the Naive Bayes model...
Model training complete!

--- Text Prediction ---
Type your text below and press Enter to get a sentiment prediction.
Type 'quit' or 'exit' to stop.

Enter text to classify: Baru makan di kafe dan enak, bagus, lightingnya oke
Predicted Label: Positive
Probabilities:
  Negative: 0.1107
  Positive: 0.8893

Enter text to classify: Baju jelek sender mengesalkan
Predicted Label: Negative
Probabilities:
  Negative: 0.6269


KeyboardInterrupt: Interrupted by user

In [3]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from google.colab import files # Assuming you're still in Colab
import io
import numpy as np

# --- MODIFIED Function to show Naive Bayes Trace (More Concise) ---
def show_naive_bayes_trace_concise(text_input, model_pipeline):
    vectorizer = model_pipeline.named_steps['countvectorizer']
    nb_classifier = model_pipeline.named_steps['multinomialnb']

    print(f"\n--- Concise Naive Bayes Trace for: \"{text_input}\" ---")

    # 1. Preprocess and tokenize the input text
    analyzer = vectorizer.build_analyzer()
    tokens = analyzer(text_input) # These are the tokens after preprocessing

    if not tokens:
        print("1. Tokens extracted: (No tokens extracted or all tokens are stop words/too short).")
        # If no tokens, prediction will likely be based on priors or a default
        # We can still show how priors would lead to the prediction
        # This part might need more specific handling if NB errors on empty transformed input
    else:
        print(f"1. Tokens extracted (after lowercase, etc.): {tokens}")

    # Identify words from input not in vocabulary
    vocab = vectorizer.vocabulary_
    input_vocab_words = [token for token in tokens if token in vocab]
    input_oov_words = list(set([token for token in tokens if token not in vocab])) # Unique OOV words

    if not input_vocab_words and tokens: # Tokens exist, but none are in vocab
        print(f"   WARNING: None of these tokens were found in the model's training vocabulary.")
        print(f"   Out-of-vocabulary words found: {input_oov_words if input_oov_words else '(None)'}")
        # Prediction will be based purely on class priors.
    elif input_oov_words:
        print(f"   Words from input NOT in training vocabulary (ignored by model): {input_oov_words}")

    if not input_vocab_words and not tokens: # No tokens extracted at all
        pass # Already handled
    elif not input_vocab_words and tokens: # Tokens extracted, but all OOV
        print("   Scores will be based on class priors only as no vocabulary words from input contribute.")
    else: # Some tokens are in vocab
         print(f"   Vocabulary words from input contributing to scores: {list(set(input_vocab_words))}")


    # Transform the text to get the feature vector (counts of known words)
    X_transformed = vectorizer.transform([text_input])
    # This is log P(Class) + sum log P(word_i | Class) * count(word_i) for known words
    # Or more precisely: X_transformed @ nb_classifier.feature_log_prob_.T + nb_classifier.class_log_prior_
    model_internal_scores = nb_classifier._joint_log_likelihood(X_transformed)[0]

    all_class_scores_manual = {}
    print("\n2. Score Calculation per Class:")
    print("   Formula: Score(Class) = log P(Class) [Prior] + Σ log P(token_i | Class) [Likelihood Sum]")
    print("   (where token_i are occurrences of tokens from input found in vocabulary)\n")

    for i, class_label in enumerate(nb_classifier.classes_):
        # a) Log Prior Probability of the class: log P(Class)
        log_P_class = nb_classifier.class_log_prior_[i]

        # b) Sum of Log Conditional Probabilities of words: sum(log P(word | Class))
        sum_log_P_word_given_class = 0.0

        # Iterate through the tokens found in the *input text*.
        # For each token, if it's in the vocabulary, we add its log P(word | Class).
        for token in tokens: # Iterate over all tokens, including duplicates
            if token in vocab:
                token_idx = vocab[token]
                log_P_token_given_class = nb_classifier.feature_log_prob_[i, token_idx]
                sum_log_P_word_given_class += log_P_token_given_class

        manual_score_for_class = log_P_class + sum_log_P_word_given_class
        all_class_scores_manual[class_label] = manual_score_for_class

        print(f"   Class: \"{class_label}\"")
        print(f"     log P({class_label}) [Prior]                       = {log_P_class:.4f}")
        print(f"     Σ log P(token | {class_label}) [Likelihood Sum]  = {sum_log_P_word_given_class:.4f}")
        print(f"     ---------------------------------------------------")
        print(f"     Calculated Score                              = {manual_score_for_class:.4f}")

        model_calculated_score_for_class = model_internal_scores[i]
        # Minor diffs are okay due to float precision or if X_transformed is all zeros (sklearn might handle differently)
        if not np.isclose(manual_score_for_class, model_calculated_score_for_class):
             print(f"     Model Internal Score (for verification)     = {model_calculated_score_for_class:.4f} [Note potential minor diff]")
        print("-" * 20)

    print("\n3. Detailed Word Log Probabilities for Vocabulary Words in Input:")
    if not input_vocab_words:
        print("   (No vocabulary words from input to detail)")
    else:
        # Display log P(word|Class) only for unique words from input that are in vocab
        unique_input_vocab_words = sorted(list(set(input_vocab_words)))
        header = "   {:<15}".format("Word")
        for class_label in nb_classifier.classes_:
            header += " | log P(W|{:<5})".format(str(class_label)[:5]) # Truncate class name if too long
        print(header)
        print("   " + "-" * (len(header)-3))

        for word in unique_input_vocab_words:
            token_idx = vocab[word]
            row_str = "   {:<15}".format(word)
            for i, class_label in enumerate(nb_classifier.classes_):
                log_prob = nb_classifier.feature_log_prob_[i, token_idx]
                row_str += " | {:<14.4f}".format(log_prob)
            print(row_str)

    print("\n4. Final Prediction based on highest score:")
    predicted_class_idx = np.argmax(model_internal_scores) # Using model's scores
    predicted_class_label = nb_classifier.classes_[predicted_class_idx]
    print(f"   The class with the highest score is: \"{predicted_class_label}\" (Score: {model_internal_scores[predicted_class_idx]:.4f})")

# --- Main script part (assuming previous steps are similar) ---
# Step 2: Upload your Excel file (keep as is)
print("Please upload your Excel training file (e.g., 'training_data.xlsx')")
print("The file should have two columns: 'text' and 'label'")
uploaded = files.upload()

if not uploaded:
    print("\nNo file uploaded. Please run the cell again and upload a file.")
else:
    filename = next(iter(uploaded))
    print(f"\nUploaded file: '{filename}'")

    try:
        df = pd.read_excel(io.BytesIO(uploaded[filename]))
        print("\nFirst 5 rows of your data:")
        print(df.head())

        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError("Excel file must contain 'text' and 'label' columns.")

        df.dropna(subset=['text', 'label'], inplace=True)
        df['text'] = df['text'].astype(str)

        X_train = df['text']
        y_train = df['label']

        print(f"\nFound {len(df)} training samples.")
        print(f"Unique labels found: {y_train.unique()}")

        model = make_pipeline(
            CountVectorizer(lowercase=True, analyzer='word'),
            MultinomialNB()
        )

        print("\nTraining the Naive Bayes model...")
        model.fit(X_train, y_train)
        print("Model training complete!")

        print("\n--- Text Prediction ---")
        print("Type your text below and press Enter to get a sentiment prediction.")
        print("Type 'quit' or 'exit' to stop.")

        while True:
            new_text = input("\nEnter text to classify (or 'quit'/'exit'): ")
            if new_text.lower() in ['quit', 'exit']:
                print("Exiting prediction mode.")
                break
            if not new_text.strip():
                print("Please enter some text.")
                continue

            prediction = model.predict([new_text])
            probabilities = model.predict_proba([new_text])

            print(f"\n--- Prediction for: \"{new_text}\" ---")
            print(f"Overall Predicted Label: {prediction[0]}")

            print("\nNormalized Probabilities (P(Class | Text)):")
            for i, class_label in enumerate(model.classes_):
                print(f"  {class_label}: {probabilities[0][i]:.4f}")

            # Use the concise trace function
            show_naive_bayes_trace_concise(new_text, model) # MODIFIED HERE

    except FileNotFoundError:
        print(f"Error: File '{filename}' not found after upload.")
    except ValueError as ve:
        print(f"ValueError: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Please ensure your Excel file is formatted correctly with 'text' and 'label' columns.")

Please upload your Excel training file (e.g., 'training_data.xlsx')
The file should have two columns: 'text' and 'label'


Saving text_training.xlsx to text_training (2).xlsx

Uploaded file: 'text_training (2).xlsx'

First 5 rows of your data:
                                             text     label
0  Bogor adalah kota hujan yang jelek ancur lebur  Negative
1                         Bajunya bagus banget :)  Positive
2                          Soto ayam enak guys…..  Positive
3         Saya pergi ke Bandung dan kureng banget  Negative
4            Batam adalah kota yang sangat sempit  Negative

Found 6 training samples.
Unique labels found: ['Negative' 'Positive']

Training the Naive Bayes model...
Model training complete!

--- Text Prediction ---
Type your text below and press Enter to get a sentiment prediction.
Type 'quit' or 'exit' to stop.

Enter text to classify (or 'quit'/'exit'): Baju jelek sender mengesalkan

--- Prediction for: "Baju jelek sender mengesalkan" ---
Overall Predicted Label: Negative

Normalized Probabilities (P(Class | Text)):
  Negative: 0.6269
  Positive: 0.3731

--- Concise Na

In [2]:
# 1. Install vaderSentiment library
!pip install vaderSentiment

# 2. Import necessary components and clear previous output
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from IPython.display import clear_output
import time

# Give a moment for pip install to complete and then clear the output
time.sleep(1)
clear_output()

print("VADER Sentiment Analyzer is Ready!")
print("===================================")

# 3. Create an input field for text
text_to_analyze = input("Enter the text you want to analyze: ")

# 4. Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# 5. Get the sentiment polarity scores
# The polarity_scores method returns a dictionary with 'neg', 'neu', 'pos', and 'compound' scores.
# - neg: Negative score
# - neu: Neutral score
# - pos: Positive score
# - compound: A normalized, weighted composite score ranging from -1 (most extreme negative) to +1 (most extreme positive).
vs = analyzer.polarity_scores(text_to_analyze)
compound_score = vs['compound']

print("\n--- Sentiment Analysis Result ---")
print(f"Input Text: \"{text_to_analyze}\"")
print(f"VADER Scores: {vs}")
print(f"Compound Score: {compound_score:.4f}")

# 6. Predict positive or negative based on the compound score
# Typical thresholds (can be adjusted):
# - positive sentiment: compound score >= 0.05
# - neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
# - negative sentiment: compound score <= -0.05
if compound_score >= 0.05:
    sentiment_prediction = "Positive"
elif compound_score <= -0.05:
    sentiment_prediction = "Negative"
else:
    sentiment_prediction = "Neutral"

print(f"Predicted Sentiment: {sentiment_prediction}")
print("===================================")

VADER Sentiment Analyzer is Ready!
Enter the text you want to analyze: goddamn this place so bad

--- Sentiment Analysis Result ---
Input Text: "goddamn this place so bad"
VADER Scores: {'neg': 0.717, 'neu': 0.283, 'pos': 0.0, 'compound': -0.822}
Compound Score: -0.8220
Predicted Sentiment: Negative
