<a href="https://colab.research.google.com/github/natanaelwgm/2025w-PromedUI-NLPCC-Ganjil20242025/blob/main/nlpcc25_livecoding_week3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from google.colab import files
import io

In [None]:


# Step 2: Upload your Excel file
# Purpose of this cell: ask for an Excel workbook, train a bag-of-words
# Naive Bayes classifier on its 'text' / 'label' columns, then classify text
# typed by the user until they ask to stop.
# Top-level names left behind for later cells: uploaded, filename, df,
# X_train, y_train, model.
print("Please upload your Excel training file (e.g., 'training_data.xlsx')")
print("The file should have two columns: 'text' and 'label'")
uploaded = files.upload()

# Check if a file was uploaded
if not uploaded:
    print("\nNo file uploaded. Please run the cell again and upload a file.")
else:
    # Only the first entry of the upload result is used; any further files
    # selected in the same upload are ignored.
    filename = next(iter(uploaded))
    print(f"\nUploaded file: '{filename}'")

    try:
        # Step 3: Load the data from the Excel file
        # The uploaded bytes are wrapped in io.BytesIO so pandas can read the
        # workbook from memory.
        df = pd.read_excel(io.BytesIO(uploaded[filename]))
        print("\nFirst 5 rows of your data:")
        print(df.head())

        # Ensure the required columns exist
        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError("Excel file must contain 'text' and 'label' columns.")

        # Drop rows where 'text' or 'label' is missing, then coerce 'text' to
        # str so numeric or date cells do not break the vectorizer. Done as a
        # single chain that builds a new frame instead of mutating in place.
        df = (
            df.dropna(subset=['text', 'label'])
              .assign(text=lambda frame: frame['text'].astype(str))
        )

        # Fail early with a readable message: without this guard an empty
        # frame would only be rejected later, inside model.fit().
        if df.empty:
            raise ValueError(
                "No training samples remain after removing rows with a missing 'text' or 'label'."
            )

        # Step 4: Prepare the data
        X_train = df['text']  # The text data
        y_train = df['label'] # The corresponding labels

        print(f"\nFound {len(df)} training samples.")
        print(f"Unique labels found: {y_train.unique()}")

        # Step 5: Create and train the Naive Bayes model
        # A pipeline chains the two steps so raw strings can be passed to
        # fit/predict directly.
        # CountVectorizer settings:
        # - lowercase=True: Converts all text to lowercase
        # - analyzer='word': Builds features from words
        model = make_pipeline(
            CountVectorizer(lowercase=True, analyzer='word'),
            MultinomialNB()
        )

        print("\nTraining the Naive Bayes model...")
        model.fit(X_train, y_train)
        print("Model training complete!")

        # Step 6: Prediction section
        print("\n--- Text Prediction ---")
        print("Type your text below and press Enter to get a sentiment prediction.")
        print("Type 'quit' or 'exit' to stop.")

        while True:
            try:
                new_text = input("\nEnter text to classify: ")
            except (EOFError, KeyboardInterrupt):
                # Closing stdin or interrupting the cell ends the session
                # cleanly instead of surfacing a traceback.
                print("\nExiting prediction mode.")
                break

            # Normalise once so surrounding whitespace or capitalisation does
            # not prevent the stop words from being recognised.
            command = new_text.strip().lower()
            if command in ('quit', 'exit'):
                print("Exiting prediction mode.")
                break
            if not command:
                print("Please enter some text.")
                continue

            # The model's predict method expects a list or iterable of texts
            prediction = model.predict([new_text])
            # predict_proba returns one row per input; columns are indexed in
            # the same order as model.classes_ in the loop below.
            probabilities = model.predict_proba([new_text])

            print(f"Predicted Label: {prediction[0]}")

            # Display probabilities for each class
            print("Probabilities:")
            for i, class_label in enumerate(model.classes_):
                print(f"  {class_label}: {probabilities[0][i]:.4f}")

    except ValueError as ve:
        # Covers the explicit validation errors raised above as well as any
        # ValueError raised by pandas or scikit-learn.
        print(f"ValueError: {ve}")
    except Exception as e:
        # Deliberate catch-all: report the problem instead of a traceback.
        print(f"An unexpected error occurred: {e}")
        print("Please ensure your Excel file is formatted correctly with 'text' and 'label' columns.")