<a href="https://colab.research.google.com/github/makhlufiaero338/tugas-machine-learning/blob/main/tugasperbaikan/Tugas_perbaikan_bab8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.validation import check_is_fitted
import nltk
from nltk.corpus import movie_reviews
# Fetch the movie_reviews corpus into the local NLTK data directory so the
# `movie_reviews` corpus reader imported above has files to read.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
# Load IMDB dataset from nltk
def load_imdb_data():
    """Assemble the NLTK ``movie_reviews`` corpus into a labelled DataFrame.

    Every file in the ``pos`` category is read first, followed by every file
    in the ``neg`` category, so the returned rows are grouped by class.

    Returns:
        pandas.DataFrame with two columns: ``review`` (the raw text of one
        corpus file) and ``sentiment`` (``'positive'`` or ``'negative'``).
    """
    # Corpus category -> human-readable label, in the order rows are emitted.
    category_labels = (('pos', 'positive'), ('neg', 'negative'))
    rows = [
        (movie_reviews.raw(file_id), label)
        for category, label in category_labels
        for file_id in movie_reviews.fileids(category)
    ]
    return pd.DataFrame({
        "review": [text for text, _ in rows],
        "sentiment": [label for _, label in rows],
    })

# Load the corpus once and report how many reviews were read.
data = load_imdb_data()
n_samples = len(data)
print(f"Dataset loaded: {n_samples} samples")

Dataset loaded: 2000 samples


In [None]:
# Hold out 30% of the reviews for testing; the fixed seed keeps the split repeatable.
reviews, sentiments = data['review'], data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    sentiments,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Humans in the Loop
# Show the first training review (truncated to 200 characters) next to its
# label so a person can sanity-check that text and sentiment agree.
first_review = X_train.iloc[0]
first_label = y_train.iloc[0]

print("\nSample for manual review:")
print(first_review[:200], "...\n")
print(f"Label: {first_label}")


Sample for manual review:
note : some may consider portions of the following text to be spoilers . 
be forewarned . 
 " all the world's a stage and all the men and women merely players they have their exits and their entrances ...

Label: positive


In [None]:
# From Prototype to Production
# Word counts (English stop words removed) -> TF-IDF weighting -> multinomial
# Naive Bayes, chained in one Pipeline object that is fitted and used as a unit.
nb_steps = [
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
]
pipeline = Pipeline(steps=nb_steps)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

print("\nClassification Report (Naive Bayes):")
nb_report = classification_report(y_test, y_pred)
print(nb_report)


Classification Report (Naive Bayes):
              precision    recall  f1-score   support

    negative       0.77      0.84      0.81       298
    positive       0.83      0.75      0.79       302

    accuracy                           0.80       600
   macro avg       0.80      0.80      0.80       600
weighted avg       0.80      0.80      0.80       600



In [None]:
# Testing Production Systems
# A simple test function
def test_pipeline(pipeline, sample_text):
    """Test pipeline with sample input."""
    try:
        prediction = pipeline.predict([sample_text])
        return prediction[0]
    except Exception as e:
        print(f"Error during prediction: {e}")

# Run one clearly positive review through the fitted pipeline as a smoke test.
sample_review = "The movie was amazing, the performances were stellar, and I loved every minute of it."
test_result = test_pipeline(pipeline, sample_review)
print("\nTest Result:", test_result)


Test Result: positive


In [None]:
# Building Your Own Estimator
class TextLengthExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each input item to its length.

    Intended for raw text: each document becomes a single numeric feature,
    its character count as given by ``len``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; present for the estimator API.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            This transformer, unchanged.
        """
        return self

    def transform(self, X):
        """Compute ``len(item)`` for every item of ``X``.

        Args:
            X: Iterable of sized items (e.g. strings).

        Returns:
            Integer ``numpy.ndarray`` of shape ``(n_samples, 1)``.
        """
        lengths = np.array([len(text) for text in X], dtype=int)
        # Reshape explicitly so the result stays 2-D with an integer dtype even
        # when X is empty; a nested-list build would collapse to a 1-D float
        # array of shape (0,) in that case.
        return lengths.reshape(-1, 1)


In [None]:
# Second pipeline: the custom length extractor feeds a logistic regression,
# so the only feature the classifier sees is the length of each input item.
custom_steps = [
    ('text_length', TextLengthExtractor()),
    ('classifier', LogisticRegression()),
]
pipeline_custom = Pipeline(steps=custom_steps)

In [None]:
# Length feature computed on its own, kept only for inspection.
X_train_length = TextLengthExtractor().transform(X_train)

# Fit on the RAW reviews: the pipeline's first step already converts text to
# lengths. Feeding it the pre-computed (n, 1) array would make that step take
# len() of each one-element row, producing a constant feature of 1 for every
# sample and a classifier that cannot learn anything.
pipeline_custom.fit(X_train, y_train)

print("\nCustom Estimator Pipeline trained on text length.")


Custom Estimator Pipeline trained on text length.


In [None]:
# Neural Networks with TensorFlow/Keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization for NN
# Vocabulary is capped at `max_words` and learned from the training reviews
# only; every review is then padded/truncated to `max_len` token ids.
max_words = 10_000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer token ids, for both splits.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Token-id lists -> fixed-width arrays of length `max_len`.
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_test_seq)
)

# Define the model: 50-dimensional word embeddings averaged over the sequence,
# then a small dense layer and a sigmoid output for binary sentiment.
nn_model = Sequential([
    # No `input_length`: the argument is deprecated in Keras 3 (ignored with a
    # warning), and the sequence length is taken from the padded input itself.
    Embedding(max_words, 50),
    GlobalAveragePooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output unit.
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [None]:
# Train the model
# The network needs numeric targets: 'positive' -> 1, anything else -> 0.
y_train_binary = (y_train == 'positive').astype(int)
y_test_binary = (y_test == 'positive').astype(int)

nn_model.fit(
    X_train_pad,
    y_train_binary,
    epochs=3,
    batch_size=32,
    validation_split=0.2,
)

# Evaluate the model on the held-out test split.
nn_loss, nn_accuracy = nn_model.evaluate(X_test_pad, y_test_binary)
print(f"\nNeural Network Accuracy: {nn_accuracy:.2f}")

Epoch 1/3
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 26ms/step - accuracy: 0.5281 - loss: 0.6925 - val_accuracy: 0.5000 - val_loss: 0.6908
Epoch 2/3
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6813 - loss: 0.6853 - val_accuracy: 0.5893 - val_loss: 0.6850
Epoch 3/3
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.7295 - loss: 0.6668 - val_accuracy: 0.7429 - val_loss: 0.6676
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7483 - loss: 0.6632

Neural Network Accuracy: 0.74


In [None]:
# Summary
# Recap of the notebook's sections, printed one line at a time.
summary_lines = (
    "\nSummary:",
    "1. Humans in the Loop demonstrated manual review.",
    "2. Production pipeline built and tested using Naive Bayes.",
    "3. Custom estimator incorporated into a pipeline.",
    "4. Neural network model trained and evaluated.",
)
for summary_line in summary_lines:
    print(summary_line)


Summary:
1. Humans in the Loop demonstrated manual review.
2. Production pipeline built and tested using Naive Bayes.
3. Custom estimator incorporated into a pipeline.
4. Neural network model trained and evaluated.
