# Lesson 5: Advanced ML and Natural Language Processing

Explore advanced machine learning topics including NLP, ensemble methods, and model optimization.

## What You'll Learn
- Ensemble learning methods
- Natural Language Processing (NLP) basics
- Text classification and sentiment analysis
- Word embeddings and transformers
- Hyperparameter tuning
- Model deployment considerations

## Ensemble Learning

Combining multiple models for better predictions:
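- **Bagging**: Train models independently on random (bootstrap) subsets of the data and aggregate their predictions (Random Forest)
- **Boosting**: Train models sequentially, each one learning from the mistakes of the previous ones (XGBoost, AdaBoost)
- **Stacking**: Combine the predictions of several models with a meta-learner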

In [None]:
# pip install scikit-learn xgboost

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import numpy as np

# Build a synthetic classification dataset: 1000 samples with 20 features,
# of which 15 are informative and 5 are redundant. The fixed seed keeps the
# data identical between runs.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42,
)

# Hold out 20% of the samples as a test set (also seeded).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Dataset: {len(X_train)} training, {len(X_test)} test samples")

### Random Forest (Bagging)

In [None]:
# Train Random Forest: 100 trees, each capped at depth 10, seeded for
# reproducibility.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out test set.
rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy: {rf_acc * 100:.2f}%")

# Feature importance: pair every feature index with its importance score and
# keep the five highest. enumerate() takes the indices from the fitted model
# itself instead of a hard-coded range(20), so no feature is silently dropped
# from the ranking if the dataset's feature count changes.
print("\nTop 5 Important Features:")
feature_importance = sorted(
    enumerate(rf_model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)[:5]
for feat, importance in feature_importance:
    print(f"  Feature {feat}: {importance:.4f}")

### Gradient Boosting

In [None]:
# Both boosting implementations are configured with the same hyperparameters
# so that their test accuracies can be compared like-for-like.
boosting_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}

# Gradient Boosting (scikit-learn implementation)
gb_model = GradientBoostingClassifier(**boosting_params)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_acc = accuracy_score(y_test, gb_pred)
print(f"Gradient Boosting Accuracy: {gb_acc * 100:.2f}%")

# XGBoost (optimized gradient boosting)
xgb_model = xgb.XGBClassifier(**boosting_params)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_pred)
print(f"XGBoost Accuracy: {xgb_acc * 100:.2f}%")

### Voting Ensemble

In [None]:
# Base learners for the ensemble, each registered under a short name.
base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
]

# Soft voting averages the predicted class probabilities of the base learners.
voting_model = VotingClassifier(estimators=base_estimators, voting='soft')
voting_model.fit(X_train, y_train)

voting_pred = voting_model.predict(X_test)
voting_acc = accuracy_score(y_test, voting_pred)
print(f"\nVoting Ensemble Accuracy: {voting_acc * 100:.2f}%")

# Side-by-side summary of every model trained so far; rf_acc, gb_acc and
# xgb_acc come from the earlier cells.
accuracy_by_model = {
    "Random Forest": rf_acc,
    "Gradient Boosting": gb_acc,
    "XGBoost": xgb_acc,
    "Voting Ensemble": voting_acc,
}
print("\nComparison:")
for model_name, model_acc in accuracy_by_model.items():
    print(f"  {model_name}: {model_acc * 100:.2f}%")

## Natural Language Processing

### Text Preprocessing and Vectorization

In [None]:
# pip install nltk

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import re

# Sample text data: each movie review is paired with its sentiment label
# (1=positive, 0=negative) so the two can never drift out of alignment.
labeled_samples = [
    ("This movie was absolutely amazing! Best film I've ever seen.", 1),
    ("Terrible movie, waste of time and money. Very disappointing.", 0),
    ("Great acting and plot. Highly recommend this movie!", 1),
    ("Boring and predictable. Would not watch again.", 0),
    ("Excellent cinematography and soundtrack. A masterpiece!", 1),
    ("Worst movie ever. Don't waste your time.", 0),
]

# Split the pairs back into the parallel lists used by the rest of the cell.
texts = [sample_text for sample_text, _ in labeled_samples]
labels = [sample_label for _, sample_label in labeled_samples]

def preprocess_text(text: str) -> str:
    """Normalize a piece of text for vectorization.

    The text is lowercased first, then every character that is neither an
    ASCII letter nor whitespace (digits, punctuation, apostrophes, accented
    letters, ...) is deleted. Whitespace is left untouched, so removing a
    token such as a number can leave two adjacent spaces behind.

    Args:
        text: The raw input string.

    Returns:
        The lowercased, letters-and-whitespace-only version of ``text``.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Preprocess: normalize every sample review before vectorization.
processed_texts = [preprocess_text(text) for text in texts]

# TF-IDF Vectorization, with the vocabulary capped at 100 terms.
tfidf = TfidfVectorizer(max_features=100)
X_tfidf = tfidf.fit_transform(processed_texts)

print("TF-IDF Matrix shape:", X_tfidf.shape)

# get_feature_names_out() lists the vocabulary in alphabetical order, so
# slicing it directly would show the first ten terms alphabetically rather
# than the most important ones. Rank the terms by their total TF-IDF weight
# summed over all documents instead.
feature_names = tfidf.get_feature_names_out()
total_weights = np.asarray(X_tfidf.sum(axis=0)).ravel()
# Negating the weights sorts descending; the stable sort keeps tied terms in
# alphabetical order so the output is deterministic.
top_indices = np.argsort(-total_weights, kind="stable")[:10]
print("\nTop features:", feature_names[top_indices])

### Sentiment Analysis

In [None]:
# Hand-written product reviews for sentiment analysis, grouped by class.
positive_reviews = [
    "Excellent product, highly recommended!",
    "Love it! Best purchase ever.",
    "Amazing quality and fast delivery.",
    "Perfect! Exactly what I needed.",
    "Outstanding service and product."
]

negative_reviews = [
    "Terrible quality, very disappointed.",
    "Waste of money, do not buy.",
    "Poor customer service and defective product.",
    "Not as described, requesting refund.",
    "Awful experience, would give zero stars."
]

# Concatenate the two groups (positives first) and build a matching label
# list: 1 for every positive review, 0 for every negative one.
all_reviews = [*positive_reviews, *negative_reviews]
sentiment_labels = [1 for _ in positive_reviews] + [0 for _ in negative_reviews]

# Two-stage pipeline: raw review text is converted to TF-IDF features, which
# are then classified by a multinomial Naive Bayes model.
pipeline_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]
sentiment_pipeline = Pipeline(steps=pipeline_steps)

# Fit both stages on the labeled reviews in one call.
sentiment_pipeline.fit(all_reviews, sentiment_labels)

# Unseen reviews used to try out the trained pipeline.
test_reviews = [
    "This is absolutely fantastic!",
    "Very poor quality and service",
    "Good value for money"
]

# Predicted class labels plus the per-class probabilities behind them.
predictions = sentiment_pipeline.predict(test_reviews)
probabilities = sentiment_pipeline.predict_proba(test_reviews)

print("Sentiment Predictions:")
for review, pred, prob in zip(test_reviews, predictions, probabilities):
    if pred == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    # The largest class probability is reported as the model's confidence.
    confidence = prob.max() * 100
    print(f"'{review}'")
    print(f"  -> {sentiment} ({confidence:.1f}% confident)\n")

## Word Embeddings

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
import tensorflow as tf

# Sizes shared between the text preparation and the model definition, named
# once so the tokenizer, the padding and the Embedding layer stay in agreement.
vocab_size = 1000
max_sequence_len = 20
embedding_dim = 16

# Prepare text data: map words to integer ids (unknown words become '<OOV>'),
# then pad every review at the end to a fixed length.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(all_reviews)

sequences = tokenizer.texts_to_sequences(all_reviews)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='post')

print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Padded sequence shape: {padded_sequences.shape}")

# Build LSTM model with embeddings: an embedding layer feeding two stacked
# LSTM layers (with dropout after each) and a single sigmoid output unit.
layer_stack = [
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_len),
    # return_sequences=True so the second LSTM receives the whole sequence.
    LSTM(32, return_sequences=True),
    Dropout(0.2),
    LSTM(16),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
]
lstm_model = Sequential(layer_stack)

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.summary()

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define parameter grid: 3 * 4 * 3 * 3 = 108 candidate settings. With 5-fold
# cross-validation that is 540 model fits, hence the warning printed below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Grid search: exhaustively evaluate every candidate by cross-validated
# accuracy; n_jobs=-1 lets scikit-learn use all available cores.
rf_grid = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_grid,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

print("Running grid search... (this may take a moment)")
grid_search.fit(X_train, y_train)

best_cv_pct = grid_search.best_score_ * 100
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {best_cv_pct:.2f}%")

# Test best model on the held-out test set.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test set accuracy: {test_score * 100:.2f}%")

## Model Evaluation and Interpretation

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Get predictions: hard class labels, plus the predicted probability from
# column 1 of predict_proba (the second class), which the ROC curve needs.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Confusion matrix, drawn as an annotated heatmap on its own figure.
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.show()

# ROC curve and the area under it.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Dashed diagonal as a visual reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
plt.show()

## Exercise

Build an advanced text classification system:
1. Collect or create a dataset of news articles with categories (politics, sports, technology, etc.)
2. Implement advanced text preprocessing (stemming, lemmatization, stop word removal)
3. Compare different vectorization methods (Bag of Words, TF-IDF, Word2Vec)
4. Build and compare multiple models (Naive Bayes, Random Forest, LSTM)
5. Use hyperparameter tuning to optimize the best model
6. Implement k-fold cross-validation
7. Create a confusion matrix and ROC curves
8. Build a simple deployment function that takes raw text and returns a prediction

In [None]:
# Your code here

