In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import re
import warnings
warnings.filterwarnings('ignore')

In [17]:
X_combined = pd.read_csv('data/engineered_features.csv')
df = pd.read_csv('data/data_cleaned.csv')

In [20]:
#Text Processing Functions

def clean_text(text):
    """Basic text cleaning - nothing fancy"""
    if pd.isna(text) or not text:
        return ""
    
    text = str(text).lower()
    # Keep letters, numbers, and spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Clean up extra spaces
    return ' '.join(text.split())


def combine_text_fields(df):
    """Combine review text and summary into one field"""
    data = df.copy()
    
    if 'combined_text' not in data.columns:
        # Start with empty strings
        data['combined_text'] = ''
        
        # Add review text if it exists
        if 'review/text' in data.columns:
            data['combined_text'] += data['review/text'].fillna('')
        
        # Add review summary if it exists
        if 'review/summary' in data.columns:
            # Add a space between text and summary if both exist
            data['combined_text'] += ' ' + data['review/summary'].fillna('')
        
        # Clean up any extra spaces
        data['combined_text'] = data['combined_text'].str.strip()
    
    data['clean_text'] = data['combined_text'].apply(clean_text)
    return data


In [9]:
#  Feature Engineering Functions

def extract_basic_stats(df):
    """Pull out basic text statistics"""
    data = df.copy()
    
    data['char_count'] = data['clean_text'].str.len()
    data['word_count'] = data['clean_text'].str.split().str.len().fillna(0)
    
    def avg_word_len(text):
        words = text.split() if text else []
        return np.mean([len(w) for w in words]) if words else 0
    
    data['avg_word_len'] = data['clean_text'].apply(avg_word_len)
    data['sentence_count'] = data['clean_text'].str.count(r'[.!?]') + 1
    data['words_per_sentence'] = data['word_count'] / data['sentence_count']
    
    return data


def extract_sentiment_signals(df):
    """Look for positive and negative sentiment indicators"""
    data = df.copy()
    
    good_words = ['good', 'great', 'excellent', 'amazing', 'love', 'perfect', 
                 'wonderful', 'fantastic', 'outstanding', 'recommend']
    bad_words = ['bad', 'terrible', 'awful', 'hate', 'horrible', 'worst', 
                'disappointing', 'waste', 'useless', 'poor']
    
    def count_sentiment(text, word_list):
        words = text.split() if text else []
        matches = sum(1 for word in words if word in word_list)
        return matches / max(len(words), 1)
    
    data['positive_signal'] = data['clean_text'].apply(lambda x: count_sentiment(x, good_words))
    data['negative_signal'] = data['clean_text'].apply(lambda x: count_sentiment(x, bad_words))
    
    return data


def extract_style_features(df):
    """Extract writing style indicators"""
    data = df.copy()
    
    def caps_ratio(text):
        if not text:
            return 0
        return sum(1 for c in str(text) if c.isupper()) / max(len(str(text)), 1)
    
    data['caps_ratio'] = data['combined_text'].apply(caps_ratio)
    data['punct_ratio'] = data['combined_text'].apply(
        lambda x: sum(1 for c in str(x) if c in '!?.,;:') / max(len(str(x)), 1)
    )
    
    return data


def build_all_features(df):
    """Combine all feature extraction steps"""
    data = combine_text_fields(df)
    data = extract_basic_stats(data)
    data = extract_sentiment_signals(data)
    data = extract_style_features(data)
    return data

In [10]:
# Data Preparation Functions

def prepare_text_features(df, vectorizer=None, vocab_size=10000):
    """Convert text to numerical vectors using TF-IDF"""
    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            max_features=vocab_size,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=3,
            max_df=0.9,
            sublinear_tf=True
        )
        text_vectors = vectorizer.fit_transform(df['clean_text'])
        return text_vectors, vectorizer
    else:
        text_vectors = vectorizer.transform(df['clean_text'])
        return text_vectors, vectorizer


def prepare_numeric_features(df, scaler=None):
    """Scale and prepare numerical features"""
    feature_cols = [
        'char_count', 'word_count', 'avg_word_len', 
        'sentence_count', 'words_per_sentence',
        'positive_signal', 'negative_signal', 
        'caps_ratio', 'punct_ratio'
    ]
    
    # Include helpfulness score if available
    if 'review/helpfulness' in df.columns:
        feature_cols.append('review/helpfulness')
    
    # Fill missing values
    numeric_data = df[feature_cols].copy()
    for col in feature_cols:
        if col in numeric_data.columns:
            numeric_data[col] = numeric_data[col].fillna(numeric_data[col].median())
    
    # Scale features
    if scaler is None:
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(numeric_data)
        return scaled_features, scaler
    else:
        scaled_features = scaler.transform(numeric_data)
        return scaled_features, scaler


def combine_features(text_vectors, numeric_features):
    """Combine text and numeric features into one matrix"""
    from scipy.sparse import hstack, csr_matrix
    return hstack([text_vectors, csr_matrix(numeric_features)])



In [11]:
# Main Classifier Class

class TextAnalyzer:
    """
    A text classifier that learns to identify helpful reviews
    Uses neural networks under the hood but keeps things simple
    """
    
    def __init__(self, layers=None, vocab_size=12000):
        self.layers = layers or (384, 192, 96)
        self.vocab_size = vocab_size
        self.brain = None
        self.text_processor = None
        self.feature_scaler = None
        self.test_data = None
        
    def train(self, df, target='is_helpful', validation_split=0.2, training_rounds=50):
        """Train the classifier on the data"""
        print("Training text classifier...")
        print(f"Working with {len(df)} reviews")
        
        # Build features
        processed = build_all_features(df)
        
        # Check target distribution
        target_counts = processed[target].value_counts()
        print(f"Target distribution: {dict(target_counts)}")
        
        # Prepare text features
        text_vectors, self.text_processor = prepare_text_features(
            processed, vocab_size=self.vocab_size
        )
        
        # Prepare numeric features
        numeric_features, self.feature_scaler = prepare_numeric_features(processed)
        
        # Combine all features
        X = combine_features(text_vectors, numeric_features)
        y = processed[target]
        
        print(f"Using {X.shape[1]} total features")
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=validation_split, random_state=42, stratify=y
        )
        
        print(f"Training on {X_train.shape[0]} samples, testing on {X_test.shape[0]}")
        
        # Create and train neural network
        self.brain = MLPClassifier(
            hidden_layer_sizes=self.layers,
            activation='relu',
            solver='adam',
            alpha=0.0001,
            learning_rate_init=0.001,
            max_iter=training_rounds,
            early_stopping=True,
            validation_fraction=0.15,
            n_iter_no_change=10,
            random_state=42
        )
        
        print(f"Training neural network with layers: {self.layers}")
        self.brain.fit(X_train, y_train)
        
        # Evaluate performance
        self._evaluate_training(X_train, X_test, y_train, y_test)
        self.test_data = (X_test, y_test)
        
        return self
    
    def _evaluate_training(self, X_train, X_test, y_train, y_test):
        """Internal method to evaluate training results"""
        train_pred = self.brain.predict(X_train)
        test_pred = self.brain.predict(X_test)
        train_proba = self.brain.predict_proba(X_train)[:, 1]
        test_proba = self.brain.predict_proba(X_test)[:, 1]
        
        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)
        train_auc = roc_auc_score(y_train, train_proba)
        test_auc = roc_auc_score(y_test, test_proba)
        
        print(f"\nResults:")
        print(f"Training accuracy: {train_acc:.3f}")
        print(f"Test accuracy: {test_acc:.3f}")
        print(f"Training AUC: {train_auc:.3f}")
        print(f"Test AUC: {test_auc:.3f}")
    
    def predict(self, input_data, with_confidence=True):
        """Make predictions on new data"""
        if self.brain is None:
            raise ValueError("Need to train the model first!")
        
        # Handle different input types
        if isinstance(input_data, str):
            temp_df = pd.DataFrame({'combined_text': [input_data]})
        elif isinstance(input_data, list):
            temp_df = pd.DataFrame({'combined_text': input_data})
        else:
            temp_df = input_data
        
        # Process features
        processed = build_all_features(temp_df)
        text_vectors, _ = prepare_text_features(processed, self.text_processor)
        numeric_features, _ = prepare_numeric_features(processed, self.feature_scaler)
        X = combine_features(text_vectors, numeric_features)
        
        predictions = self.brain.predict(X)
        
        if with_confidence:
            probabilities = self.brain.predict_proba(X)[:, 1]
            return predictions, probabilities
        
        return predictions
    
    def check_review(self, text):
        """Analyze a single review"""
        pred, prob = self.predict([text])
        
        return {
            'text_preview': text[:150] + "..." if len(text) > 150 else text,
            'helpful': bool(pred[0]),
            'confidence': float(prob[0]),
            'prediction': 'Helpful' if pred[0] else 'Not Helpful'
        }
    
    def analyze_performance(self):
        """Detailed performance analysis"""
        if not self.test_data:
            print("No test data available - train the model first")
            return
        
        X_test, y_test = self.test_data
        predictions = self.brain.predict(X_test)
        probabilities = self.brain.predict_proba(X_test)[:, 1]
        
        acc = accuracy_score(y_test, predictions)
        auc = roc_auc_score(y_test, probabilities)
        
        print(f"Model Performance:")
        print(f"Accuracy: {acc:.3f}")
        print(f"AUC Score: {auc:.3f}")
        print(f"Trained for {self.brain.n_iter_} iterations")
        
        print("\nDetailed breakdown:")
        print(classification_report(y_test, predictions, 
                                  target_names=['Not Helpful', 'Helpful']))


In [14]:
# Helper Functions


def build_classifier(data, hidden_layers=(256, 128), vocab_size=10000):
    """Build and train a text classifier"""
    classifier = TextAnalyzer(layers=hidden_layers, vocab_size=vocab_size)
    classifier.train(data, training_rounds=100)
    classifier.analyze_performance()
    return classifier


def demo_predictions(classifier, data, num_examples=3):
    """Show some example predictions"""
    print("\nExample predictions:")
    print("-" * 50)
    
    samples = data.sample(n=min(num_examples, len(data)))
    
    for _, row in samples.iterrows():
        text = row.get('review/text', row.get('combined_text', ''))
        actual = row['is_helpful']
        
        result = classifier.check_review(text)
        
        print(f"Text: {result['text_preview']}")
        print(f"Actual: {'Helpful' if actual else 'Not Helpful'}")
        print(f"Predicted: {result['prediction']} ({result['confidence']:.3f})")
        print("-" * 50)

# Quick start
if __name__ == "__main__":
    print("Organic Text Classifier")
    print("Usage: classifier = build_classifier(your_dataframe)")
    print("Then: result = classifier.check_review('your text here')")

Organic Text Classifier
Usage: classifier = build_classifier(your_dataframe)
Then: result = classifier.check_review('your text here')


In [21]:
classifier = build_classifier(df)

Training text classifier...
Working with 13477 reviews
Target distribution: {0: np.int64(6879), 1: np.int64(6598)}
Using 10010 total features
Training on 10781 samples, testing on 2696
Training neural network with layers: (256, 128)

Results:
Training accuracy: 0.993
Test accuracy: 0.961
Training AUC: 1.000
Test AUC: 0.995
Model Performance:
Accuracy: 0.961
AUC Score: 0.995
Trained for 13 iterations

Detailed breakdown:
              precision    recall  f1-score   support

 Not Helpful       0.97      0.95      0.96      1376
     Helpful       0.95      0.97      0.96      1320

    accuracy                           0.96      2696
   macro avg       0.96      0.96      0.96      2696
weighted avg       0.96      0.96      0.96      2696

