In [3]:
# # Amazon Review Analysis and Prediction
# This notebook performs data extraction, preprocessing, feature engineering, and modeling on Amazon review data.

## **1. Import Libraries and Define Utility Functions**
# Import essential libraries
import sys
import os
import pandas as pd
import numpy as np
import joblib
import re
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from google.cloud import bigquery

# Download NLTK resources if not already available
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Custom Exception Class for Handling Errors
class CustomException(Exception):
    def __init__(self, message, original_exception):
        super().__init__(message)
        self.original_exception = original_exception

# Class to Handle TF-IDF Vectorization in Pipelines
class TfidfVectorizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, max_features=100):
        self.max_features = max_features
        self.vectorizer = TfidfVectorizer(max_features=self.max_features)

    def fit(self, X, y=None):
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        return self.vectorizer.transform(X).toarray()

In [5]:
def flesch_kincaid(text):
    """Calculate Flesch-Kincaid readability score."""
    try:
        words = word_tokenize(text)
        sentences = len(re.split(r'[.!?]', text))
        syllables = sum([len([s for s in word if s in 'aeiou']) for word in words])
        if len(words) == 0 or sentences == 0:
            return np.nan
        return 206.835 - (1.015 * (len(words) / sentences)) - (84.6 * (syllables / len(words)))
    except Exception as e:
        raise CustomException("Error in flesch_kincaid calculation", e)

def pos_counts(text):
    """Calculate counts of nouns, verbs, and adjectives in text."""
    try:
        words = word_tokenize(text)
        pos_tags = nltk.pos_tag(words)
        pos_counts = {"nouns": 0, "verbs": 0, "adjectives": 0}
        for _, tag in pos_tags:
            if tag.startswith('N'):
                pos_counts["nouns"] += 1
            elif tag.startswith('V'):
                pos_counts["verbs"] += 1
            elif tag.startswith('J'):
                pos_counts["adjectives"] += 1
        return pd.Series(pos_counts)
    except Exception as e:
        raise CustomException("Error calculating POS counts", e)

def add_feature_columns(df):
    """Adds various feature columns to the DataFrame based on text analysis."""
    try:
        df['review_length'] = df['review_text'].apply(len)
        df['review_word_count'] = df['review_text'].apply(lambda x: len(word_tokenize(x)))
        df['review_sentiment'] = df['review_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
        df['review_subjectivity'] = df['review_text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
        df['flesch_kincaid'] = df['review_text'].apply(flesch_kincaid)

        # Add keyword-based features
        keywords = ['good', 'bad', 'recommend', 'disappoint', 'excellent']
        for keyword in keywords:
            df[f'keyword_{keyword}'] = df['review_text'].apply(lambda x: int(keyword in x.lower()))

        # Calculate rating deviation
        avg_rating = df['rating'].mean()
        df['rating_deviation'] = df['rating'] - avg_rating
        df['title_sentiment'] = df['title'].apply(lambda x: TextBlob(x).sentiment.polarity)
        df['title_length'] = df['title'].apply(len)

        # Part of speech counts
        pos_features = df['review_text'].apply(pos_counts)
        df = pd.concat([df, pos_features], axis=1)

        # Negation and pronoun counts
        negations = ["not", "no", "never", "none"]
        df['negation_count'] = df['review_text'].apply(lambda x: sum([x.lower().count(neg) for neg in negations]))

        pronouns = ["i", "we", "you", "he", "she", "they"]
        df['pronoun_count'] = df['review_text'].apply(lambda x: sum([x.lower().count(pronoun) for pronoun in pronouns]))
        df['helpful_to_length_ratio'] = df['helpful_votes'] / (df['review_length'] + 1)
        
        return df
    except Exception as e:
        raise CustomException("Error in add_feature_columns function", e)


In [6]:
def load_data_from_bigquery(query):
    client = bigquery.Client()
    try:
        df = client.query(query).to_dataframe()
        print("Data loaded from BigQuery successfully.")
        return df
    except Exception as e:
        raise CustomException("Error loading data from BigQuery", e)

def preprocess_data(df):
    """Preprocesses data for model training."""
    try:
        df = add_feature_columns(df)
        bins = [0, 1, 5, float("inf")]
        labels = ["low", "medium", "high"]
        df['helpfulness_class'] = pd.cut(df['helpful_votes'], bins=bins, labels=labels)
        label_encoder = LabelEncoder()
        df['helpfulness_class_encoded'] = label_encoder.fit_transform(df['helpfulness_class'])
        
        X = df.drop(columns=['helpful_votes', 'helpfulness_class', 'helpfulness_class_encoded'])
        y = df['helpfulness_class_encoded']
        
        # Numerical and text transformations
        numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        
        review_text_transformer = TfidfVectorizerTransformer(max_features=100)
        title_transformer = TfidfVectorizerTransformer(max_features=100)
        
        # Concatenate transformations
        X_num = numerical_transformer.fit_transform(X[numerical_cols])
        X_review_text = review_text_transformer.fit_transform(X['review_text'])
        X_title = title_transformer.fit_transform(X['title'])
        
        return np.hstack([X_num, X_review_text, X_title]), y
    except Exception as e:
        raise CustomException("Error in preprocess_data function", e)

def train_model(X_train, y_train):
    """Trains the model."""
    try:
        model = Pipeline(steps=[
            ('classifier', RandomForestClassifier(random_state=42))
        ])
        model.fit(X_train, y_train)
        return model
    except Exception as e:
        raise CustomException("Error training the model", e)


In [7]:
query = "SELECT * FROM `airy-box-431604-j9.amazon_reviews.clean_data`"
df = load_data_from_bigquery(query)
X, y = preprocess_data(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = train_model(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Model accuracy:", accuracy)
print("Classification report:\n", report)

Data loaded from BigQuery successfully.


CustomException: Error in preprocess_data function