In [6]:
import pandas as pd
from datasets import load_dataset

In [8]:
import zipfile

# Unpack the downloaded archive into ./unzipped so the compressed
# train/test files can be read from disk by the next cell.
archive_path, target_dir = 'amazonreviews.zip', 'unzipped'
with zipfile.ZipFile(archive_path) as archive:
    archive.extractall(path=target_dir)

In [None]:
# The .ft.txt files hold one raw "__label__N text" line per row; they are not
# real CSV. Quote handling is therefore switched off (quoting=3 is the value of
# csv.QUOTE_NONE) so a double quote in review text is kept as a literal
# character instead of being interpreted as CSV quoting.
fasttext_read_options = dict(compression='bz2', delimiter='\t', header=None, quoting=3)

train_data = pd.read_csv('unzipped/train.ft.txt.bz2', **fasttext_read_options)
test_data = pd.read_csv('unzipped/test.ft.txt.bz2', **fasttext_read_options)

In [None]:
# Preview the first rows of each raw frame to confirm both files parsed.
for raw_frame in (train_data, test_data):
    print(raw_frame.head())

In [None]:
def data_view(file):
    """Split raw fastText-style lines into a label/content DataFrame.

    Every line is expected to look like ``__label__<value> <text>``: the
    first space separates the label token from the review text.

    Args:
        file: DataFrame whose first column (column ``0`` when the file was
            read with ``header=None``) holds one raw line per row.

    Returns:
        pandas.DataFrame with two columns: ``label`` (the label token with
        the ``__label__`` prefix removed) and ``content`` (the remaining
        text with surrounding whitespace stripped). A line that carries a
        label but no text yields an empty ``content`` string.
    """
    cols = ['label', 'content']
    if file.empty:
        return pd.DataFrame([], columns=cols)

    # Prefer the column labelled 0; fall back to the first column by position.
    lines = file[0] if 0 in file.columns else file.iloc[:, 0]

    data = []
    # Iterating the column directly avoids building a Series per row,
    # which is what made iterrows() slow on millions of lines.
    for line in lines:
        # partition() never raises: with no space present the text is ''.
        label, _, text = line.partition(' ')
        data.append((label.replace('__label__', ''), text.strip()))
    return pd.DataFrame(data, columns=cols)

# Parse both raw splits into label/content frames, then make sure the
# training text column has no missing values before any text processing.
train, test = data_view(train_data), data_view(test_data)
train['content'] = train['content'].fillna('')

In [None]:
# Check how many training reviews carry each label.
label_counts = train['label'].value_counts()
print(label_counts)

# Hypothesis 1: positive reviews are longer

In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer: at most 5000 features, with
# scikit-learn's built-in English stop-word list removed.
tfidf_options = {'max_features': 5000, 'stop_words': 'english'}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the training reviews and vectorize them.
X = vectorizer.fit_transform(train['content'])

# Terms corresponding to the columns of X.
feature_names = vectorizer.get_feature_names_out()

# Sanity report on the resulting matrix and vocabulary.
n_features = len(feature_names)
print("Shape of TF-IDF matrix:", X.shape)
print("Number of features:", n_features)
print("First few feature names:", feature_names[:10])

In [None]:
# Character count of every review, then the average count per label.
train['review_length'] = train['content'].map(len)
length_means = train['review_length'].groupby(train['label']).mean()
print(length_means)

# Write the conclusion on the length hypothesis here

# Hypothesis 2: Are reviews that mention shipping more positive?

In [None]:
# Keywords whose presence marks a review as mentioning shipping.
shipping_keywords = [
    'shipping', 'delivered', 'delivery', 'amazon logistics',
    'ship', 'package', 'shipment', 'carrier', 'arrived'
]

# All keywords combined into one whole-word alternation, compiled once so each
# review needs a single regex scan. Keywords are escaped so they always match
# literally. Re-run this cell after editing shipping_keywords to rebuild it.
_shipping_pattern = re.compile(
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in shipping_keywords) + r')\b'
)


def has_shipping_mention(text):
    """Tell whether a review mentions any shipping keyword as a whole word.

    Args:
        text: Review text; it is lower-cased before matching, so the check
            is case-insensitive.

    Returns:
        bool: True when at least one entry of ``shipping_keywords`` occurs
        delimited by word boundaries (so 'ship' does not match 'shipped'
        or 'spaceship'), otherwise False.
    """
    return _shipping_pattern.search(text.lower()) is not None

In [None]:
# Flag every training review whose text contains a shipping keyword.
train['has_shipping_mention'] = train['content'].map(has_shipping_mention)

# Share of each label (in percent) within the mention / no-mention groups.
label_share = train.groupby('has_shipping_mention')['label'].value_counts(normalize=True)
shipping_sentiment_summary = label_share.unstack().mul(100)

print("Sentiment Distribution of Reviews:")
print("(Percentage of reviews)")
print(shipping_sentiment_summary)

# Number of reviews with and without a shipping mention.
shipping_mention_counts = train['has_shipping_mention'].value_counts()
print("\nCount of Reviews:")
print(shipping_mention_counts)

# Visualize with a bar plot
import matplotlib.pyplot as plt

# Stacked bars: one bar per group (mention / no mention), split by label share.
ax = shipping_sentiment_summary.plot.bar(stacked=True)
ax.set_title('Sentiment Distribution by Shipping Mention')
ax.set_xlabel('Contains Shipping Mention')
ax.set_ylabel('Percentage')
# Legend entries are supplied by hand, in the order the label columns are plotted.
ax.legend(title='Label', labels=['Negative', 'Positive'])
plt.tight_layout()
plt.show()

# As per the plot, reviews that mention shipping are actually more negative.

# Sentiment Analysis

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
def extract_top_tfidf_terms(tfidf_matrix, feature_names, label_indices, n_terms=3000):
    """Return the terms with the highest mean TF-IDF score among selected rows.

    Args:
        tfidf_matrix: Document-term matrix (SciPy sparse or dense 2-D array)
            with one row per document and one column per term.
        feature_names: Sequence mapping a column index to its term.
        label_indices: Row selector (boolean mask or index array, e.g. a
            pandas boolean Series) picking the documents of one label.
        n_terms: Maximum number of terms to return.

    Returns:
        list: Up to ``n_terms`` terms, ordered from lowest to highest mean
        score. Empty when ``n_terms`` is not positive or when no rows are
        selected.
    """
    if n_terms <= 0:
        # A ``[-0:]`` slice would select every term instead of none.
        return []

    selected_rows = tfidf_matrix[np.asarray(label_indices)]
    if selected_rows.shape[0] == 0:
        # No documents for this label: there is nothing to average.
        return []

    # Average column-wise without calling toarray(), so a large sparse
    # matrix is never expanded into a dense one. The mean of a sparse matrix
    # comes back 2-D, hence the flattening.
    avg_tfidf = np.asarray(selected_rows.mean(axis=0)).ravel()
    top_indices = np.argsort(avg_tfidf)[-n_terms:]

    return [feature_names[i] for i in top_indices]

def construct_binary_representation(text, significant_terms):
    """Map each significant term to whether it occurs in the text.

    The text is lower-cased and tokenized with ``word_tokenize``; a term
    counts as present only when it equals one of the resulting tokens.
    The returned dict follows the iteration order of ``significant_terms``.
    """
    present_tokens = set(word_tokenize(text.lower()))
    presence = {}
    for term in significant_terms:
        presence[term] = term in present_tokens
    return presence


In [None]:
# Identify top terms for each label and pool them into one vocabulary.
print("Extracting significant terms per label...")
term_pool = set()

for label in train['label'].unique():
    label_indices = train['label'] == label
    top_label_terms = extract_top_tfidf_terms(X, feature_names, label_indices)
    term_pool.update(top_label_terms)

# Freeze the pool into a sorted list. Set iteration order for strings can
# change between kernel sessions (string hashing is randomized), which would
# silently reorder the feature columns built from these terms.
significant_terms = sorted(term_pool)

print(f"Total unique significant terms: {len(significant_terms)}")

# Turn every training review into a row of 0/1 flags, one per significant term.
print("Generating binary features for training data...")
X_train_binary = np.array([
    [int(flag) for flag in construct_binary_representation(review, significant_terms).values()]
    for review in train['content']
])
y_train = train['label']

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps
# the split reproducible between runs.
split_parts = train_test_split(X_train_binary, y_train, test_size=0.2, random_state=42)
X_train_split, X_val, y_train_split, y_val = split_parts

In [None]:
# Fit a logistic-regression classifier on the training portion of the split.
print("Training classifier...")
classifier = LogisticRegression(max_iter=1000).fit(X_train_split, y_train_split)

# Score the held-out validation rows and print the per-label metrics.
print("Validation set evaluation:")
y_val_pred = classifier.predict(X_val)
validation_report = classification_report(y_val, y_val_pred)
print(validation_report)

In [None]:
def predict_text_category(text):
    """Classify a single piece of text with the trained classifier.

    The text is encoded as 0/1 flags over ``significant_terms`` and passed
    to ``classifier`` as a one-row matrix.

    Returns:
        tuple: (predicted label, class-probability array for that row).
    """
    term_flags = construct_binary_representation(text, significant_terms)
    # A single-row 2-D array: the classifier expects (n_samples, n_features).
    row = np.array([[int(flag) for flag in term_flags.values()]])
    predicted = classifier.predict(row)[0]
    class_probabilities = classifier.predict_proba(row)[0]
    return predicted, class_probabilities


In [None]:
# Encode the test reviews with the same significant-term flags used in training.
print("Generating binary features for test data...")
X_test_binary = np.array([
    [int(flag) for flag in construct_binary_representation(review, significant_terms).values()]
    for review in test['content']
])
y_test = test['label']

# Score the test set and print the per-label metrics.
print("Test set evaluation:")
y_test_pred = classifier.predict(X_test_binary)
test_report = classification_report(y_test, y_test_pred)
print(test_report)

# Run one hand-written sentence through the full prediction path.
example_text = "If I was a new user, I would be reluctant to buy this product."
example_result = predict_text_category(example_text)
predicted_label, confidence_scores = example_result
print(f"Predicted Label: {predicted_label}")
print(f"Confidence Scores: {confidence_scores}")