<a href="https://colab.research.google.com/github/pennpennyu/GitHubSentinel/blob/main/Google_Play_Store_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================================================================
# TECH68 Project: Google Play Store Sentiment Analysis
# Comparative Study: Traditional Machine Learning vs Few-Shot LLM Approach
# ===============================================================================
# Authors: Yaxing Yu & Shivam Goel
# Dataset: Google Play Store User Reviews (Kaggle)
# Purpose: Compare traditional ML (Random Forest) with Few-Shot LLM (GPT-4o-mini)
#          for sentiment classification of app reviews.
# ===============================================================================

from google.colab import files
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("="*80)
print("GOOGLE PLAY STORE SENTIMENT ANALYSIS")
print("="*80)

# ===============================================================================
# SECTION 1: Data Loading
# ===============================================================================

print("\nPlease upload 'googleplaystore_user_reviews.csv'")
uploaded = files.upload()
filename = list(uploaded.keys())[0]
data = pd.read_csv(io.BytesIO(uploaded[filename]))

print(f"\nDataset loaded successfully!")
print(f"Shape: {data.shape[0]:,} rows x {data.shape[1]} columns")
display(data.head())
print(data.info())


GOOGLE PLAY STORE SENTIMENT ANALYSIS

Please upload 'googleplaystore_user_reviews.csv'


Saving googleplaystore_user_reviews.csv to googleplaystore_user_reviews (5).csv
Saving googleplaystore.csv to googleplaystore (5).csv
Saving license.txt to license (5).txt

Dataset loaded successfully!
Shape: 64,295 rows x 5 columns


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB
None


In [None]:
# ===============================================================================
# SECTION 2: Missing Data Analysis
# ===============================================================================

print("\n" + "="*80)
print("STEP 2: MISSING DATA ANALYSIS")
print("="*80)

missing_df = pd.DataFrame({
    'Column': data.columns,
    'Missing Count': data.isnull().sum().values,
    'Missing %': (data.isnull().sum() / len(data) * 100).round(2).values
})
display(missing_df[missing_df['Missing Count'] > 0])

data_processed = data.copy()
data_processed['Translated_Review_Clean'] = data_processed['Translated_Review'].fillna('[No Review]')

if 'Sentiment_Polarity' in data_processed.columns:
    def infer_sentiment_from_polarity(row):
        if pd.notna(row['Sentiment']):
            return row['Sentiment']
        elif pd.notna(row['Sentiment_Polarity']):
            if row['Sentiment_Polarity'] > 0.1:
                return 'Positive'
            elif row['Sentiment_Polarity'] < -0.1:
                return 'Negative'
            else:
                return 'Neutral'
        else:
            return 'Neutral'
    data_processed['Sentiment_Inferred'] = data_processed.apply(infer_sentiment_from_polarity, axis=1)
else:
    data_processed['Sentiment_Inferred'] = data_processed['Sentiment'].fillna('Neutral')

data_clean = data_processed[
    (data_processed['Translated_Review_Clean'] != '') &
    (data_processed['Sentiment_Inferred'].notna())
].copy()

data_clean['Text'] = data_clean['Translated_Review_Clean']
data_clean['Sentiment'] = data_clean['Sentiment_Inferred']

print(f"\nClean dataset: {data_clean.shape[0]:,} samples")


STEP 2: MISSING DATA ANALYSIS


Unnamed: 0,Column,Missing Count,Missing %
1,Translated_Review,26868,41.79
2,Sentiment,26863,41.78
3,Sentiment_Polarity,26863,41.78
4,Sentiment_Subjectivity,26863,41.78



Clean dataset: 64,295 samples


In [None]:
# ===============================================================================
# SECTION 3: Class Distribution & Balancing
# ===============================================================================

print("\n" + "="*80)
print("STEP 3: CLASS DISTRIBUTION & BALANCING")
print("="*80)

sentiment_counts = data_clean['Sentiment'].value_counts()
print("\nOriginal distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"   {sentiment}: {count:,} ({count/len(data_clean)*100:.1f}%)")

colors = {'Negative': '#d62728', 'Neutral': '#ffbb78', 'Positive': '#2ca02c'}

class_counts = data_clean['Sentiment'].value_counts()
target_size = int(class_counts.median())

df_negative = data_clean[data_clean['Sentiment'] == 'Negative']
df_neutral = data_clean[data_clean['Sentiment'] == 'Neutral']
df_positive = data_clean[data_clean['Sentiment'] == 'Positive']

df_negative_balanced = resample(df_negative, n_samples=target_size, random_state=42, replace=True)
df_neutral_balanced = resample(df_neutral, n_samples=target_size, random_state=42, replace=False)
df_positive_balanced = resample(df_positive, n_samples=target_size, random_state=42, replace=False)

data_balanced = pd.concat([df_negative_balanced, df_neutral_balanced, df_positive_balanced])
data_balanced = data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nBalanced dataset: {data_balanced.shape[0]:,} samples")



STEP 3: CLASS DISTRIBUTION & BALANCING

Original distribution:
   Neutral: 32,026 (49.8%)
   Positive: 23,998 (37.3%)
   Negative: 8,271 (12.9%)

Balanced dataset: 71,994 samples


In [None]:
# ===============================================================================
# SECTION 4: Train-Test Split
# ===============================================================================

print("\n" + "="*80)
print("STEP 4: TRAIN-TEST SPLIT")
print("="*80)

sentiment_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
data_balanced['Sentiment_Numeric'] = data_balanced['Sentiment'].map(sentiment_map)

X_train, X_temp, y_train, y_temp = train_test_split(
    data_balanced['Text'], data_balanced['Sentiment_Numeric'],
    test_size=0.4, random_state=42, stratify=data_balanced['Sentiment_Numeric']
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Training: {len(X_train):,} | Validation: {len(X_val):,} | Test: {len(X_test):,}")


STEP 4: TRAIN-TEST SPLIT
Training: 43,196 | Validation: 14,399 | Test: 14,399


In [None]:
# ===============================================================================
# SECTION 5: Traditional ML Training
# ===============================================================================

print("\n" + "="*80)
print("STEP 5: TRADITIONAL ML TRAINING")
print("="*80)

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Naive Bayes': MultinomialNB()
}

results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    pipeline = make_pipeline(
        TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english'),
        model
    )
    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict(X_val)

    accuracy = metrics.accuracy_score(y_val, y_val_pred)
    f1_macro = f1_score(y_val, y_val_pred, average='macro')
    results[name] = {'Accuracy': accuracy, 'F1-Macro': f1_macro}
    print(f"   Accuracy: {accuracy:.4f} | F1: {f1_macro:.4f}")

results_df = pd.DataFrame(results).T
best_model_name = results_df['F1-Macro'].idxmax()
print(f"\nBest Model: {best_model_name}")

final_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english'),
    models[best_model_name]
)
final_pipeline.fit(X_train, y_train)


STEP 5: TRADITIONAL ML TRAINING

Training Logistic Regression...
   Accuracy: 0.9272 | F1: 0.9276

Training Random Forest...
   Accuracy: 0.9528 | F1: 0.9528

Training Naive Bayes...
   Accuracy: 0.8559 | F1: 0.8582

Best Model: Random Forest


In [None]:
# ===============================================================================
# SECTION 6: Traditional ML Evaluation
# ===============================================================================

print("\n" + "="*80)
print("STEP 6: TRADITIONAL ML EVALUATION")
print("="*80)

y_test_pred = final_pipeline.predict(X_test)
print(classification_report(y_test, y_test_pred, target_names=['Negative', 'Neutral', 'Positive']))

traditional_ml_f1 = f1_score(y_test, y_test_pred, average='macro')
traditional_ml_accuracy = metrics.accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {traditional_ml_accuracy:.4f} | F1: {traditional_ml_f1:.4f}")



STEP 6: TRADITIONAL ML EVALUATION
              precision    recall  f1-score   support

    Negative       0.93      0.98      0.95      4800
     Neutral       0.98      0.95      0.96      4799
    Positive       0.95      0.92      0.94      4800

    accuracy                           0.95     14399
   macro avg       0.95      0.95      0.95     14399
weighted avg       0.95      0.95      0.95     14399

Test Accuracy: 0.9510 | F1: 0.9510


In [None]:
# ===============================================================================
# SECTION 7: Few-Shot LLM Setup
# ===============================================================================

print("\n" + "="*80)
print("PART 2: FEW-SHOT LLM APPROACH")
print("="*80)

import subprocess
subprocess.run(['pip', 'install', '-q', 'openai'], check=True)

from openai import OpenAI
from google.colab import userdata
import json
import time
from tqdm import tqdm

try:
    openai_api_key = userdata.get('OPENAI_API_KEY')
except:
    openai_api_key = userdata.get('open_ai_key')

client = OpenAI(api_key=openai_api_key)
print("OpenAI initialized")

# REDUCED TO 500 SAMPLES FOR FASTER PROCESSING
sample_size = 500
llm_test_data = data_balanced.groupby('Sentiment', group_keys=False).apply(
    lambda x: x.sample(min(len(x), sample_size // 3), random_state=42)
).sample(frac=1, random_state=42).reset_index(drop=True)
llm_test_data['Sentiment_Numeric'] = llm_test_data['Sentiment'].map(sentiment_map)

print(f"\nLLM test sample: {len(llm_test_data):,} reviews (500 samples)")
print(f"Estimated time: ~25 minutes")
print(f"Estimated cost: ~$1.00")


PART 2: FEW-SHOT LLM APPROACH
OpenAI initialized

LLM test sample: 498 reviews (500 samples)
Estimated time: ~25 minutes
Estimated cost: ~$1.00


In [None]:
# ===============================================================================
# SECTION 8: Few-Shot Classifier
# ===============================================================================

def classify_few_shot(review):
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": """You are a sentiment classifier. Classify reviews as Positive, Neutral, or Negative.

Examples:
- "Amazing app! Love it!" -> {"sentiment": "Positive", "confidence": 0.95}
- "Terrible, crashes constantly" -> {"sentiment": "Negative", "confidence": 0.95}
- "It's okay, nothing special" -> {"sentiment": "Neutral", "confidence": 0.85}

Return JSON: {"sentiment": "Positive/Neutral/Negative", "confidence": 0.0-1.0}"""},
                {"role": "user", "content": f'Classify: "{review[:500]}"'}
            ],
            temperature=0.0,
            max_tokens=50
        )
        result = json.loads(response.choices[0].message.content)
        sentiment = result.get("sentiment", "Neutral")
        confidence = float(result.get("confidence", 0.5))
        sentiment_to_num = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
        return sentiment_to_num.get(sentiment, 1), confidence, sentiment
    except:
        return 1, 0.5, "Neutral"

print("Classifier ready")

Classifier ready


In [None]:
# ===============================================================================
# SECTION 9: Run LLM Classification (500 samples)
# ===============================================================================

print("\n" + "="*80)
print("STEP 7: RUNNING LLM CLASSIFICATION (500 samples)")
print("="*80)

predictions = []
confidences = []
sentiment_labels = []

print("Processing 500 reviews (this will take ~25 minutes)...")
for idx, review in enumerate(tqdm(llm_test_data['Text'], desc="Classifying")):
    pred_num, conf, sent_label = classify_few_shot(review)
    predictions.append(pred_num)
    confidences.append(conf)
    sentiment_labels.append(sent_label)
    if (idx + 1) % 20 == 0:
        time.sleep(1)
    else:
        time.sleep(0.1)

llm_test_data['LLM_Prediction'] = predictions
llm_test_data['LLM_Confidence'] = confidences
llm_test_data['LLM_Sentiment_Label'] = sentiment_labels



STEP 7: RUNNING LLM CLASSIFICATION (500 samples)
Processing 500 reviews (this will take ~25 minutes)...


Classifying: 100%|██████████| 498/498 [07:46<00:00,  1.07it/s]


In [None]:
# ===============================================================================
# SECTION 10: LLM Results
# ===============================================================================

print("\n" + "="*80)
print("STEP 8: LLM RESULTS (500 samples)")
print("="*80)

llm_accuracy = metrics.accuracy_score(llm_test_data['Sentiment_Numeric'], llm_test_data['LLM_Prediction'])
llm_f1_macro = f1_score(llm_test_data['Sentiment_Numeric'], llm_test_data['LLM_Prediction'], average='macro')

print(f"Accuracy: {llm_accuracy:.4f} | F1: {llm_f1_macro:.4f}")
print("\nClassification Report:")
print(classification_report(
    llm_test_data['Sentiment_Numeric'],
    llm_test_data['LLM_Prediction'],
    target_names=['Negative', 'Neutral', 'Positive']
))



STEP 8: LLM RESULTS (500 samples)
Accuracy: 0.7309 | F1: 0.7238

Classification Report:
              precision    recall  f1-score   support

    Negative       0.76      0.74      0.75       166
     Neutral       0.65      0.93      0.76       166
    Positive       0.89      0.52      0.66       166

    accuracy                           0.73       498
   macro avg       0.76      0.73      0.72       498
weighted avg       0.76      0.73      0.72       498



In [None]:
# ===============================================================================
# SECTION 11: Comparison
# ===============================================================================

print("\n" + "="*80)
print("STEP 9: COMPARISON (500 sample comparison)")
print("="*80)

ml_predictions = final_pipeline.predict(llm_test_data['Text'])
ml_f1 = f1_score(llm_test_data['Sentiment_Numeric'], ml_predictions, average='macro')
ml_accuracy = metrics.accuracy_score(llm_test_data['Sentiment_Numeric'], ml_predictions)

print(f"\nTraditional ML: Accuracy={ml_accuracy:.4f}, F1={ml_f1:.4f}")
print(f"Few-Shot LLM:   Accuracy={llm_accuracy:.4f}, F1={llm_f1_macro:.4f}")

if llm_f1_macro > ml_f1:
    print(f"\nLLM outperforms by {(llm_f1_macro - ml_f1):.4f}")
else:
    print(f"\nML outperforms by {(ml_f1 - llm_f1_macro):.4f}")


STEP 9: COMPARISON (500 sample comparison)

Traditional ML: Accuracy=0.9859, F1=0.9859
Few-Shot LLM:   Accuracy=0.7309, F1=0.7238

ML outperforms by 0.2621


In [None]:
# ===============================================================================
# SECTION 12: Save Results
# ===============================================================================

print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

comparison = pd.DataFrame({
    'Method': ['Traditional ML', 'Few-Shot LLM'],
    'Model': [best_model_name, 'GPT-4o-mini'],
    'Accuracy': [ml_accuracy, llm_accuracy],
    'F1-Macro': [ml_f1, llm_f1_macro],
    'Sample Size': [500, 500]
})

comparison.to_csv('results.csv', index=False)
llm_test_data.to_csv('predictions.csv', index=False)
print("Saved: results.csv, predictions.csv")

display(comparison)

print("\n" + "="*80)
print("ANALYSIS COMPLETE (500 samples processed)")
print("="*80)




SAVING RESULTS
Saved: results.csv, predictions.csv


Unnamed: 0,Method,Model,Accuracy,F1-Macro,Sample Size
0,Traditional ML,Random Forest,0.985944,0.985893,500
1,Few-Shot LLM,GPT-4o-mini,0.730924,0.723822,500



ANALYSIS COMPLETE (500 samples processed)


In [None]:
# ===============================================================================
# SECTION 13: INTERACTIVE REVIEW TESTING
# ===============================================================================

print("\n" + "="*80)
print("INTERACTIVE REVIEW TESTING")
print("="*80)

import ipywidgets as widgets
from IPython.display import display, clear_output

# Create widgets
review_input = widgets.Textarea(
    value='',
    placeholder='Type your app review here...',
    description='Review:',
    layout=widgets.Layout(width='80%', height='100px')
)

predict_button = widgets.Button(
    description='Predict Sentiment',
    button_style='success',
    tooltip='Click to predict',
    icon='check'
)

clear_button = widgets.Button(
    description='Clear',
    button_style='warning',
    tooltip='Clear results',
    icon='refresh'
)

output_area = widgets.Output()

def predict_user_review(review_text):
    """Predict sentiment for user-provided review using both models"""

    if not review_text.strip():
        print("Please enter a valid review.")
        return

    # Traditional ML Prediction
    ml_pred = final_pipeline.predict([review_text])[0]
    ml_proba = final_pipeline.predict_proba([review_text])[0]
    sentiment_names = ['Negative', 'Neutral', 'Positive']
    ml_sentiment = sentiment_names[ml_pred]
    ml_confidence = ml_proba[ml_pred]

    # LLM Prediction
    print("Analyzing with LLM...")
    llm_pred_num, llm_conf, llm_sentiment = classify_few_shot(review_text)

    # Display results
    print("\n" + "="*80)
    print(f"YOUR REVIEW:")
    print(f"'{review_text}'")
    print("="*80)

    print(f"\nTraditional ML ({best_model_name}):")
    print(f"   Prediction: {ml_sentiment}")
    print(f"   Confidence: {ml_confidence:.2%}")
    print(f"\n   Probabilities:")
    for i, name in enumerate(sentiment_names):
        bar = '█' * int(ml_proba[i] * 20)
        print(f"      {name:8s}: {ml_proba[i]:.2%} {bar}")

    print(f"\nFew-Shot LLM (GPT-4o-mini):")
    print(f"   Prediction: {llm_sentiment}")
    print(f"   Confidence: {llm_conf:.2%}")

    if ml_sentiment == llm_sentiment:
        print(f"\nBOTH MODELS AGREE: {ml_sentiment}")
    else:
        print(f"\nMODELS DISAGREE:")
        print(f"   Traditional ML: {ml_sentiment}")
        print(f"   LLM: {llm_sentiment}")

    print("="*80)
    print("\nTry another review above!")

def on_predict_button_clicked(b):
    with output_area:
        clear_output()
        review_text = review_input.value
        predict_user_review(review_text)

def on_clear_button_clicked(b):
    with output_area:
        clear_output()
        print("Results cleared. Enter a new review above!")

predict_button.on_click(on_predict_button_clicked)
clear_button.on_click(on_clear_button_clicked)

# Display interface
print("\nInteractive Review Tester:")
print("1. Type your review in the text box")
print("2. Click 'Predict Sentiment' to see results")
print("3. Click 'Clear' to reset and try another review")
print("\nTry examples like:")
print("   - 'This app is amazing! Love it!'")
print("   - 'Terrible, crashes all the time'")
print("   - 'It's okay, nothing special'")
print("\n")

display(review_input)
display(widgets.HBox([predict_button, clear_button]))
display(output_area)

print("\n" + "="*80)
print("SESSION COMPLETE")
print("="*80)


INTERACTIVE REVIEW TESTING

Interactive Review Tester:
1. Type your review in the text box
2. Click 'Predict Sentiment' to see results
3. Click 'Clear' to reset and try another review

Try examples like:
   - 'This app is amazing! Love it!'
   - 'Terrible, crashes all the time'
   - 'It's okay, nothing special'




Textarea(value='', description='Review:', layout=Layout(height='100px', width='80%'), placeholder='Type your a…

HBox(children=(Button(button_style='success', description='Predict Sentiment', icon='check', style=ButtonStyle…

Output()


SESSION COMPLETE
