In [None]:

from google.colab import files
import pandas as pd
import numpy as np
import nltk
import string
import re
import scipy.sparse
import pickle
import time

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
print("Upload your kaggle.json file:")
files.upload()

!mv "kaggle.json" kaggle.json
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle --version

Upload your kaggle.json file:


Saving kaggle.json to kaggle.json
mv: 'kaggle.json' and 'kaggle.json' are the same file
Kaggle API 1.7.4.5


In [None]:
!kaggle datasets download -d kritanjalijain/amazon-reviews -p /content
!unzip /content/amazon-reviews.zip -d /content/amazon_reviews

Dataset URL: https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews
License(s): CC0-1.0
Downloading amazon-reviews.zip to /content
100% 1.29G/1.29G [00:16<00:00, 84.5MB/s]
100% 1.29G/1.29G [00:16<00:00, 84.6MB/s]
Archive:  /content/amazon-reviews.zip
  inflating: /content/amazon_reviews/amazon_review_polarity_csv.tgz  
  inflating: /content/amazon_reviews/test.csv  
  inflating: /content/amazon_reviews/train.csv  


In [None]:
def _load_reviews(csv_path):
    """Read one headerless review CSV and label its three columns.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A DataFrame whose columns are named 'label', 'title' and 'text'.
        Lines the parser cannot handle are skipped (on_bad_lines='skip').
    """
    frame = pd.read_csv(csv_path,
                        encoding='utf-8',
                        engine='python',
                        on_bad_lines='skip',
                        header=None)
    frame.columns = ['label', 'title', 'text']
    return frame


try:
    df_train = _load_reviews('/content/amazon_reviews/train.csv')
    df_test = _load_reviews('/content/amazon_reviews/test.csv')
except Exception as e:
    # Report the problem, then re-raise: every later cell needs these frames,
    # so continuing would only surface as a confusing NameError (or silently
    # reuse frames left over from an earlier run).
    print(f"Error loading data: {e}")
    raise

print(f"Training data size: {len(df_train):,} rows")
print(f"Testing data size: {len(df_test):,} rows")
print("\nSample of the data:")
print(df_train.head())

Training data size: 3,600,000 rows
Testing data size: 400,000 rows

Sample of the data:
   label                                              title  \
0      2                     Stuning even for the non-gamer   
1      2              The best soundtrack ever to anything.   
2      2                                           Amazing!   
3      2                               Excellent Soundtrack   
4      2  Remember, Pull Your Jaw Off The Floor After He...   

                                                text  
0  This sound track was beautiful! It paints the ...  
1  I'm reading a lot of reviews saying that this ...  
2  This soundtrack is my favorite music of all ti...  
3  I truly like this soundtrack and I enjoy video...  
4  If you've played the game, you know how divine...  


In [None]:
# Remove rows with any missing value from both frames (in place).
for frame in (df_train, df_test):
    frame.dropna(inplace=True)

sample_size_train = 500000
sample_size_test = 50000

# Never ask for more rows than are left after dropping incomplete ones.
n_train_rows = min(sample_size_train, len(df_train))
n_test_rows = min(sample_size_test, len(df_test))

# Fixed seed so the same rows are drawn on every run.
df_train_sample = df_train.sample(n=n_train_rows, random_state=42).copy()
df_test_sample = df_test.sample(n=n_test_rows, random_state=42).copy()

print(f"Training sample size: {len(df_train_sample):,} rows")
print(f"Testing sample size: {len(df_test_sample):,} rows\n")


Training sample size: 500,000 rows
Testing sample size: 50,000 rows



In [None]:
# Each entry: (resource path to probe, message to show, packages to fetch
# when the probe fails).
required_corpora = [
    ('corpora/stopwords', "Downloading stopwords...", ['stopwords']),
    ('corpora/wordnet', "Downloading wordnet...", ['wordnet', 'omw-1.4']),
]

for resource_path, notice, packages in required_corpora:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Resource is not installed locally: announce and download it.
        print(notice)
        for package in packages:
            nltk.download(package)

# Shared helpers used by the text-cleaning function.
stop_words = set(stopwords.words('english'))
punctuation_table = str.maketrans('', '', string.punctuation)  # deletes ASCII punctuation
lemmatizer = WordNetLemmatizer()

print("Cleaning tools prepared\n")


Downloading stopwords...
Downloading wordnet...
Cleaning tools prepared



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# NOTE: this cell repeats the sampling step from the earlier cell; with the
# same seed and sizes it reproduces the same samples.
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

sample_size_train = 500000
sample_size_test = 50000

# Cap the request at the number of available rows, as the earlier sampling
# cell does: DataFrame.sample raises ValueError when n exceeds the frame
# length and replace is False.
df_train_sample = df_train.sample(n=min(sample_size_train, len(df_train)),
                                   random_state=42).copy()
df_test_sample = df_test.sample(n=min(sample_size_test, len(df_test)),
                                 random_state=42).copy()

print(f"Training sample size: {len(df_train_sample):,} rows")
print(f"Testing sample size: {len(df_test_sample):,} rows\n")


Training sample size: 500,000 rows
Testing sample size: 50,000 rows



In [None]:
def clean_text_advanced(text):
    """Normalize a raw review into space-separated, lemmatized tokens.

    Processing steps: lowercase, delete ASCII punctuation (via the
    module-level ``punctuation_table``), delete digit runs, collapse
    whitespace, drop English stop words and tokens of length <= 2, then
    lemmatize what is left with the module-level ``lemmatizer``.

    Args:
        text: The raw review. Anything that is not a ``str`` (None, NaN,
            numbers, lists, arrays, ...) is treated as missing.

    Returns:
        str: "" for non-string input; the cleaned text otherwise; the
        placeholder "unknown" when no token survives filtering or when an
        unexpected error occurs while cleaning.
    """
    # A plain type check also covers None/NaN (neither is a str) and, unlike
    # pd.isna, cannot raise an ambiguous-truth-value error on list or array
    # input, so non-strings consistently map to "".
    if not isinstance(text, str):
        return ""

    try:
        text = text.lower()
        text = text.translate(punctuation_table)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'\s+', ' ', text).strip()

        filtered_words = [
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in stop_words and len(word) > 2
        ]

        result = " ".join(filtered_words)
        return result if result else "unknown"

    except Exception as e:
        # Best effort: report the failure and return the placeholder so one
        # bad row does not abort a whole DataFrame.apply pass.
        print(f"Error cleaning text: {e}")
        return "unknown"


In [None]:
# Join title and body into one text field on both samples.
for frame in (df_train_sample, df_test_sample):
    frame['full_review'] = frame['title'].astype(str) + ' ' + frame['text'].astype(str)

# Run the cleaner over each sample, announcing each pass first.
for notice, frame in (("Cleaning training data...", df_train_sample),
                      ("Cleaning testing data...", df_test_sample)):
    print(notice)
    frame['clean_review'] = frame['full_review'].apply(clean_text_advanced)

# Keep only rows whose cleaned text is longer than 5 characters.
train_keep = df_train_sample['clean_review'].str.len() > 5
test_keep = df_test_sample['clean_review'].str.len() > 5
df_train_sample = df_train_sample[train_keep].copy()
df_test_sample = df_test_sample[test_keep].copy()

print("Data cleaned successfully")
print(f"Training size after cleaning: {len(df_train_sample):,}")
print(f"Testing size after cleaning: {len(df_test_sample):,}")

# Show the first training review before and after cleaning (first 100 chars).
first_row = df_train_sample.iloc[0]
print("\nExample of cleaning:")
print("Before:", first_row['full_review'][:100])
print("After:", first_row['clean_review'][:100])


Cleaning training data...
Cleaning testing data...
Data cleaned successfully
Training size after cleaning: 499,999
Testing size after cleaning: 50,000

Example of cleaning:
Before: Wings Over The Pacific A very poor movie. There were no Germany squardons in the Pacific. Poor actin
After: wing pacific poor movie germany squardons pacific poor acting setting story message


In [None]:
# Unigram + bigram TF-IDF settings, collected in one place.
vectorizer_settings = {
    'max_features': 50000,
    'ngram_range': (1, 2),
    'min_df': 3,
    'max_df': 0.9,
    'sublinear_tf': True,
}
tfidf = TfidfVectorizer(**vectorizer_settings)

# Model inputs (cleaned text) and targets (labels) for each split.
X_train, y_train = df_train_sample['clean_review'], df_train_sample['label']
X_test, y_test = df_test_sample['clean_review'], df_test_sample['label']

print("Training TF-IDF...")
# The vocabulary is learned from the training split only; the test split is
# transformed with that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Training shape: {X_train_tfidf.shape}")
print(f"Testing shape: {X_test_tfidf.shape}\n")


Training TF-IDF...
Training shape: (499999, 50000)
Testing shape: (50000, 50000)



In [None]:
# Settings shared by both logistic-regression candidates.
logreg_common = dict(solver='saga', max_iter=1000, random_state=42, n_jobs=-1)

# Candidate classifiers keyed by the display name used in reports.
models = {
    'Logistic Regression (C=2.0)': LogisticRegression(C=2.0, **logreg_common),
    'Logistic Regression (C=5.0)': LogisticRegression(C=5.0, **logreg_common),
    'Linear SVC (C=1.0)': LinearSVC(C=1.0, max_iter=2000, random_state=42),
    'Naive Bayes (alpha=0.1)': MultinomialNB(alpha=0.1),
}

# name -> {'accuracy', 'duration', 'model', 'predictions'} for each model
# that trained without error.
results = {}
separator = '=' * 60

# NOTE(review): accuracy here is measured on the test split, and the ranking
# cell picks the best model from these numbers, so the reported best score is
# optimistically biased. Consider a separate validation split.
for name, model in models.items():
    print(f"\n{separator}")
    print(f"Training: {name}")
    print(separator)

    try:
        # The timed window covers fit, predict and scoring.
        start_time = time.time()
        y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        duration = time.time() - start_time
    except Exception as e:
        # A failing candidate is reported and skipped; the others still run.
        print(f"Error training {name}: {e}")
    else:
        results[name] = {
            'accuracy': accuracy,
            'duration': duration,
            'model': model,
            'predictions': y_pred
        }

        print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
        print(f"Time: {duration:.2f} seconds")



Training: Logistic Regression (C=2.0)
Accuracy: 0.9113 (91.13%)
Time: 22.13 seconds

Training: Logistic Regression (C=5.0)
Accuracy: 0.9107 (91.07%)
Time: 23.42 seconds

Training: Linear SVC (C=1.0)
Accuracy: 0.9076 (90.76%)
Time: 20.40 seconds

Training: Naive Bayes (alpha=0.1)
Accuracy: 0.8791 (87.91%)
Time: 0.17 seconds


In [None]:
# Order the (name, info) pairs from highest to lowest accuracy.
sorted_results = sorted(
    results.items(),
    key=lambda entry: entry[1]['accuracy'],
    reverse=True,
)

print("\nModel ranking:")
print("-" * 60)
for rank, (model_name, details) in enumerate(sorted_results, start=1):
    score = details['accuracy']
    print(f"{rank}. {model_name:40s}: {score:.4f} ({score*100:.2f}%)")

# The top entry of the ranking is the best single model.
best_model_name, best_model_info = sorted_results[0]
best_model = best_model_info['model']

print(f"\nBest model: {best_model_name}")
print(f"Accuracy: {best_model_info['accuracy']:.4f}")



Model ranking:
------------------------------------------------------------
1. Logistic Regression (C=2.0)             : 0.9113 (91.13%)
2. Logistic Regression (C=5.0)             : 0.9107 (91.07%)
3. Linear SVC (C=1.0)                      : 0.9076 (90.76%)
4. Naive Bayes (alpha=0.1)                 : 0.8791 (87.91%)

Best model: Logistic Regression (C=2.0)
Accuracy: 0.9113


In [None]:
# Per-class precision / recall / F1 for the best single model's predictions.
print("\n Classification Report:")
best_predictions = best_model_info['predictions']
print(classification_report(y_test, best_predictions))



 Classification Report:
              precision    recall  f1-score   support

           1       0.91      0.91      0.91     25155
           2       0.91      0.91      0.91     24845

    accuracy                           0.91     50000
   macro avg       0.91      0.91      0.91     50000
weighted avg       0.91      0.91      0.91     50000



In [None]:
print("\nTraining Ensemble Model...")

# Stays False unless the ensemble trains and beats the best single model.
ensemble_wins = False
try:
    voters = [
        ('lr', LogisticRegression(C=5.0, solver='saga', max_iter=1000, n_jobs=-1, random_state=42)),
        ('svc', LinearSVC(C=1.0, max_iter=2000, random_state=42)),
        ('nb', MultinomialNB(alpha=0.1)),
    ]
    # Hard voting: the predicted label is the majority vote of the members.
    ensemble = VotingClassifier(estimators=voters, voting='hard', n_jobs=-1)

    ensemble.fit(X_train_tfidf, y_train)
    y_pred_ensemble = ensemble.predict(X_test_tfidf)
    ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)

    print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")

    ensemble_wins = ensemble_accuracy > best_model_info['accuracy']
    if ensemble_wins:
        print("Ensemble is better!")
    else:
        print("Single model is better")

except Exception as e:
    # Any failure falls back to the best single model.
    print(f"Error in Ensemble: {e}")
    ensemble_wins = False

# Record which model is carried forward to saving and prediction.
if ensemble_wins:
    final_model = ensemble
    final_accuracy = ensemble_accuracy
    final_model_name = "Ensemble Model"
else:
    final_model = best_model
    final_accuracy = best_model_info['accuracy']
    final_model_name = best_model_name



Training Ensemble Model...
Ensemble Accuracy: 0.9103
Single model is better


In [None]:
# Persist the model, vectorizer, feature matrices and labels to the working
# directory. Any failure is reported rather than raised.
try:
    pickled_artifacts = (
        ('final_sentiment_model.pkl', final_model, "Model saved successfully"),
        ('final_tfidf_vectorizer.pkl', tfidf, "Vectorizer saved successfully"),
    )
    for filename, artifact, notice in pickled_artifacts:
        with open(filename, 'wb') as f:
            pickle.dump(artifact, f)
        print(notice)

    # Sparse TF-IDF matrices in SciPy's .npz format.
    for filename, matrix in (('X_train_tfidf_final.npz', X_train_tfidf),
                             ('X_test_tfidf_final.npz', X_test_tfidf)):
        scipy.sparse.save_npz(filename, matrix)

    # Labels as single-column CSV files with a 'label' header and no index.
    for filename, labels in (('y_train_final.csv', y_train),
                             ('y_test_final.csv', y_test)):
        labels.to_csv(filename, index=False, header=['label'])

    print("All files saved successfully")

except Exception as e:
    print(f"Error saving files: {e}")


Model saved successfully
Vectorizer saved successfully
All files saved successfully


In [None]:
# Offer the saved model and vectorizer as browser downloads (Colab helper).
try:
    for artifact_name in ('final_sentiment_model.pkl', 'final_tfidf_vectorizer.pkl'):
        files.download(artifact_name)
except Exception as e:
    print(f"Cannot download files (may not work outside Colab): {e}")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Recap of dataset sizes, feature count and the chosen model.
rule = "=" * 60
n_features = X_train_tfidf.shape[1]

print("\n" + rule)
print("Final Summary")
print(rule)
print(f"Training size: {len(df_train_sample):,}")
print(f"Testing size: {len(df_test_sample):,}")
print(f"Number of features: {n_features:,}")
print(f"Final model: {final_model_name}")
print(f"Final accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
print(rule)
print("Project completed!")



Final Summary
Training size: 499,999
Testing size: 50,000
Number of features: 50,000
Final model: Logistic Regression (C=2.0)
Final accuracy: 0.9113 (91.13%)
Project completed!


In [None]:
def predict_sentiment(review_text):
    """Classify one raw review as "Positive" or "Negative".

    The text is cleaned with ``clean_text_advanced``, vectorized with the
    fitted ``tfidf`` and scored by ``final_model``. A predicted label of 2
    maps to "Positive"; any other label maps to "Negative".

    Args:
        review_text: The raw review text.

    Returns:
        str: "Positive", "Negative", or "Error: <message>" if any step
        raises an exception.
    """
    try:
        cleaned_text = clean_text_advanced(review_text)
        features = tfidf.transform([cleaned_text])
        predicted_label = final_model.predict(features)[0]
        if predicted_label == 2:
            return "Positive"
        return "Negative"
    except Exception as e:
        return f"Error: {e}"

# Smoke-test the predictor on a few hand-written reviews.
test_reviews = [
    "This product is absolutely amazing! I love it!",
    "Terrible quality, waste of money.",
    "It's okay, nothing special."
]

print("\nTest:")
for review in test_reviews:
    sentiment = predict_sentiment(review)
    # Show at most 50 characters; add an ellipsis only when the review was
    # actually cut, so short reviews are not shown as truncated.
    preview = review if len(review) <= 50 else f"{review[:50]}..."
    print(f" {preview}")
    print(f"   → {sentiment}\n")




Test:
 This product is absolutely amazing! I love it!...
   → Positive

 Terrible quality, waste of money....
   → Negative

 It's okay, nothing special....
   → Negative

