In [1]:
# Step 1: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
import joblib

In [2]:
# Step 2: Load the dataset
df = pd.read_csv('data/test.csv',header=None)
df.columns = ['polarity', 'title', 'text']  # adjust if you have 3 columns


# Combine 'title' and 'text' for better feature representation
df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')

# Step 2.1: Ensure polarity labels are integers
df['polarity'] = df['polarity'].astype(int)

# Step 3: Features and target
X = df['content']
y = df['polarity']

# Step 4: Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:

# Step 5: Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 6: Train Logistic Regression model
log_model = LogisticRegression(max_iter=1000)
log_model .fit(X_train_vec, y_train)


y_pred = log_model .predict(X_test_vec)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Logistic Regression Accuracy: 0.909
Classification Report:
               precision    recall  f1-score   support

           1       0.91      0.91      0.91     39896
           2       0.91      0.91      0.91     40104

    accuracy                           0.91     80000
   macro avg       0.91      0.91      0.91     80000
weighted avg       0.91      0.91      0.91     80000



In [6]:
# ===============================
# Step 9: Predict Sentiment
# ===============================

# Example: Predict on multiple new reviews
new_reviews = [
    "My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT.",
    "Batteries died within a year. Might as well just get alkaline disposables.",
    "Great for the non-audiophile. Easy to setup and works well.",
    "DVD Player crapped out after one year."
]

for review in new_reviews:
    # Transform using the fitted vectorizer
    review_vec = vectorizer.transform([review])
    
    # Predict sentiment (1 = positive, 0 = negative)
    pred = log_model.predict(review_vec)[0]
    
    # Print result
    sentiment = "✅ Positive Review" if pred == 1 else "❌ Negative Review"
    print(f"\nReview: {review}\nPrediction: {sentiment}")



Review: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT.
Prediction: ❌ Negative Review

Review: Batteries died within a year. Might as well just get alkaline disposables.
Prediction: ✅ Positive Review

Review: Great for the non-audiophile. Easy to setup and works well.
Prediction: ❌ Negative Review

Review: DVD Player crapped out after one year.
Prediction: ✅ Positive Review


In [5]:
# ===============================
# Predict Sentiment on New Review
# ===============================

# Example single review
new_review = input("Enter a review to predict sentiment: ")

# Convert the review to TF-IDF features
new_review_vec = vectorizer.transform([new_review])

# Predict sentiment
prediction = log_model.predict(new_review_vec)[0]

# Show result
if prediction == 1:
    print("✅ Sentiment: Positive Review")
else:
    print("❌ Sentiment: Negative Review")


✅ Sentiment: Positive Review


In [12]:
# --------- Train Naive Bayes ---------
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

y_pred_nb = nb_model.predict(X_test_vec)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))




Naive Bayes Accuracy: 0.8696
              precision    recall  f1-score   support

           1       0.87      0.87      0.87     39896
           2       0.87      0.87      0.87     40104

    accuracy                           0.87     80000
   macro avg       0.87      0.87      0.87     80000
weighted avg       0.87      0.87      0.87     80000



In [13]:
base_svm = LinearSVC(max_iter=10000)                # Fast linear SVM
svm_model = CalibratedClassifierCV(base_svm)        # Adds probability support

# Fit the model
svm_model.fit(X_train_vec, y_train)

# Predict on test data
y_pred_svm = svm_model.predict(X_test_vec)

# Evaluate accuracy
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

# Optional: classification report
print(classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.909525
              precision    recall  f1-score   support

           1       0.91      0.91      0.91     39896
           2       0.91      0.91      0.91     40104

    accuracy                           0.91     80000
   macro avg       0.91      0.91      0.91     80000
weighted avg       0.91      0.91      0.91     80000



In [14]:
rf_model = RandomForestClassifier(
    n_estimators=100,       # Number of trees
    max_depth=20,           # Limit depth to reduce training time
    min_samples_split=10,   # Minimum samples required to split a node
    n_jobs=-1,              # Use all CPU cores for parallel training
    random_state=42
)

# Fit the model
rf_model.fit(X_train_vec, y_train)

# Predict on test data
y_pred_rf = rf_model.predict(X_test_vec)

# Evaluate accuracy
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

# Optional: classification report
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.838075
              precision    recall  f1-score   support

           1       0.85      0.82      0.83     39896
           2       0.83      0.86      0.84     40104

    accuracy                           0.84     80000
   macro avg       0.84      0.84      0.84     80000
weighted avg       0.84      0.84      0.84     80000



In [15]:
# Step 8: Save model and vectorizer for Flask
joblib.dump(log_model, 'logistic_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(nb_model, 'naive_bayes_model.pkl')
joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(rf_model, 'rf_model.pkl')
print("Models and vectorizer saved successfully!")

Models and vectorizer saved successfully!
