In [40]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [41]:
# Sample text data
text_data = [
    "This is a great movie",
    "The movie was excellent",
    "Absolutely fantastic film",
    "Bad movie, not recommended",
    "Terrible film, waste of time",
    "Worst movie ever seen",
    "Amazing film, loved it",
    "Not good, very boring",
    "Good film but a bit slow",
    "Wonderful performance by the cast"
]



In [42]:

# Corresponding labels (1 = positive, 0 = negative)
labels = [1, 1, 1, 0, 0, 0, 1, 0, 1, 1]


In [43]:

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(text_data, labels, test_size=0.2, random_state=0)



In [44]:
# Enhanced TF-IDF pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), min_df=1)),
    ('classifier', MultinomialNB(alpha=0.1))  # Adjust smoothing parameter
])






In [45]:
# Fit the updated pipeline to training data
pipeline.fit(X_train, y_train)




In [46]:
# Predict on the test data
y_pred = pipeline.predict(X_test)



In [47]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Pipeline Test Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))



Pipeline Test Accuracy: 0.50
Confusion Matrix:
[[0 0]
 [1 1]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.50      0.67         2

    accuracy                           0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
# --- Experiment with Support Vector Machine ---
svm_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SVC(kernel='linear'))
])


In [49]:

# Fit the pipeline to the training data
svm_pipeline.fit(X_train, y_train)



In [50]:
# Predict on the test data
y_pred_svm = svm_pipeline.predict(X_test)



In [51]:
# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Pipeline Test Accuracy: {accuracy_svm:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))



SVM Pipeline Test Accuracy: 0.50
Confusion Matrix:
[[0 0]
 [1 1]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.50      0.67         2

    accuracy                           0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
# --- Visualize Feature Importance (TF-IDF Scores) ---
vectorizer = pipeline.named_steps['vectorizer']
feature_names = vectorizer.get_feature_names_out()



In [53]:
# Calculate feature importance using log probabilities
log_probabilities = pipeline.named_steps['classifier'].feature_log_prob_[1]  # Class 1 (positive)
indices = np.argsort(log_probabilities)[::-1]



In [54]:
print("Top 5 Features for Positive Class:")
for i in range(5):
    print(f"{feature_names[indices[i]]}: {log_probabilities[indices[i]]:.2f}")



Top 5 Features for Positive Class:
the: -3.11
movie: -3.29
movie was: -3.47
excellent: -3.47
the movie: -3.47


In [55]:
# --- Moving from Prototype to Production ---
import joblib



In [56]:
# Save the trained pipeline to a file
model_filename = "text_classification_pipeline.pkl"
joblib.dump(pipeline, model_filename)
print(f"Pipeline saved to {model_filename}")



Pipeline saved to text_classification_pipeline.pkl


In [57]:
# Load the pipeline back for prediction
loaded_pipeline = joblib.load(model_filename)



In [58]:
# Predict and evaluate again
new_texts = ["Fantastic movie, would watch again", "Horrible plot, terrible acting"]
predictions = pipeline.predict(new_texts)
for text, label in zip(new_texts, predictions):
    sentiment = "Positive" if label == 1 else "Negative"
    print(f"Text: '{text}' - Sentiment: {sentiment}")

Text: 'Fantastic movie, would watch again' - Sentiment: Negative
Text: 'Horrible plot, terrible acting' - Sentiment: Negative
