Import Library/ies

In [160]:
import json
import joblib
import pandas as pd
import pickle
import re
import scipy.sparse as sp

In [134]:
def calculate_repetition_features(text):
    if not isinstance(text, str):
        text = ""  # Replace non-string or NaN values with an empty string
    # Count word repetitions
    words = text.split()
    word_count = {word: words.count(word) for word in set(words)}
    word_repetition = sum(count for count in word_count.values() if count > 1)

    # Count letter repetitions
    letter_repetition = len(re.findall(r'(.)\1{2,}', text))  # Counts letters repeated 3 or more times

    return pd.Series([word_repetition, letter_repetition])

Load the model and vectorizer

In [137]:
loaded_model = joblib.load('sentiment_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

Load the JSON file

In [140]:
df = pd.read_json('reviews.json')

Convert to CSV file

In [143]:
# Save as CSV
df.to_csv('reviews.csv', index=False)

Load CSV data

In [146]:
csv_file_path = 'reviews.csv'
data = pd.read_csv(csv_file_path)

In [128]:
# Assuming 'review' column contains the text
X_new = data['review']

In [150]:
# Ensure all reviews are strings and handle missing values
X_new = data['review'].fillna("").astype(str)

In [152]:
# Extract repetition features
repetition_features_new = X_new.apply(calculate_repetition_features)
repetition_features_new.columns = ['word_repetition', 'letter_repetition']

In [156]:
# Transform the text using the loaded vectorizer
X_tfidf_new = loaded_vectorizer.transform(X_new)

In [162]:
# Combine TF-IDF with repetition features
X_combined_new = sp.hstack([X_tfidf_new, sp.csr_matrix(repetition_features_new.values)])

In [166]:
# Predict the sentiments
predictions = loaded_model.predict(X_combined_new)

In [168]:
# Add predictions to the dataset
data['predicted_sentiment'] = predictions

# Save the results to a CSV
data.to_csv('datasetb_withpredictions.csv', index=False)

print("Predictions saved to 'predicted_sentiments.csv'")

Predictions saved to 'predicted_sentiments.csv'


In [6]:
import pandas as pd
import json

df = pd.read_json('reviews.json')

df.to_csv('new_reviews.csv', index=False)

# Load the new dataset
new_dataset_path = 'new_reviews.csv'  # Update with your file path
new_data = pd.read_csv(new_dataset_path)

# Assuming the dataset has a 'review' column
reviews = new_data['review']

In [10]:
import joblib

# Load the saved model and vectorizer
loaded_model = joblib.load('improved_logistic_regression_model.pkl')
loaded_vectorizer = joblib.load('new_tfidf_vectorizer.pkl')

print("Model and vectorizer loaded successfully!")

Model and vectorizer loaded successfully!


In [16]:
# Replace NaN values in reviews with a placeholder
new_data['review'] = new_data['review'].fillna('neutral')

In [18]:
# Transform the new reviews
reviews_tfidf = loaded_vectorizer.transform(new_data['review'])

In [20]:
# Predict sentiments
predicted_sentiments = loaded_model.predict(reviews_tfidf)

In [22]:
# Add predictions to the dataset
new_data['predicted_sentiment'] = predicted_sentiments

In [24]:
# Map numeric labels to sentiment names
sentiment_mapping = {
    1: 'Negative',
    2: 'Neutral',
    3: 'Positive',
    4: 'Mixed'
}
new_data['sentiment_label'] = new_data['predicted_sentiment'].map(sentiment_mapping)

# Save the dataset with predictions
new_data.to_csv('new_dataset_with_predictions.csv', index=False)
print("Predictions saved to new_dataset_with_predictions.csv")

Predictions saved to new_dataset_with_predictions.csv


In [40]:
# Create a mapping of string labels to integer labels
sentiment_mapping = {
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Mixed': 4
}

# Apply this mapping to the true sentiment column
new_data['sentiment_label'] = new_data['sentiment_label'].map(sentiment_mapping)

# Now, the true sentiment labels are integers
from sklearn.metrics import classification_report

true_labels = new_data['sentiment_label']
predicted_labels = new_data['predicted_sentiment']

# Generate the classification report
report = classification_report(true_labels, predicted_labels, target_names=['Negative', 'Neutral', 'Positive', 'Mixed'])
print(report)


              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00       327
     Neutral       1.00      1.00      1.00        97
    Positive       1.00      1.00      1.00       294
       Mixed       1.00      1.00      1.00       283

    accuracy                           1.00      1001
   macro avg       1.00      1.00      1.00      1001
weighted avg       1.00      1.00      1.00      1001



In [44]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(true_labels, predicted_labels)
print(cm)

[[327   0   0   0]
 [  0  97   0   0]
 [  0   0 294   0]
 [  0   0   0 283]]
