# Emoji-Aware Sentiment Analysis Model Training

In [1]:
!pip install emoji demoji pandas scikit-learn nltk joblib



In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
import emoji
import joblib
import warnings

nltk.download('stopwords')
nltk.download('wordnet')
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noahd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\noahd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load emoji sentiment lexicon
EMOJI_SENTIMENT = {
    "😊": 1, "😄": 1, "😃": 1, "❤️": 1, "👍": 1, "🎉": 1, "🌟": 1, "😍": 1,
    "😢": -1, "😭": -1, "😡": -1, "👎": -1, "😠": -1, "😞": -1, "💔": -1,
    "🤔": 0, "😐": 0, "📝": 0, "🎬": 0, "🎥": 0
}

In [4]:
def extract_emoji_features(text):
    """Extract emoji features including sentiment"""
    emoji_count = 0
    emoji_sentiment_sum = 0
    emoji_descriptions = []

    for char in text:
        if char in emoji.EMOJI_DATA:
            emoji_count += 1
            sentiment = EMOJI_SENTIMENT.get(char, 0)
            emoji_sentiment_sum += sentiment
            description = emoji.EMOJI_DATA[char]['en'].replace(':', '').replace('_', ' ')
            emoji_descriptions.append(f"{char} {description} ({sentiment})")

    avg_sentiment = emoji_sentiment_sum / emoji_count if emoji_count > 0 else 0
    return {
        'emoji_count': emoji_count,
        'avg_sentiment': avg_sentiment,
        'descriptions': ' '.join(emoji_descriptions)
    }

In [5]:
def preprocess_text(text):
    """Preprocess text while preserving emoji information"""
    # Extract emoji features
    emoji_features = extract_emoji_features(text)
    
    # Clean text
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    
    # Preserve emojis while cleaning
    cleaned_text = ''.join([char for char in text if char.isalnum() or char.isspace() or char in emoji.EMOJI_DATA])
    
    # Combine with emoji descriptions
    return f"{cleaned_text.lower()} {emoji_features['descriptions']}".strip()

In [None]:
# Load your dataset
df = pd.read_csv('../assets/datasets/IMDb_Dataset.csv')

# Preprocess the data
df['processed_review'] = df['review'].apply(preprocess_text)
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_review'],
    df['label'],
    test_size=0.2,
    random_state=42
)

In [8]:
# Create and train the vectorizer
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [9]:
# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.8979

Classification Report:

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [10]:
# Save the model and vectorizer
joblib.dump(model, 'emoji_sentiment_model.joblib')
joblib.dump(tfidf, 'emoji_tfidf_vectorizer.joblib')
joblib.dump(EMOJI_SENTIMENT, 'emoji_sentiment_dict.joblib')

['emoji_sentiment_dict.joblib']

In [11]:
# Test with some emoji-containing text
test_texts = [
    "This movie was amazing! 😊 Loved every minute of it! 🎬 ❤️",
    "Terrible movie 😠 Complete waste of time 👎",
    "Not sure how to feel about this one 🤔"
]

for text in test_texts:
    processed = preprocess_text(text)
    vector = tfidf.transform([processed])
    prediction = model.predict(vector)[0]
    probability = model.predict_proba(vector)[0]
    
    print(f"\nOriginal: {text}")
    print(f"Processed: {processed}")
    print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'}")
    print(f"Confidence: {max(probability):.2f}")


Original: This movie was amazing! 😊 Loved every minute of it! 🎬 ❤️
Processed: this movie was amazing 😊 loved every minute of it 🎬 ❤ 😊 smiling face with smiling eyes (1) 🎬 clapper board (0) ❤ red heart (0)
Prediction: Positive
Confidence: 0.91

Original: Terrible movie 😠 Complete waste of time 👎
Processed: terrible movie 😠 complete waste of time 👎 😠 angry face (-1) 👎 thumbs down (-1)
Prediction: Negative
Confidence: 1.00

Original: Not sure how to feel about this one 🤔
Processed: not sure how to feel about this one 🤔 🤔 thinking face (0)
Prediction: Negative
Confidence: 0.59
