#  Loading some Required Libraries

In [29]:
import re
import joblib
import warnings


import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Ignore all warnings
warnings.filterwarnings("ignore")

# Step 1: Load Data

In [11]:
# Load the data
data = pd.read_csv('emotion_prediction_dataset.csv')

In [12]:
data.sample(10)

Unnamed: 0,Tweet,Emotion
48,Mate the thing I get excited about in my profe...,joy
82,will brawndo cure my depression? @MikeJudge #I...,sadness
85,Texans and Astros both shut out tonight. Houst...,sadness
14,"@Montel_Williams If this ban goes through, it ...",disgust
94,@janhopis I found the first few episodes of Bo...,trust
99,"@ProSyndicate thanks for replying, I'm ironing...",trust
6,tomorrow will be the last episode of despair a...,anticipation
42,Thank you disney themed episode for letting me...,joy
98,Have any of you ever stayed in hostels oversea...,trust
2,Im so serious about putting words in my mouth ...,anger


In [13]:
data.shape

(100, 2)

In [45]:
data.columns

Index(['Tweet', 'Emotion'], dtype='object')

In [17]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Tweet    100 non-null    object
 1   Emotion  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB
None


### Above you can see data is cleaned no NULL value in both columns.

# Step 2: Data Cleaning Functions

We'll create three separate data cleaning functions to preprocess the ``'Tweet'`` column. <br> These functions will handle different aspects of text cleaning: ``removing symbols and numbers``, converting text to ``lowercase``, and removing ``stopwords``. <br>We will also create a clean_text function that applies all three cleaning steps for simplicity.

## Function 1: remove_symbols_numbers

This function removes all ``non-alphabetic`` characters (symbols and numbers) from the input text, retaining only alphabetic characters and spaces.

In [18]:
def remove_symbols_numbers(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)

## Function 2: to_lowercase

This function converts all characters in the input text to ``lowercase``.

In [19]:
def to_lowercase(text):
    return text.lower()

## Function 3: remove_stopwords

This function removes common stopwords from the input text. Stopwords are frequently used words in a language that carry little meaning ``(e.g., "and", "the", "is")``.

In [20]:
def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return ' '.join(words)

## Combined Function: clean_text

This function applies all three cleaning steps to the input text.

In [21]:
def clean_text(text):
    text = remove_symbols_numbers(text)
    text = to_lowercase(text)
    text = remove_stopwords(text)
    return text

## Applying the Cleaning Functions

We will apply the clean_text function to the ``'Tweet'`` column of the dataset.

In [22]:
# Apply the cleaning function to the 'description' column
data['Tweet'] = data['Tweet'].apply(clean_text)

# Step 3: Vectorization

After cleaning the text data, we convert the ``'Tweet'`` column into numerical features using ``TF-IDF(Term Frequency-Inverse Document Frequency`` vectorization. This step transforms the text data into a format suitable for machine learning models.

## TF-IDF Vectorization

To convert text data into numerical features by calculating the ``TF-IDF`` score for each word in the text. TF-IDF helps in giving importance to words that are frequent in a document but not across all documents.

In [23]:
vectorizer = TfidfVectorizer()

# Apply TF-IDF on 'description' column
X = vectorizer.fit_transform(data['Tweet'])

# The target variable
y = data['Emotion']

 The result is a sparse matrix X where each row represents a description and each column represents a unique word in the corpus. The values in the matrix are the TF-IDF scores of the words.

# Step 4: Model Training and Evaluation

We train and evaluate a machine learning model to predict the gender based on the ``TF-IDF`` vectors. Here, we use ``Logistic Regression`` as an example.

## Train-Test Split

To split the data into training and testing sets. The ``training set`` is used to train the model, and the ``testing set`` is used to evaluate the model's performance on unseen data.

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Model Training

To train a machine learning model on the training data.

### Logistic Regression model

In [33]:
model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Step 6: Model Evaluation

To evaluate the performance.

In [34]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Logistic Regression Accuracy: 0.15
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         2
anticipation       0.00      0.00      0.00         2
     disgust       0.50      0.50      0.50         2
        fear       0.00      0.00      0.00         3
         joy       0.18      0.67      0.29         3
        love       0.00      0.00      0.00         1
   pessimism       0.00      0.00      0.00         2
     sadness       0.00      0.00      0.00         4
    surprise       0.00      0.00      0.00         1
       trust       0.00      0.00      0.00         0

    accuracy                           0.15        20
   macro avg       0.07      0.12      0.08        20
weighted avg       0.08      0.15      0.09        20



## Applying different models and testing who is performing better

# Naive Bayes model

In [35]:
nb_model = MultinomialNB()


nb_model.fit(X_train, y_train)

y_pred_nb = nb_model.predict(X_test)

# Evaluate the model
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.2
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         2
anticipation       0.00      0.00      0.00         2
     disgust       0.50      0.50      0.50         2
        fear       0.18      0.67      0.29         3
         joy       0.25      0.33      0.29         3
        love       0.00      0.00      0.00         1
   pessimism       0.00      0.00      0.00         2
     sadness       0.00      0.00      0.00         4
    surprise       0.00      0.00      0.00         1
       trust       0.00      0.00      0.00         0

    accuracy                           0.20        20
   macro avg       0.09      0.15      0.11        20
weighted avg       0.11      0.20      0.14        20



# Decision Tree model

In [36]:
dt_model = DecisionTreeClassifier()

# Train the model
dt_model.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 0.3
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         2
anticipation       0.00      0.00      0.00         2
     disgust       0.33      0.50      0.40         2
        fear       0.00      0.00      0.00         3
         joy       0.17      0.67      0.27         3
        love       1.00      1.00      1.00         1
   pessimism       0.67      1.00      0.80         2
     sadness       0.00      0.00      0.00         4
    surprise       0.00      0.00      0.00         1

    accuracy                           0.30        20
   macro avg       0.24      0.35      0.27        20
weighted avg       0.17      0.30      0.21        20



# Random Forest model

In [37]:
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.2
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         2
anticipation       0.00      0.00      0.00         2
     disgust       0.15      1.00      0.27         2
        fear       0.00      0.00      0.00         3
         joy       0.50      0.33      0.40         3
        love       0.20      1.00      0.33         1
   pessimism       0.00      0.00      0.00         2
     sadness       0.00      0.00      0.00         4
    surprise       0.00      0.00      0.00         1

    accuracy                           0.20        20
   macro avg       0.09      0.26      0.11        20
weighted avg       0.10      0.20      0.10        20



 # SVM model

In [38]:
svm_model = SVC()

# Train the model
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.2
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         2
anticipation       0.00      0.00      0.00         2
     disgust       1.00      0.50      0.67         2
        fear       0.00      0.00      0.00         3
         joy       0.16      1.00      0.27         3
        love       0.00      0.00      0.00         1
   pessimism       0.00      0.00      0.00         2
     sadness       0.00      0.00      0.00         4
    surprise       0.00      0.00      0.00         1

    accuracy                           0.20        20
   macro avg       0.13      0.17      0.10        20
weighted avg       0.12      0.20      0.11        20



# KNN (K Nearest Neighbors) model

In [39]:
knn_model = KNeighborsClassifier()

# Train the model
knn_model.fit(X_train, y_train)

# Predict on the test set
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


KNN Accuracy: 0.1
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         2
anticipation       0.00      0.00      0.00         2
     disgust       0.50      0.50      0.50         2
        fear       0.00      0.00      0.00         3
         joy       0.00      0.00      0.00         3
        love       0.14      1.00      0.25         1
    optimism       0.00      0.00      0.00         0
   pessimism       0.00      0.00      0.00         2
     sadness       0.00      0.00      0.00         4
    surprise       0.00      0.00      0.00         1

    accuracy                           0.10        20
   macro avg       0.06      0.15      0.07        20
weighted avg       0.06      0.10      0.06        20



## We can see in our nature of dataset the Decision Tree model give us high result.
So, this model use for real time prediction.

In [40]:
# Save the best model KNN
joblib.dump(dt_model, 'best_emotion_prediction_model_dt_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

# Step 7: Load the Model and Vectorizer for Real-Time Predictions

Now, let's write the code to load the saved model and vectorizer, and then get real-time predictions based on user input:

In [42]:
import joblib

# Load the trained model
model = joblib.load('best_emotion_prediction_model_dt_model.pkl')

# Load the TF-IDF vectorizer
vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [43]:
def predict_emotion(description):
    # Clean the input description
    cleaned_description = clean_text(description)

    # Transform the input description using the trained TF-IDF vectorizer
    description_tfidf = vectorizer.transform([cleaned_description])

    # Predict gender using the trained model
    prediction = model.predict(description_tfidf)

    return prediction[0]

In [44]:
# Get input description from the user
new_description = input("Enter a Tweet : ")

# Predict the gender
predicted_gender = predict_emotion(new_description)
print("Predicted Emotion:", predicted_gender)


Predicted Emotion: joy


## ---- Happy Learning ----
## ---- Always Keep Smile ----

# ---- Jazakallah Khair ----