# Natural Language Processing for Sentiment Analysis

1. Importing required libraries<br>
    a) nltk<br>
    b) re<br>
    c) numpy<br>
    d) pandas<br>
    e) sklearn<br>
    f) tensorflow<br>

In [None]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2. Load data


In [None]:
# Step: Load the dataset
# The CSV file is read using the Windows-1252 code page.
data = pd.read_csv(
    filepath_or_buffer="hate.csv",
    encoding="windows-1252",
)

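3. Data Preprocessing<br>
    In the `preprocess_text` function, each comment is converted to lowercase, stripped of non-alphabetic characters, tokenized, filtered for stop words, and stemmed. The cleaned comments are then converted to padded integer sequences, and the labels are encoded with `LabelEncoder`.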

In [None]:
# Step: Data Preprocessing

# Matches every character that is neither an ASCII letter nor whitespace,
# so digits and punctuation are removed along with other symbols.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Lazily-filled cache: the stop-word set and the stemmer are built on first
# use and then shared, instead of being rebuilt for every comment.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(stop_words, stemmer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalize one raw comment into a space-separated string of stemmed tokens.

    The text is lowercased, stripped of every character that is not an ASCII
    letter or whitespace, tokenized with ``word_tokenize``, filtered against
    the English stop-word list and stemmed with the Porter stemmer.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            ``None`` or the float NaN that pandas uses for an empty cell) is
            treated as an empty comment.

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces; ``''``
        when nothing is left or when ``text`` is not a string.
    """
    # Missing cells arrive as non-string values; calling .lower() on them
    # would raise, so they are mapped to an empty comment instead.
    if not isinstance(text, str):
        return ''

    text = _NON_LETTER_PATTERN.sub('', text.lower())
    stop_words, stemmer = _get_preprocess_resources()
    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

# Clean every comment and store the result back in the 'comment' column
cleaned_comments = data['comment'].apply(preprocess_text)
data['comment'] = cleaned_comments

# Step : Build the vocabulary from the cleaned comments
# NOTE(review): the vocabulary is fitted on every row, i.e. before the data is
# split into train and test sets -- consider fitting on training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_comments)

# Turn each comment into a list of word indices, then pad/truncate to 100 steps
sequences = tokenizer.texts_to_sequences(cleaned_comments)
X = pad_sequences(sequences, maxlen=100)  # Adjust maxlen as needed

# Encode the labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])

4. Split the dataset

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

5. Defining Model

In [None]:
# Step : Define the TensorFlow model (Embedding -> LSTM -> softmax classifier)
# One output unit per label known to the encoder, so the output layer always
# matches the integer-encoded targets instead of relying on a hard-coded 3.
num_classes = len(label_encoder.classes_)

model = tf.keras.Sequential([
    # Vocabulary size is word_index + 1 because index 0 is not assigned to any word.
    # input_length mirrors the maxlen=100 used when padding the sequences.
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=100),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(num_classes, activation='softmax'),  # class probabilities
])

6. Compile the model

In [None]:
# Configure training: Adam optimizer, cross-entropy over integer class labels,
# and accuracy reported as the tracked metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

7. Start Model Training

In [None]:
# Step : Model Training
# Training for a fixed 100 epochs lets the model keep fitting the training
# data long after validation loss stops improving. Early stopping watches
# the loss on the 10% validation split, halts after 5 epochs without
# improvement and restores the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# epochs=100 is now an upper bound rather than a fixed count.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:

# Step: Evaluation
# Loss and accuracy of the trained model on the held-out test set
evaluation_results = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation_results

# Predicted class = index of the largest softmax output for each sample
class_probabilities = model.predict(X_test)
y_pred = np.argmax(class_probabilities, axis=-1)

# Map the predicted class ids back to the original label values
predicted_labels = label_encoder.inverse_transform(y_pred)



In [None]:
# Display the integer-encoded training labels as the cell's output.
y_train

array([0, 0, 0, ..., 2, 0, 2])

In [None]:
# Calculate evaluation metrics
# average='weighted' averages the per-class scores weighted by class support.
# zero_division=0 states explicitly that a class with an undefined score
# (e.g. never predicted) counts as 0. scikit-learn's default falls back to the
# same value but emits an UndefinedMetricWarning; this keeps the numbers
# unchanged and the output clean. Requires scikit-learn >= 0.22.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print evaluation metrics
# "Test Accuracy"/"Test Loss" come from model.evaluate; "Overall Accuracy" is
# recomputed by scikit-learn from the predicted class ids.
print("Test Accuracy:", test_accuracy)
print("Test Loss:", test_loss)
print("Overall Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Test Accuracy: 0.6257290840148926
Test Loss: 3.2544116973876953
Overall Accuracy: 0.6257290991574854
Precision: 0.6249980524176595
Recall: 0.6257290991574854
F1-score: 0.6253528704545208


  _warn_prf(average, modifier, msg_start, len(result))
