In [2]:
#1. Loading and Preprocessing 

import pandas as pd

# Load the dataset
df = pd.read_csv('E:\\DSML\Assignments\\nlp\\nlpdataset.csv')
data = pd.read_csv(df)

# Display the first few rows
print(data.head())

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')

# Function for preprocessing
def preprocess_text(text):
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

#Impact on Model Performance:

#Text Cleaning: 
'''Helps in reducing noise in the data, which allows the model to focus on meaningful words.'''
#Tokenization: 
'''Converts text into a structured format for further analysis.'''
#Stopword Removal: 
'''Reduces dimensionality and improves model performance by eliminating non-informative words.'''


TypeError: argument of type 'method' is not iterable

In [None]:
#2. Feature Extraction (2 marks):
'''Implement feature extraction using CountVectorizer or TfidfVectorizer. Describe how the chosen method transforms the text data into numerical 
features.'''


from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['emotion']  # Assuming the emotion labels are in this column

'''
TF-IDF: Stands for Term Frequency-Inverse Document Frequency. It increases the weight of words that are frequent in a document
but not across all documents, thus highlighting unique terms that may be important for classification.'''

In [None]:
#3. Model Development (2 marks):
#Train the following machine learning models

#a)Naive Bayes
    from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


#b)Support Vector Machine 

from sklearn.svm import SVC

# Train SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)


In [None]:
#4. Model Comparison (2 marks)
'''Evaluate the model using appropriate metrics (e.g., accuracy, F1-score).
Provide a brief explanation of the chosen model and its suitability for emotion classification.'''

from sklearn.metrics import accuracy_score, f1_score

# Predictions
nb_predictions = nb_model.predict(X_test)
svm_predictions = svm_model.predict(X_test)

# Calculate metrics
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_f1 = f1_score(y_test, nb_predictions, average='weighted')

svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_f1 = f1_score(y_test, svm_predictions, average='weighted')

# Display results
print(f'Naive Bayes Accuracy: {nb_accuracy:.4f}, F1 Score: {nb_f1:.4f}')
print(f'SVM Accuracy: {svm_accuracy:.4f}, F1 Score: {svm_f1:.4f}')

'''
Suitability of Models:

Naive Bayes: Works well with text data, especially when the features are conditionally independent. 
It's computationally efficient and performs well for multi-class classification problems.


SVM: Effective in high-dimensional spaces and particularly useful for text classification due to its ability to find hyperplanes
that best separate classes.
'''