In [3]:
import nltk
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
# Fetch the NLTK resources used below. When a package is already present in
# the local nltk_data directory the call only logs "already up-to-date".
nltk.download('stopwords')
nltk.download('punkt')  # Punkt tokenizer models used by word_tokenize
# Newer NLTK releases (3.8.2+) load the Punkt tables from 'punkt_tab' rather
# than 'punkt'; fetch it too so word_tokenize works on either version.
nltk.download('punkt_tab')

# Load the dataset and drop every row that has a missing value in any column
data = pd.read_csv('model-dataset.csv')
data = data.dropna()


# Preprocess the text data
# English stop words, held in a set for fast membership tests while filtering
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return a normalised, space-joined token string for one comment.

    The value is converted to ``str``, lower-cased and tokenised with
    ``word_tokenize``. Only tokens that are purely alphanumeric and are not
    in the module-level ``stop_words`` set are kept.

    Args:
        text: The raw comment; may be a missing value (NaN/None).

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when ``text`` is a missing value.
    """
    # Guard clause: missing values become an empty document
    if not pd.notna(text):
        return ''

    tokens = word_tokenize(str(text).lower())
    kept = (tok for tok in tokens if tok.isalnum() and tok not in stop_words)
    return ' '.join(kept)


# Clean every comment in place (lower-case, tokenise, drop stop words)
data['comment'] = data['comment'].apply(preprocess_text)

# Features are the cleaned comments, targets are the sentiment labels
X, y = data['comment'], data['sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features limited to 5000 terms. The vectorizer is fitted on the
# training comments only and then reused, unchanged, on the test comments.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a linear-kernel SVM on the training features (fit returns the
# estimator itself, so `classifier` is the fitted model)
classifier = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Label the held-out comments
y_pred = classifier.predict(X_test_tfidf)

# Report overall accuracy and the per-class precision/recall/F1 table
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eranda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eranda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.9018531017978769
              precision    recall  f1-score   support

    negative       0.88      0.81      0.84      7152
     neutral       0.88      0.98      0.93     11067
    positive       0.93      0.89      0.91     14375

    accuracy                           0.90     32594
   macro avg       0.90      0.89      0.89     32594
weighted avg       0.90      0.90      0.90     32594

