In [27]:
import nltk
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
nltk.download('stopwords')
nltk.download('punkt')  # Download the 'punkt' tokenizer models

# Load the dataset
data = pd.read_csv('model-dataset.csv')

# Preprocess the text data
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_words)

data['comment'] = data['comment'].apply(preprocess_text)  # Use 'comment' as the column name

# Split the dataset into training and testing sets
X = data['comment']  # Use 'comment' as the column name
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a sentiment classifier (SVM in this case)
classifier = SVC(kernel='linear')
classifier.fit(X_train_tfidf, y_train)

# Predict sentiment labels
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7466666666666667
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         8
     neutral       0.74      1.00      0.85        53
    positive       1.00      0.21      0.35        14

    accuracy                           0.75        75
   macro avg       0.58      0.40      0.40        75
weighted avg       0.71      0.75      0.67        75



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eranda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eranda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
