In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on
# NLTK >= 3.9, which no longer reads the pickled 'punkt' model; 'punkt' is
# still fetched so older NLTK releases keep working. nltk.download() reports a
# failed download on stdout and returns False instead of raising.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Load the tab-separated review file. If parsing or decoding fails with an
# inferred header row, retry treating every row as data. Only those errors
# trigger the retry: a missing file, Ctrl-C, etc. propagate immediately
# instead of being swallowed by a bare `except:`.
try:
    df = pd.read_csv('musical1.tsv', sep='\t')
except (pd.errors.ParserError, UnicodeDecodeError):
    df = pd.read_csv('musical1.tsv', sep='\t', header=None)

# Normalize the column names to the two the rest of the script relies on:
# 'review' (text) and 'sentiment' (label).
if len(df.columns) == 2:
    df.columns = ['review', 'sentiment']
elif 'Score' in df.columns:
    df = df.rename(columns={'Score': 'sentiment'})
else:
    df = df.rename(columns={df.columns[0]: 'review', df.columns[1]: 'sentiment'})

# The 'Score' branch only renames the label column, so a file without a
# 'review' column would otherwise fail further down with a bare
# KeyError: 'review'. Fail here with a message that lists what was found.
missing_columns = [name for name in ('review', 'sentiment') if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"musical1.tsv is missing required column(s) {missing_columns}; "
        f"found columns: {list(df.columns)}"
    )

# Drop only the rows that lack a value we actually use. A blanket dropna()
# would also throw away usable reviews that merely have gaps in unrelated
# columns.
df = df.dropna(subset=['review', 'sentiment'])

# Module-level NLTK normalizers: built once here, referenced by name inside
# preprocess_text.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one review into a single space-separated token string.

    The value is coerced to ``str``, lower-cased and split with
    ``word_tokenize``. Each token is stemmed with the module-level
    ``stemmer`` and the resulting stem is then passed through the
    module-level ``lemmatizer``.

    Returns:
        The processed tokens joined by single spaces, or ``""`` if any step
        raised. In that case the offending text and the error are printed to
        stdout rather than propagated.
    """
    try:
        words = word_tokenize(str(text).lower())
        # Stem first, then lemmatize the stem, token by token.
        return ' '.join(
            lemmatizer.lemmatize(stemmer.stem(word)) for word in words
        )
    except Exception as e:
        # Best-effort: report the failing input and fall back to an empty
        # document instead of raising.
        print(f"Error processing text: {text}")
        print(f"Error: {e}")
        return ""

# Clean every review, then pull out the integer labels.
df['processed_review'] = df['review'].apply(preprocess_text)
y = df['sentiment'].astype(int)

# Split the text *before* vectorizing so the TF-IDF vocabulary and IDF weights
# are learned from the training reviews only. Fitting the vectorizer on the
# full corpus would leak test-set statistics into the features and make the
# reported metrics optimistic.
train_text, test_text, y_train, y_test = train_test_split(
    df['processed_review'], y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
# Whole-corpus matrix under the train-fitted vocabulary; retained so that any
# code referring to `X` keeps working.
X = vectorizer.transform(df['processed_review'])

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

# precision/recall/F1 default to average='binary', which raises as soon as the
# labels have more than two classes (e.g. 1-5 scores). Keep the binary
# behaviour for two-class data and use a support-weighted average otherwise.
average = 'binary' if y.nunique() <= 2 else 'weighted'

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=average)
recall = recall_score(y_test, y_pred, average=average)
f1 = f1_score(y_test, y_pred, average=average)

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\makif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\makif\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\makif\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Evaluation Metrics:
Accuracy: 0.7650
Precision: 0.7769
Recall: 0.8246
F1 Score: 0.8000
