In [2]:
import nltk
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
df = pd.read_csv('/content/YoutubeCommentsDataSet.csv')
print("Columns in dataset:", df.columns.tolist())

Columns in dataset: ['Comment', 'Sentiment']


In [5]:
# Drop rows with missing values in 'Comment' or 'Sentiment'
df.dropna(subset=['Comment', 'Sentiment'], inplace=True)

In [7]:
# Function to clean text
def clean_text(text):
    text = str(text)  # Ensure text is string
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    return text.strip().lower()  # Convert to lowercase

# Apply the clean_text function to create the 'clean_text' column
df['clean_text'] = df['Comment'].apply(clean_text)

In [8]:
# Lemmatization (Ensure this runs after 'clean_text' is created)
lemmatizer = WordNetLemmatizer()
df['lemmatized_tokens'] = df['clean_text'].apply(
    lambda x: [lemmatizer.lemmatize(word) for word in word_tokenize(x)]
)

In [9]:

# Just Example
sample_comment = "Check out this video: https://youtu.be/FLZvOKSCkxY?si=-fVMETgk-_q3X2Wi  and subscribe!"
cleaned_sample = clean_text(sample_comment)
print("Cleaned Sample:", cleaned_sample)

Cleaned Sample: check out this video   and subscribe


In [10]:
# Clean overall Dataset
df['clean_text'] = df['Comment'].apply(clean_text)

In [11]:
# Process the data in datsets
print("\nProcessed DataFrame:")
print(df[['Comment', 'clean_text', 'lemmatized_tokens']].head()) # Changed 'stemmed_tokens' to 'lemmatized_tokens'


Processed DataFrame:
                                             Comment  \
0  lets not forget that apple pay in 2014 require...   
1  here in nz 50 of retailers don’t even have con...   
2  i will forever acknowledge this channel with t...   
3  whenever i go to a place that doesn’t take app...   
4  apple pay is so convenient secure and easy to ...   

                                          clean_text  \
0  lets not forget that apple pay in  required a ...   
1  here in nz  of retailers dont even have contac...   
2  i will forever acknowledge this channel with t...   
3  whenever i go to a place that doesnt take appl...   
4  apple pay is so convenient secure and easy to ...   

                                   lemmatized_tokens  
0  [let, not, forget, that, apple, pay, in, requi...  
1  [here, in, nz, of, retailer, dont, even, have,...  
2  [i, will, forever, acknowledge, this, channel,...  
3  [whenever, i, go, to, a, place, that, doesnt, ...  
4  [apple, pay, is, so, conve

In [12]:
# TF-IDF Vectorization with stop words removal
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['clean_text'])  # Convert text to numerical features
y = df['Sentiment']  # Target variable

In [13]:
# Split dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Train Naive Bayes
model = MultinomialNB()  # Create a MultinomialNB object
model.fit(X_train, y_train)  # Train the model

y_pred = model.predict(X_test)  # Make predictions

In [15]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6492146596858639
              precision    recall  f1-score   support

    negative       1.00      0.01      0.01       275
     neutral       0.80      0.11      0.20       582
    positive       0.64      0.99      0.78      1435

    accuracy                           0.65      2292
   macro avg       0.82      0.37      0.33      2292
weighted avg       0.73      0.65      0.54      2292



In [16]:
# Save the trained model and vectorizer using pickle
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [17]:
# Function to load model and vectorizer for prediction
def load_model():
    with open("sentiment_model.pkl", "rb") as model_file:
        loaded_model = pickle.load(model_file)
    with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)
    return loaded_model, loaded_vectorizer

In [18]:
#Predict sentiment for the entire dataset (New Addition)
df['predicted_sentiment'] = model.predict(X)  # Apply model to all comments

In [19]:
#Display the first few rows with predictions (New Addition)
print(df[['Comment', 'Sentiment', 'predicted_sentiment']].head())

                                             Comment Sentiment  \
0  lets not forget that apple pay in 2014 require...   neutral   
1  here in nz 50 of retailers don’t even have con...  negative   
2  i will forever acknowledge this channel with t...  positive   
3  whenever i go to a place that doesn’t take app...  negative   
4  apple pay is so convenient secure and easy to ...  positive   

  predicted_sentiment  
0            positive  
1            positive  
2            positive  
3            positive  
4            positive  


In [20]:
# Save the dataset with predictions to a new file (New Addition)
df.to_csv("YoutubeCommentsDataSet_with_predictions.csv", index=False)#print("Predictions saved in 'YoutubeCommentsDataSet_with_predictions.csv'")

In [21]:
# Function to predict sentiment of new comments using the saved model
def predict_sentiment(comment):
    model, vectorizer = load_model()
    clean_comment = clean_text(comment)
    vectorized_comment = vectorizer.transform([clean_comment])
    return model.predict(vectorized_comment)[0]


In [22]:
# Example Prediction
example_comment = "This video is amazing! I loved it."
print("Predicted Sentiment:", predict_sentiment(example_comment))


Predicted Sentiment: positive
