In [1]:
# Importing required libraries
import nltk  # Natural Language Toolkit for text processing
import pickle  # To save and load model files
from sklearn.model_selection import train_test_split  # To split data into training and testing
from sklearn.feature_extraction.text import TfidfVectorizer  # To convert text into numerical features
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes model for classification
from sklearn.metrics import accuracy_score, classification_report  # To check model performance
import pandas as pd  # For handling dataset
import re  # Regular expressions for text cleaning
from nltk.tokenize import word_tokenize  # To split sentences into words
from nltk.stem import WordNetLemmatizer  # To reduce words to their base form
from imblearn.over_sampling import SMOTE  # To handle imbalanced data by generating synthetic samples

In [3]:
# Downloading necessary NLTK data files
nltk.download('punkt')  # For tokenization
nltk.download('wordnet')  # For lemmatization


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\conne\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\conne\AppData\Roaming\nltk_data...


True

In [4]:
# Load the dataset
df = pd.read_csv('data/YoutubeCommentsDataSet.csv')  # Read the CSV file containing YouTube comments
print("Columns in dataset:", df.columns.tolist())  # Print the column names of the dataset

Columns in dataset: ['Comment', 'Sentiment']


In [5]:
# Drop rows where 'Comment' or 'Sentiment' is missing (null)
df.dropna(subset=['Comment', 'Sentiment'], inplace=True)

In [6]:
# Function to clean the text data
def clean_text(text):
    text = str(text)  # Make sure the input is a string
    text = re.sub(r'http\S+', '', text)  # Remove any URLs from text
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove all special characters and numbers
    return text.strip().lower()  # Remove extra spaces and convert text to lowercase

In [7]:
# Apply text cleaning to the 'Comment' column
df['clean_text'] = df['Comment'].apply(clean_text)

In [8]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization on the cleaned text
df['lemmatized_tokens'] = df['clean_text'].apply(
    lambda x: [lemmatizer.lemmatize(word) for word in word_tokenize(x)]  # Lemmatize each word
)

In [9]:
# Initialize TF-IDF Vectorizer to convert text into numerical form
vectorizer = TfidfVectorizer(
    stop_words='english',  # Remove common English stopwords
    ngram_range=(1, 2),  # Use both unigrams and bigrams
    sublinear_tf=True,  # Apply sublinear term frequency scaling
    max_df=0.95,  # Ignore words that appear in more than 95% of documents
    min_df=5  # Ignore words that appear in less than 5 documents
)

In [10]:
# Transform the cleaned text into vectors
X = vectorizer.fit_transform(df['clean_text'])  # Features (X)
y = df['Sentiment']  # Target variable (y)


In [11]:
# Apply SMOTE to balance the classes (if some sentiments have very few samples)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)  # Generate synthetic samples


In [12]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

In [21]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [20]:

# Predict sentiments for test data
y_pred = model.predict(X_test)

In [22]:
# Check how well the model is performing
print("Accuracy:", accuracy_score(y_test, y_pred))  # Print accuracy
print(classification_report(y_test, y_pred))  # Print detailed classification report


Accuracy: 0.7392575270388775
              precision    recall  f1-score   support

    negative       0.66      0.94      0.77      2222
     neutral       0.84      0.49      0.61      2287
    positive       0.79      0.80      0.80      2333

    accuracy                           0.74      6842
   macro avg       0.76      0.74      0.73      6842
weighted avg       0.76      0.74      0.73      6842



In [39]:

# Save the trained model using pickle
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Save the TF-IDF vectorizer as well
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Function to load the saved model and vectorizer later for prediction
def load_model():
    with open("sentiment_model.pkl", "rb") as model_file:
        loaded_model = pickle.load(model_file)  # Load the model
    with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)  # Load the vectorizer
    return loaded_model, loaded_vectorizer


In [40]:

# Predict the sentiment of all comments in the dataset
df['predicted_sentiment'] = model.predict(X)

In [41]:
# Show few rows with original and predicted sentiments
print(df[['Comment', 'Sentiment', 'predicted_sentiment']].head())


                                             Comment Sentiment  \
0  lets not forget that apple pay in 2014 require...   neutral   
1  here in nz 50 of retailers don’t even have con...  negative   
2  i will forever acknowledge this channel with t...  positive   
3  whenever i go to a place that doesn’t take app...  negative   
4  apple pay is so convenient secure and easy to ...  positive   

  predicted_sentiment  
0            negative  
1            negative  
2            positive  
3            negative  
4            negative  


In [42]:
# Save the updated dataset (with predictions) into a new CSV file
df.to_csv("YoutubeCommentsDataSet_with_predictions.csv", index=False)


In [43]:
# Define a function to predict sentiment for any new comment
def predict_sentiment(comment):
    model, vectorizer = load_model()  # Load the saved model and vectorizer
    clean_comment = clean_text(comment)  # Clean the new comment
    vectorized_comment = vectorizer.transform([clean_comment])  # Convert it into vector form
    return model.predict(vectorized_comment)[0]  # Return the predicted sentiment

In [47]:
# Example usage of prediction function
example_comment = "This video is worst"
print("Predicted Sentiment:", predict_sentiment(example_comment))  # Predict and print sentiment

Predicted Sentiment: negative
