<a href="https://colab.research.google.com/github/munavarhs/AnalysisOfJakeAndHyde/blob/main/RoastMeAndToastMeAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import gensim.downloader as api
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')


In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
toastme_data = pd.read_csv('ToastMe.csv', sep=None, engine='python', on_bad_lines='warn')
roastme_data = pd.read_csv('RoastMe.csv',  sep=None, engine='python', on_bad_lines='warn')

In [5]:
#Combining the data
data = pd.concat([toastme_data, roastme_data])


In [6]:
#Text Preprocessing
# Removing the rows with missing comments
data = data.dropna(subset=['Comment'])

In [7]:
# Converting the text to lowercase and strip whitespaces
data['Processed_Comment'] = data['Comment'].str.lower().str.strip()
# Removing punctuations, spaces and special characters
data['Processed_Comment'] = data['Processed_Comment'].str.replace(r'[^\w\s]', '', regex=True)


In [8]:
# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [9]:
# Removing Stopwords
def remove_stopwords(text):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)


In [10]:
# Using the above function to remove Stopwords on comments
data['Processed_Comment'] = data['Processed_Comment'].apply(remove_stopwords)

In [11]:
# Lemmatization
def lemmatize_text(text):
    tokens = text.split()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)


In [12]:
# Using the above function on comments for further preprocessing
data['Processed_Comment'] = data['Processed_Comment'].apply(lemmatize_text)

In [13]:
# Display the first few rows of the processed data
data[['Comment', 'Processed_Comment']].head()

Unnamed: 0,Comment,Processed_Comment
0,"Duuuude, yes! Super boss. Alright, now for t...",duuuude yes super bos alright realness think l...
1,Good looking dude who seems genuinely nice. Ju...,good looking dude seems genuinely nice judging...
2,Here's to a girl that is willing to take chanc...,here girl willing take chance cheer
3,This is an awesome idea! I'm surprised no one ...,awesome idea im surprised one thought edit oh ...
4,"Dude, you're so beautiful! I love your hair a...",dude youre beautiful love hair smile would loo...


In [14]:
# Encoding target labels: 0 for ToastMe, 1 for RoastMe
y = data['Subreddit'].apply(lambda x: 1 if x == 'RoastMe' else 0)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data['Comment'], data['Subreddit'], test_size=0.2, random_state=56)


In [16]:
#Performing feature Representations using TF-IDF, Count Vectorization and Glove Models

#1. TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [17]:
# 2. Count Vectorization
count_vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [18]:
# 3. Word2Vec/GloVe Embeddings
# Loading GloVe pre-trained embeddings
glove_model = api.load("glove-wiki-gigaword-50")



In [19]:
def get_average_word2vec(sentence, model, vector_size):
    words = sentence.split()
    feature_vec = np.zeros((vector_size,), dtype="float32")
    num_words = 0
    for word in words:
        if word in model.key_to_index:
            num_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if num_words > 0:
        feature_vec = np.divide(feature_vec, num_words)
    return feature_vec


In [20]:
X_train_glove = np.array([get_average_word2vec(sent, glove_model, 50) for sent in X_train])
X_test_glove = np.array([get_average_word2vec(sent, glove_model, 50) for sent in X_test])

In [21]:
# Standardizing embeddings for models like SVM
scaler = MinMaxScaler()
X_train_glove_scaled = scaler.fit_transform(X_train_glove)
X_test_glove_scaled = scaler.transform(X_test_glove)

In [22]:
### Model Training and Evaluation ###

In [23]:
# Initializing all the mentioned models
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(kernel='linear')
}

In [36]:
# Train and evaluate models using TF-IDF
print("### TF-IDF Results ###")
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    print(f"{model_name}:\n", classification_report(y_test, y_pred))


### TF-IDF Results ###
Logistic Regression:
               precision    recall  f1-score   support

     RoastMe       0.96      0.99      0.97     60259
     toastme       0.90      0.72      0.80      9762

    accuracy                           0.95     70021
   macro avg       0.93      0.85      0.89     70021
weighted avg       0.95      0.95      0.95     70021

Naive Bayes:
               precision    recall  f1-score   support

     RoastMe       0.93      0.99      0.96     60259
     toastme       0.92      0.55      0.69      9762

    accuracy                           0.93     70021
   macro avg       0.93      0.77      0.82     70021
weighted avg       0.93      0.93      0.92     70021

Decision Tree:
               precision    recall  f1-score   support

     RoastMe       0.95      0.96      0.96     60259
     toastme       0.75      0.67      0.71      9762

    accuracy                           0.92     70021
   macro avg       0.85      0.82      0.83     70021

In [24]:
# Train and evaluate models using Count Vectorization
print("Count Vectorization Results....")
for model_name, model in models.items():
    model.fit(X_train_count, y_train)
    y_pred = model.predict(X_test_count)
    print(f"{model_name}:\n", classification_report(y_test, y_pred))


Count Vectorization Results....
Logistic Regression:
               precision    recall  f1-score   support

     RoastMe       0.96      0.99      0.97     60259
     toastme       0.91      0.72      0.80      9762

    accuracy                           0.95     70021
   macro avg       0.93      0.85      0.89     70021
weighted avg       0.95      0.95      0.95     70021

Naive Bayes:
               precision    recall  f1-score   support

     RoastMe       0.96      0.97      0.96     60259
     toastme       0.79      0.75      0.77      9762

    accuracy                           0.94     70021
   macro avg       0.88      0.86      0.87     70021
weighted avg       0.94      0.94      0.94     70021

Decision Tree:
               precision    recall  f1-score   support

     RoastMe       0.95      0.95      0.95     60259
     toastme       0.70      0.68      0.69      9762

    accuracy                           0.91     70021
   macro avg       0.82      0.82      0.82 

In [None]:
# Train and evaluate models using GloVe/Word2Vec embeddings
print("GloVe Embedding Results...")
for model_name, model in models.items():
    model.fit(X_train_glove_scaled, y_train)
    y_pred = model.predict(X_test_glove_scaled)
    print(f"{model_name}:\n", classification_report(y_test, y_pred, zero_division=1))

GloVe Embedding Results...
Logistic Regression:
               precision    recall  f1-score   support

     RoastMe       0.88      0.98      0.93     60259
     toastme       0.57      0.16      0.25      9762

    accuracy                           0.87     70021
   macro avg       0.72      0.57      0.59     70021
weighted avg       0.83      0.87      0.83     70021

Naive Bayes:
               precision    recall  f1-score   support

     RoastMe       0.86      1.00      0.93     60259
     toastme       1.00      0.00      0.00      9762

    accuracy                           0.86     70021
   macro avg       0.93      0.50      0.46     70021
weighted avg       0.88      0.86      0.80     70021

Decision Tree:
               precision    recall  f1-score   support

     RoastMe       0.92      0.91      0.91     60259
     toastme       0.47      0.48      0.48      9762

    accuracy                           0.85     70021
   macro avg       0.69      0.70      0.69     7