<a href="https://colab.research.google.com/github/ns5504/Assessing-the-Impact-of-ChatGPT/blob/main/ChatGPT_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install
%pip install nlpaug==1.1.11
%pip install imbalanced-learn==0.11.0
%pip install openpyxl
%pip install wordcloud
%pip install contractions
%pip install vaderSentiment
%pip install transformers
%pip install torch

[31mERROR: You must give at least one requirement to install (see "pip help install")[0m[31m
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)


In [None]:
# Standard library imports
import os
import re
import string
import glob
import pathlib
from datetime import datetime

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Deep learning
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    BertTokenizer, BertForSequenceClassification,
    Trainer, TrainingArguments
)

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Web requests
import requests

# Progress tracking
from tqdm import tqdm

In [None]:
#Bert Model


# =======================
# 1. Load and Preprocess Data
# =======================
# Keep only the post body and its majority-vote label, rename them to the
# names used by the rest of this cell, and discard rows missing either value.
column_names = {'Body': 'text', 'Majority Vote': 'label'}
raw_df = pd.read_csv("/content/Majority Voting 2nd round.csv")
df = raw_df[list(column_names)].rename(columns=column_names).dropna()

# The number of distinct label values sizes the classification head later on.
unique_labels = df['label'].unique()
num_labels = len(unique_labels)
print(f"Unique Labels: {unique_labels}, Number of Labels: {num_labels}")

# Replace the label values with integer class ids.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

# =======================
# 2. Tokenization & Dataset Setup
# =======================
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per item, on demand.

    ``texts`` and ``labels`` are indexed with ``[idx]``; ``tokenizer`` is
    called with the text plus truncation/padding settings and must return a
    mapping containing ``'input_ids'`` and ``'attention_mask'`` tensors.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and its label tensor for position ``idx``."""
        raw_text = str(self.texts[idx])
        class_id = int(self.labels[idx])
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis; squeeze it away so
        # the DataLoader can stack the items itself.
        item = {key: encoded[key].squeeze() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(class_id, dtype=torch.long)
        return item

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)

# Only the training batches are shuffled; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# =======================
# 3. Model and Optimizer
# =======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The classification head is sized from the labels found in the dataset.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# =======================
# 4. Training Loop
# =======================
epochs = 2
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Passing labels makes the model compute the loss itself.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

# =======================
# 5. Evaluation
# =======================
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        # The true labels are only compared on the CPU, so they never need
        # to be moved to the device.
        all_labels.extend(batch['label'].tolist())

# =======================
# 6. Metrics
# =======================
acc = accuracy_score(all_labels, all_preds)

# Pass the full list of class ids explicitly: without `labels`, the report
# raises when a class is missing from the test split because the number of
# observed classes no longer matches `target_names`.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]
report = classification_report(
    all_labels,
    all_preds,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
)

print(f"\n✅ Accuracy: {acc:.4f}")
print("📊 Classification Report:")
print(report)

In [None]:
# Roberta model and tokenizer

# Download NLTK's stopword corpus, then keep the English stopwords as a set;
# preprocess_text (defined below) filters tokens against `stop_words`.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Custom Dataset class remains the same
class TextDataset(Dataset):
    """Dataset over positional (``.iloc``) text/label containers.

    Each item is tokenized when requested and returned under the key names
    ``input_ids``, ``attention_mask`` and ``labels``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and pair it with its label tensor."""
        # Force a string so non-text cell values can still be tokenized.
        raw_text = str(self.texts.iloc[idx])
        target = self.labels.iloc[idx]
        encoded = self.tokenizer(
            raw_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # Drop the leading batch axis added by return_tensors="pt".
        item = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Text preprocessing function
def preprocess_text(text):
    """Lowercase ``text``, keep only letters and spaces, and drop stopwords.

    Relies on the module-level ``stop_words`` set built above.
    """
    # Anything that is not a-z or whitespace (digits, punctuation, symbols)
    # is removed outright rather than replaced by a space.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()
    kept_tokens = [token for token in collapsed.split() if token not in stop_words]
    return ' '.join(kept_tokens)

# Trainer remains the same
def compute_metrics(pred):
    """Return accuracy plus weighted precision/recall/F1 for a prediction object.

    ``pred`` must expose ``predictions`` (per-class scores, arg-maxed over the
    last axis) and ``label_ids`` (the true class ids).
    """
    expected = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    weighted = precision_recall_fscore_support(expected, predicted, average='weighted')
    return {
        "accuracy": accuracy_score(expected, predicted),
        "f1": weighted[2],
        "precision": weighted[0],
        "recall": weighted[1],
    }

# Load the datasets
ground_truth_path = '/content/Majority Voting 2nd round.csv'
new_data_path = '/content/Academic_data_cleaned.csv'

ground_truth = pd.read_csv(ground_truth_path, encoding='latin1')
new_data = pd.read_csv(new_data_path)

# Rows without a label cannot be used for supervised training (and a NaN
# would otherwise become its own "class" in label_map below).
ground_truth = ground_truth.dropna(subset=['Majority Vote']).reset_index(drop=True)

ground_truth['Body'] = ground_truth['Body'].astype(str)
new_data['selftext'] = new_data['selftext'].astype(str)

X = ground_truth['Body'].apply(preprocess_text)
y = ground_truth['Majority Vote']

# Map sentiments to numeric labels (ids follow first appearance in the data)
label_map = {label: idx for idx, label in enumerate(y.unique())}
y = y.map(label_map)

# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_map))

# Stratified train/validation split.
# The unlabeled new data must not be used for evaluation: with placeholder
# labels the eval loss only rewards predicting class 0, and because
# load_best_model_at_end=True that loss decides which checkpoint is kept.
# NOTE: stratify requires at least two samples per class.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

train_dataset = TextDataset(X_train, y_train, tokenizer, max_length=256)
val_dataset = TextDataset(X_val, y_val, tokenizer, max_length=256)

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=1e-5,  # Adjust learning rate
    per_device_train_batch_size=8,  # Adjust batch size
    per_device_eval_batch_size=8,  # Adjust batch size
    num_train_epochs=10,  # Adjust number of epochs
    weight_decay=0.005,  # Adjust weight decay
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Each dataset item is already padded to max_length, so stacking suffices.
    data_collator=lambda data: {'input_ids': torch.stack([f['input_ids'] for f in data]),
                               'attention_mask': torch.stack([f['attention_mask'] for f in data]),
                               'labels': torch.stack([f['labels'] for f in data])},
    tokenizer=tokenizer,
    # Without this, compute_metrics above is never called and only the loss
    # is reported at each evaluation.
    compute_metrics=compute_metrics
)
# Train the model
trainer.train()

# Evaluate on the held-out validation split, which has real labels
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

print("Classification Report:\n")
print(classification_report(
    y_val,
    pred_labels,
    labels=list(label_map.values()),
    target_names=[str(label) for label in label_map],
    zero_division=0,
))
accuracy = accuracy_score(y_val, pred_labels)
print(f"Validation Accuracy: {accuracy:.4f}")

# Label the new (unlabeled) dataset with the trained model
new_data['text'] = (new_data['title'].astype(str) + ' ' + new_data['selftext'].astype(str)).apply(preprocess_text)
new_texts = new_data['text']

# The Trainer may have moved the model to a GPU, so inputs must be sent to the
# model's device. Predicting in small batches also avoids pushing the whole
# dataset through the model in a single forward pass.
inference_device = next(model.parameters()).device
inference_batch_size = 32
all_texts = new_texts.tolist()
predictions = []

model.eval()
with torch.no_grad():
    for start in range(0, len(all_texts), inference_batch_size):
        new_encodings = tokenizer(
            all_texts[start:start + inference_batch_size],
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        outputs = model(
            input_ids=new_encodings['input_ids'].to(inference_device),
            attention_mask=new_encodings['attention_mask'].to(inference_device)
        )
        # Bring results back to the CPU before converting to Python ints.
        predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())

# Translate numeric class ids back to the original sentiment labels
inverse_label_map = {idx: label for label, idx in label_map.items()}
new_data['predicted_sentiment'] = [inverse_label_map[pred] for pred in predictions]

output_path = '/content/roBerta_model.csv'
new_data.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

In [None]:
#Logistic Regression Model

# Load the Reddit ground truth dataset
ground_truth_path = '/content/drive/MyDrive/Validation sets/majority_voting_results.csv'
new_data_path = '/content/drive/MyDrive/Reddit_Data/cleaned/Education/merged_output.csv'

ground_truth = pd.read_csv(ground_truth_path, encoding='latin1')
new_data = pd.read_csv(new_data_path)

# Rows without a label cannot be used to fit the classifier
ground_truth = ground_truth.dropna(subset=['Final Sentimental'])

# Prepare the training data
# Combine 'Title' and 'Body' for better context.
# Missing parts are replaced by '' first: string + NaN yields NaN, and
# TfidfVectorizer raises on NaN documents.
ground_truth['text'] = (
    ground_truth['Title'].fillna('').astype(str)
    + ' '
    + ground_truth['Body'].fillna('').astype(str)
)
X = ground_truth['text']
y = ground_truth['Final Sentimental']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=120)

# Create a pipeline with TfidfVectorizer and LogisticRegression
pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(random_state=120)
)

# Train the model
pipeline.fit(X_train, y_train)

# Validate the model
val_predictions = pipeline.predict(X_val)
print("Classification Report:\n")
print(classification_report(y_val, val_predictions))

accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

# Prepare the new dataset
# Combine 'title' and 'selftext' for prediction (same NaN handling as above,
# so posts with an empty body are still classified from their title)
new_data['text'] = (
    new_data['title'].fillna('').astype(str)
    + ' '
    + new_data['selftext'].fillna('').astype(str)
)

# Predict sentiment for the new data
new_data['predicted_sentiment'] = pipeline.predict(new_data['text'])

# Save the labeled dataset
output_path = '/content/drive/MyDrive/Validation sets/labeled_merged_output2.csv'
new_data.to_csv(output_path, index=False)

print(f"Labeled dataset saved to {output_path}")

In [None]:
#Reddit Cleaner

def menu_selection():
    """
    Shows the main menu repeatedly and dispatches on the user's choice
    until the exit option is picked.
    """
    menu_text = (
        '\nMenu Options:'
        '\n1. Scrape Reddit data'
        '\n2. Clean Reddit data'
        '\n3. Exit'
    )
    while True:
        print(menu_text)
        choice = input('Enter your choice (1/2/3): ').strip()

        if choice == '3':
            print("Exiting...")
            return
        if choice == '1':
            get_information()
            continue
        if choice == '2':
            data_path = input('Enter the path to your data file (e.g., /content/yourfile.csv): ').strip()
            clean_data(data_path)
            continue
        print('Invalid selection. Please try again.')


def get_information():
    """
    Prompts for the scrape settings (subreddits, keywords, date range, output
    folder) and hands them to parse_tag_list.
    """
    ask = lambda prompt: input(prompt).strip()

    subreddit = ask("Enter subreddit(s) to scrape (comma-separated): ")
    word = ask("Enter keyword(s) to search (comma-separated): ")
    start = ask("Enter start date (YYYY-MM-DD): ")
    end = ask("Enter end date (YYYY-MM-DD): ")
    folder = ask("Enter folder to save data (leave blank to save in /content): ")

    # A named folder is created under /content; a blank answer means /content itself.
    if folder:
        folder = os.path.join("/content", folder)
        os.makedirs(folder, exist_ok=True)
    else:
        folder = "/content"

    parse_tag_list(word, subreddit, start, end, folder)


def fetch_reddit_posts(subreddit, start_date, end_date, keyword=None, folder=None):
    """
    Fetches Reddit posts for a specific subreddit and keyword within a date range
    and saves them to a CSV file.
    :param subreddit: Subreddit name.
    :param start_date: Start date in YYYY-MM-DD format.
    :param end_date: End date in YYYY-MM-DD format.
    :param keyword: Keyword to search.
    :param folder: Folder to save the results (current directory when omitted).
    :return: DataFrame of the fetched posts (empty when nothing was fetched).
    """
    start_timestamp = int(datetime.strptime(start_date, '%Y-%m-%d').timestamp())
    end_timestamp = int(datetime.strptime(end_date, '%Y-%m-%d').timestamp())

    # NOTE(review): the scraper cell further down queries api.pullpush.io
    # instead - confirm this endpoint is still the intended one.
    base_url = 'https://api.pushshift.io/reddit/search/submission/'

    params = {
        'subreddit': subreddit,
        'after': start_timestamp,
        'before': end_timestamp,
        'size': 100,
        'sort': 'desc',
        'sort_type': 'created_utc'
    }

    if keyword:
        params['q'] = keyword

    posts = []
    while True:
        try:
            # A timeout keeps one stalled connection from hanging the scrape.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Error: Request failed: {exc}")
            break
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            break

        data = response.json()
        if 'data' not in data or len(data['data']) == 0:
            break

        posts.extend(data['data'])
        # Results are sorted newest-first, so the last post of the page is the
        # oldest one; continue the next page from just before it.
        params['before'] = data['data'][-1]['created_utc']

    df = pd.DataFrame(posts)
    if df.empty:
        return df

    # reindex (rather than df[[...]]) so a field absent from the response
    # becomes an empty column instead of raising KeyError.
    columns = ['id', 'title', 'selftext', 'author', 'created_utc', 'url', 'num_comments', 'score']
    df = df.reindex(columns=columns)
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

    # folder defaults to None, which os.path.join cannot handle.
    target_dir = folder or '.'
    os.makedirs(target_dir, exist_ok=True)
    save_path = os.path.join(target_dir, f"{subreddit}_{keyword}_{start_date}_{end_date}.csv")
    df.to_csv(save_path, index=False)
    print(f"Data saved to {save_path}")
    return df


def parse_tag_list(tag_list, sub_list, start_date, end_date, folder):
    """
    Splits the comma-separated keyword and subreddit strings and runs
    fetch_reddit_posts once per (subreddit, keyword) pair.
    """
    keywords = list(map(str.strip, tag_list.split(',')))
    subreddits = list(map(str.strip, sub_list.split(',')))

    # Subreddit is the outer loop: all keywords for one subreddit come first.
    pairs = [(sub, tag) for sub in subreddits for tag in keywords]
    for subreddit, keyword in pairs:
        fetch_reddit_posts(subreddit, start_date, end_date, keyword, folder)


def clean_data(path):
    """
    Cleans the Reddit data by removing duplicate rows and posts whose body is
    only a 'deleted' / 'removed' placeholder, then saves the result next to
    the input with a "_cleaned" suffix.
    :param path: Path to the CSV file.
    """
    if not os.path.exists(path):
        print(f"Error: The file at '{path}' does not exist.")
        return

    try:
        df = pd.read_csv(path)

        # Remove duplicate rows
        df = df.drop_duplicates()

        # Remove rows whose 'selftext' is just a deletion placeholder.
        # Matching the whole cell (not a substring) keeps genuine posts that
        # merely mention the words "deleted" or "removed".
        if 'selftext' in df.columns:
            placeholders = {'[deleted]', '[removed]', 'deleted', 'removed'}
            normalized = df['selftext'].astype(str).str.strip().str.lower()
            df = df[~normalized.isin(placeholders)]

        # Insert the suffix before the extension only; str.replace would touch
        # every ".csv" in the path and would overwrite the input file when
        # the path has no ".csv" at all.
        root, ext = os.path.splitext(path)
        save_file = f"{root}_cleaned{ext}"
        df.to_csv(save_file, index=False)
        print(f"Cleaned data saved to {save_file}")
    except Exception as e:
        # Best-effort interactive tool: report the problem and return to the menu.
        print(f"An error occurred during cleaning: {e}")


# Start the program
# Launch the interactive menu only when this code runs as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    menu_selection()

In [None]:
# Concatenate CSV files in a directory

def concat_csv_in_directory(path, subreddit_domain_file):
    """Append every CSV in ``path`` to ``subreddit_domain_file`` and return the result.

    Each appended file gets a ``subreddit`` column holding the part of its
    file name before the first underscore. Files are processed in sorted
    name order so the output is reproducible; empty CSV files are skipped.

    :param path: Directory containing the CSV files to merge.
    :param subreddit_domain_file: Existing (possibly empty) CSV that the
        merged rows are added to; it is overwritten with the result.
    :return: The merged DataFrame.
    """
    # An empty main file has no header to parse; start from an empty frame.
    try:
        combined = pd.read_csv(subreddit_domain_file)
    except pd.errors.EmptyDataError:
        combined = pd.DataFrame()

    output_location = os.path.abspath(subreddit_domain_file)
    frames = [combined]

    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)

        # Process only CSV files
        if not (os.path.isfile(file_path) and file.endswith(".csv")):
            continue
        # Never append the output file to itself if it lives in this directory.
        if os.path.abspath(file_path) == output_location:
            continue

        try:
            df_temp = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            print(f"Skipping empty file: {file}")
            continue

        df_temp["subreddit"] = file.split('_')[0]  # text before the first '_' in the file name
        frames.append(df_temp)

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # all accumulated rows on every iteration.
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(subreddit_domain_file, index=False)
    return combined

def concat_csv(path, subreddit_domain_file):
    """Merge the CSVs under ``path`` if there are any; otherwise return an empty frame.

    ``path`` must offer ``iterdir()`` (e.g. a ``pathlib.Path``).
    """
    has_csv = any(
        entry.is_file() and entry.name.endswith(".csv")
        for entry in path.iterdir()
    )
    if not has_csv:
        return pd.DataFrame()
    return concat_csv_in_directory(path, subreddit_domain_file)

# Merge target and the folder holding the raw per-subreddit CSV files.
file_path = pathlib.Path('/content/Academic_data.csv')
folder_path = pathlib.Path('/content/unclean')

# Number of entries found in the raw-data folder.
print(len(os.listdir(folder_path)))

target_ready = file_path.exists() and file_path.suffix == ".csv"
if target_ready:
    print('File found')
else:
    print('File not found')
    pd.DataFrame().to_csv(file_path, index=False)  # Create an empty CSV

# Either way the merge itself is the same.
df = concat_csv(folder_path, file_path)
print(df)

In [None]:
#Reddit Scraper

def menu_selection():
    """Show the scraper menu and run the chosen action until the user exits.

    The original chained `!=` tests with `or` (always true) and attached that
    branch only to the `'3'` check, so the valid choices 1 and 2 also printed
    "Invalid selection". A single if/elif chain inside a loop keeps the menu
    coming back after each action without the spurious message or recursion.

    :return: None, once the user chooses option 3.
    """
    while True:
        print('Input a number from the menu below:'
              '\n1. Scrape Reddit data'
              '\n2. clean Reddit data'
              '\n3. Exit')

        selection = input('Enter your choice: ').strip()

        if selection == '1':
            get_information()
        elif selection == '2':
            path = input('Enter the path to your data: ')
            save_path = input('Enter the path to save your data please include .csv: ')
            # os.path.join keeps absolute paths intact, whereas './' + path
            # would turn '/content/x.csv' into a relative path.
            clean_data(os.path.join('.', path), save_path)
        elif selection == '3':
            return None
        else:
            print('Invalid selection, try again')

def get_information():
    """Ask for the scrape settings, in order, and pass them to parse_tag_list."""
    prompts = (
        "Enter the name of the subreddit to scrape, if multiple separate with a comma: ",
        "Enter the word you'd like to search, if multiple separate each with a comma: ",
        "Enter the date to begin scraping YYYY-MM-DD: ",
        "Enter the date to end scraping YYYY-MM-DD: ",
        "Enter the folder where you want the data to be saved: ",
    )
    # The generator is consumed left to right, so the questions are asked in
    # the order listed above.
    subreddit, word, start, end, folder = (input(prompt) for prompt in prompts)
    parse_tag_list(word, subreddit, start, end, folder)

"""#Scraping function"""

def fetch_reddit_posts(subreddit, start_date, end_date, keyword=None, folder=None):
    """
    Fetches Reddit posts from a given subreddit and saves them to a CSV file.
    :param subreddit:  The subreddit to fetch posts from.
    :param start_date:  The start date (YYYY-MM-DD) to retrieve posts from.
    :param end_date:  The end date (YYYY-MM-DD) to retrieve posts until.
    :param keyword:  Optional keyword the posts must match.
    :param folder:  Folder for the CSV; created if missing. When omitted or
        empty the file is written to the current directory.
    :return:  DataFrame of the fetched posts (empty when nothing was fetched).
    """

    # Convert dates to Unix timestamps
    start_timestamp = int(datetime.strptime(start_date, '%Y-%m-%d').timestamp())
    end_timestamp = int(datetime.strptime(end_date, '%Y-%m-%d').timestamp())

    base_url = 'https://api.pullpush.io/reddit/search/submission/'

    params = {
        'subreddit': subreddit,
        'after': start_timestamp,
        'before': end_timestamp,
        'size': 100,  # Number of results per request (max 100)
        'sort': 'desc',
        'sort_type': 'created_utc'
    }

    if keyword:
        params['q'] = keyword

    posts = []
    while True:
        print(f"Fetching data with params: {params}")
        try:
            # A timeout keeps one stalled connection from hanging the scrape.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Error: Request failed: {exc}")
            break
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            break

        data = response.json()
        if 'data' not in data or len(data['data']) == 0:
            print("No data found or no more posts to fetch.")
            break

        posts.extend(data['data'])

        print(f"Fetched {len(data['data'])} posts.")

        # Update the 'before' parameter to the timestamp of the last fetched post
        params['before'] = data['data'][-1]['created_utc']

    # Create a DataFrame from the posts
    df = pd.DataFrame(posts)
    if df.empty:
        return df

    # Select relevant columns; reindex (rather than df[[...]]) so a field
    # absent from the response becomes an empty column instead of a KeyError.
    columns = ['id', 'title', 'selftext', 'author', 'created_utc', 'url', 'num_comments', 'score']
    df = df.reindex(columns=columns)

    # Convert the created_utc column to a readable date format
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

    file_name = f"{subreddit}_{keyword}_{start_date}_{end_date}.csv"
    if folder:
        save_path = f"{folder}/{file_name}"
    else:
        save_path = file_name

    try:
        if folder:
            # makedirs also creates missing parent folders and is a no-op
            # when the folder already exists.
            os.makedirs(folder, exist_ok=True)
        df.to_csv(save_path, index=False)
        print(f"Saved to {save_path}")
    except OSError as exc:
        # Report why saving failed instead of hiding every possible error.
        print(f"An error occured while saving to {save_path}: {exc}")
    return df

"""#Function that checks for multiple tags and subreddits"""

def parse_tag_list(tag_list, sub_list, start_date, end_date, folder):
    """Fetch posts for every (subreddit, keyword) pair in the comma-separated inputs."""
    keyword_list = [keyword.strip() for keyword in tag_list.split(',')]
    subreddit_list = [name.strip() for name in sub_list.split(',')]

    # Subreddit is the outer loop: all keywords for one subreddit come first.
    combinations = [(subreddit, word) for subreddit in subreddit_list for word in keyword_list]
    for subreddit, word in combinations:
        fetch_reddit_posts(subreddit, start_date, end_date, word, folder)


"""#Function to clean data"""

def clean_data(path, save_file):
    """Drop duplicate and incomplete rows from scraped data and save the result.

    :param path: A CSV file, or a directory whose CSV files are merged first.
    :param save_file: Destination CSV path for the cleaned data.
    """
    if os.path.isdir(path):
        # Only regular *.csv files are read; other entries (sub-folders, notes,
        # checkpoints) would otherwise break or pollute the merge.
        frames = []
        for file in sorted(os.listdir(path)):
            file_path = os.path.join(path, file)
            if not (os.path.isfile(file_path) and file.lower().endswith('.csv')):
                continue
            try:
                frames.append(pd.read_csv(file_path))
            except pd.errors.EmptyDataError:
                print(f"Skipping empty file: {file}")
        # Concatenate once instead of re-copying the growing frame per file.
        df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame({})
    else:
        df = pd.read_csv(path)

    # Remove duplicate data points, then rows with any missing value
    df = df.drop_duplicates(keep='first').dropna()

    # Save df to new CSV
    df.to_csv(save_file, index=False)

"""#Call function to scrape data"""

# Start the interactive scraper/cleaner menu (it reads the user's choice via input()).
menu_selection()

In [None]:
# Text Preprocessing

# English stopwords, loaded from the NLTK corpus on first use by remove_stopwords.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text) -> str:
    """Return ``text`` with English stopwords removed (case-insensitive match).

    The stopword set is loaded once and reused: the function is applied to
    every row of a column, and rebuilding the set from the corpus on each
    call repeats the same work per row.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    return ' '.join(
        word for word in text.split() if word.lower() not in _ENGLISH_STOP_WORDS
    )

def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of ``text`` with WordNet."""
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, text.split()))

def standardize_text(text):
    """Lowercase ``text`` and strip punctuation, digits and surplus whitespace."""
    # Translation table that deletes every ASCII punctuation character.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    without_digits = re.sub(r'\d+', '', text.lower().translate(drop_punctuation))
    # split()/join collapses any run of whitespace to one space and trims the ends.
    return ' '.join(without_digits.split())

# Load CSV file into a pandas DataFrame
df = pd.read_csv("/content/drive/MyDrive/Validation sets/Marcos' Validation dataset - Sheet1.csv")

# Download the NLTK corpora used by remove_stopwords and lemmatize_text
nltk.download('stopwords')
nltk.download('wordnet')

print(df['Body'])

# Empty cells are read as NaN (a float), which contractions.fix and the string
# helpers below cannot process; treat them as empty text instead.
body_text = df['Body'].fillna('').astype(str)

# Expand contractions first (e.g. "don't" -> "do not"), while the apostrophes
# are still present.
contracted_string = body_text.apply(contractions.fix)
#print(contracted_string)

lemmatized_string = contracted_string.apply(lemmatize_text)
#print(lemmatized_string)

df['Body'] = lemmatized_string.apply(standardize_text)

no_stpwords_string = df['Body'].apply(remove_stopwords)
#print(no_stpwords_string)

# Turn the cleaned posts into TF-IDF vectors (printed as a dense array)
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(no_stpwords_string)
print(tfidf_vectors.toarray())

In [None]:
#VADER Model

# ======================
# 1. Load and Prepare Data
# ======================
# Read the labelled posts, keep body + majority vote under shorter names,
# and drop rows missing either value.
df = (
    pd.read_csv("Majority Voting 2nd round.csv")[['Body', 'Majority Vote']]
    .rename(columns={'Body': 'text', 'Majority Vote': 'label'})
    .dropna()
)

# Encode true labels as integer ids (LabelEncoder numbers the classes in
# sorted order of the label values).
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# ======================
# 2. Initialize VADER
# ======================
analyzer = SentimentIntensityAnalyzer()

def get_vader_label(text, threshold=0.8):
    """Classify ``text`` as Positive / Negative / Neutral from VADER's compound score.

    :param text: The text to score with the module-level ``analyzer``.
    :param threshold: Symmetric cutoff on the compound score. Scores at or
        above ``threshold`` are Positive, at or below ``-threshold`` Negative,
        everything in between Neutral. The default of 0.8 is a strict cutoff
        (VADER's documentation suggests +/-0.05 as the typical value), so most
        mildly polar texts are labelled Neutral.
    :return: "Positive", "Negative" or "Neutral".
    """
    score = analyzer.polarity_scores(text)["compound"]
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"

# ======================
# 3. Apply VADER to Dataset
# ======================
df['vader_pred'] = df['text'].apply(get_vader_label)

# ======================
# 4. Evaluation
# ======================
# Compare like with like: the predictions are strings ("Positive", "Negative",
# "Neutral"), so they must be scored against the string labels, not against
# the integer ids in 'label_encoded' (which can never equal a string).
# Assumes the 'Majority Vote' values use the same three names up to case and
# spacing - TODO confirm against the CSV.
true_labels = df['label'].astype(str).str.strip().str.capitalize()

accuracy = accuracy_score(true_labels, df['vader_pred'])
report = classification_report(true_labels, df['vader_pred'], zero_division=0)

print(f"✅ VADER Accuracy: {accuracy:.4f}")
print("📊 VADER Classification Report:")
print(report)

In [None]:
#Sentiment separation

# Ask user for the Excel file path
file_path = input("Enter the full path to your Excel file (e.g., /content/sentiment.xlsx): ")

# Read the Excel file
try:
    df = pd.read_excel(file_path)
except Exception as e:
    # Chain the original error so the real cause stays visible in the traceback.
    raise FileNotFoundError(f"Could not read the file: {e}") from e

# Ensure required columns exist
if not {'Sentiment', 'Body'}.issubset(df.columns):
    raise ValueError("The Excel file must have 'Sentiment' and 'Body' columns.")

# Normalize sentiment to lowercase (and trim stray spaces) so the exact-match
# filters below work. The column name must sit on one line: the original
# string literal was broken across lines, which is a syntax error.
df['Sentiment'] = df['Sentiment'].str.strip().str.lower()

# Filter by sentiment
output_columns = ['Body', 'Sentiment']
positive_df = df[df['Sentiment'] == 'positive'][output_columns]
negative_df = df[df['Sentiment'] == 'negative'][output_columns]
neutral_df  = df[df['Sentiment'] == 'neutral'][output_columns]

# Save to separate Excel files
positive_path = 'positive_sentiments.xlsx'
negative_path = 'negative_sentiments.xlsx'
neutral_path  = 'neutral_sentiments.xlsx'

positive_df.to_excel(positive_path, index=False)
negative_df.to_excel(negative_path, index=False)
neutral_df.to_excel(neutral_path, index=False)

print("\n✅ Files saved successfully:")
print(f"- {positive_path}")
print(f"- {negative_path}")
print(f"- {neutral_path}")


In [None]:
# Agreement Score Calculation

def calculate_agreement_score():
    """Prompt for two Excel files and print how often their labels agree.

    The 'Majority Vote' column of the first file is compared row by row with
    the 'Sentiment' column of the second file, ignoring case and surrounding
    spaces.

    :return: The agreement percentage (0-100), or None when a file cannot be
        read, a required column is missing, the row counts differ, or there
        are no rows to compare.
    """
    # Prompt the user to enter the file paths for each file
    majority_file = input("Enter the path of the Majority Voting result file: ")
    comparison_file = input("Enter the path of the comparison file: ")

    try:
        # Load the two Excel files
        majority_df = pd.read_excel(majority_file)
        comparison_df = pd.read_excel(comparison_file)

        # Check if the 'Majority Vote' column exists in the majority file
        if 'Majority Vote' not in majority_df.columns:
            print("Error: 'Majority Vote' column not found in the majority voting file.")
            return None

        # Check if the 'Sentiment' column exists in the comparison file
        # (the message now names the column that is actually checked)
        if 'Sentiment' not in comparison_df.columns:
            print("Error: 'Sentiment' column not found in the comparison file.")
            return None

        # Make sure both files have the same number of rows for comparison
        if len(majority_df) != len(comparison_df):
            print("Error: Files do not have the same number of rows.")
            return None

        total = len(majority_df)
        if total == 0:
            # Avoid dividing by zero on empty sheets.
            print("Error: Files contain no rows to compare.")
            return None

        # Normalize case and spacing in both columns for consistency
        majority_votes = majority_df['Majority Vote'].str.strip().str.capitalize()
        comparison_votes = comparison_df['Sentiment'].str.strip().str.capitalize()

        # Compare by row position (not by index label) and count the matches
        agreements = (majority_votes.to_numpy() == comparison_votes.to_numpy()).sum()
        agreement_score = (agreements / total) * 100

        # Output the agreement score
        print(f"\nAgreement Score: {agreement_score:.2f}%")
        return agreement_score

    except Exception as e:
        # Best-effort interactive helper: report the problem instead of crashing.
        print(f"An error occurred: {e}")
        return None

# Run the function to calculate the agreement score
# The call prompts for the two file paths via input(); the returned score is not stored.
calculate_agreement_score()

In [None]:
# Word Cloud

# Path to the scraped data: either a single CSV file or a folder of CSV files.
folder_path = "/content/drive/MyDrive/Reddit_Data/GradSchool_chatgpt_2023-1-1_2023-4-30.csv"  # Adjust this to point to your data

# The original joined "*.csv" onto this path even though it names a file, so
# the glob never matched anything. Accept both a file and a folder.
if os.path.isdir(folder_path):
    files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
elif os.path.isfile(folder_path):
    files = [folder_path]
else:
    files = []

# Set the stopwords list.
# Deliberately NOT stored in a variable called `stopwords`: that would shadow
# the `nltk.corpus.stopwords` import used by other cells.
base_stopwords = set(STOPWORDS)
new_words = [ 'https','i', 'you', 'me', 'a', 'us','thank','you','chatgpt','ve','lot','please','now','something']  # Add other words you want to exclude
new_stopwords = base_stopwords.union(new_words)

# Build the text for the word cloud from the post titles and bodies.
text = ""
if files:
    # Read the first (and presumably only) CSV file; read_csv takes one path,
    # not a list of paths.
    df = pd.read_csv(files[0])
    text_columns = [column for column in ('title', 'selftext') if column in df.columns]
    text = ' '.join(
        ' '.join(df[column].dropna().astype(str)) for column in text_columns
    )
else:
    print("No CSV files found in the specified folder.")

# Size of Word Cloud
plt.rcParams["figure.figsize"] = (10, 10)


def show_wordcloud(source_text, excluded_words, max_words, colormap):
    """Render and display one word cloud for ``source_text``."""
    cloud = WordCloud(
        max_font_size=50,
        max_words=max_words,
        background_color="white",
        stopwords=excluded_words,
        colormap=colormap,
    ).generate(source_text)

    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    return cloud


if text.strip():  # Check if text is not empty or just whitespace
    # Same two renderings as before (the original drew the first one twice).
    wordcloud = show_wordcloud(text, new_stopwords, max_words=100, colormap='Dark2')
    wordcloud = show_wordcloud(text, new_stopwords, max_words=50, colormap='flag')
else:
    print("No text available to generate word cloud.")