In [1]:
import pandas as pd
import pickle
import os.path
import numpy as np
import string
import requests
import math
import time
import torch
import ast
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt

import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, EarlyStoppingCallback
import torch.nn.functional as F
import warnings
import simplemma
from setfit import AbsaModel, AbsaTrainer, TrainingArguments
from datasets import load_dataset
from nltk.stem import WordNetLemmatizer


transformers.logging.set_verbosity_error()

from transformers import AlbertTokenizer, AlbertForSequenceClassification

In [None]:
# Load the raw board-game comments export.
# The data directory can be overridden with the BOARDGAMES_DATA_DIR environment
# variable; when it is unset, the original local path is used so existing
# setups keep working unchanged.
file_path = os.path.join(
    os.environ.get('BOARDGAMES_DATA_DIR', '/Users/negarakhgar/Desktop/nlp project/data'),
    'boardgames_comments.csv',
)
df_comments = pd.read_csv(file_path)

In [3]:
import re
# Keep only rows that have a comment ('value') longer than 10 characters:
# first discard missing comments, then apply the length threshold.
df_comments = (
    df_comments
    .dropna(subset=['value'])
    .loc[lambda frame: frame['value'].str.len() > 10]
)

# Cleaning the comments column to remove URLs and non-text content
def clean_text(text):
    """Strip URL-like tokens and square-bracket tags from a comment.

    Steps, in order:
      1. Remove tokens starting with ``http``, ``www`` or ``img``. The match
         stops at whitespace *or* a square bracket, so a URL embedded in a
         bracketed tag (``[url=http://…]text[/url]``, ``[img]…[/img]``) does
         not swallow the tag delimiters or the text attached to them.
      2. Remove every ``[...]`` tag (non-greedy, single line).
      3. Collapse runs of whitespace into a single space.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned comment with leading/trailing whitespace removed.
    """
    # Stop at '[' / ']' so the tag pass below still sees balanced brackets;
    # otherwise a dangling '[' makes it delete unrelated text up to the next ']'.
    text = re.sub(r"(?:http|www|img)[^\s\[\]]+", '', text)
    # Drop bracketed markup such as [IMG], [/IMG], [b], [url=...]
    text = re.sub(r"\[.*?\]", '', text)
    # Normalise all whitespace (including newlines) to single spaces
    text = re.sub(r"\s+", ' ', text)
    return text.strip()

# Derive the cleaned comment text and coerce ratings to numbers in one step;
# ratings that cannot be parsed become NaN (errors='coerce').
df_comments = df_comments.assign(
    cleaned_comment=df_comments['value'].apply(clean_text),
    rating=pd.to_numeric(df_comments['rating'], errors='coerce'),
)

In [4]:
# Show the frame after cleaning (now includes the 'cleaned_comment' column).
df_comments

Unnamed: 0.1,Unnamed: 0,username,rating,value,boardgame_id,cleaned_comment
0,0,1 Family Meeple,,SLEEVED[IMG]https://cf.geekdo-static.com/mbs/m...,224517,SLEEVED Deluxe Edition with Clay Coins
2,2,1bez,10.0,"Great game, full controllo of your strategy th...",224517,"Great game, full controllo of your strategy th..."
3,3,1x0r,,Location: MSK,224517,Location: MSK
4,4,2bit,7.5,"Very clever game, enjoyable overall. Plus poi...",224517,"Very clever game, enjoyable overall. Plus poin..."
5,5,2d20,9.0,Brilliant! Fits right into my wheelhouse all ...,224517,Brilliant! Fits right into my wheelhouse all a...
...,...,...,...,...,...,...
69018,4965,Zvonmirus,7.5,Only played the beginning scenarios with my el...,291457,Only played the beginning scenarios with my el...
69019,4966,Zygomax,,BGS Prize Nov. 2021,291457,BGS Prize Nov. 2021
69020,4967,_Kenneth,9.5,Cooperative Legacy (2P-3P) ✓ Completion,291457,Cooperative Legacy (2P-3P) ✓ Completion
69021,4968,_LSK_,5.0,Too hard under the rules in the game and a bit...,291457,Too hard under the rules in the game and a bit...


In [5]:
from langdetect import detect, DetectorFactory

# Fix the detector factory's seed to a constant.
# NOTE(review): presumably so language detection gives the same result on
# every run — confirm against the langdetect documentation.
DetectorFactory.seed = 0

def is_english(comment):
    """Report whether ``detect`` classifies ``comment`` as English.

    Returns True only when the detected language code is ``'en'``. Any
    exception raised during detection is swallowed and reported as False,
    so a single undetectable row cannot abort a bulk ``apply``.
    """
    try:
        language_code = detect(comment)
    except Exception:
        # Best-effort: input the detector cannot handle counts as non-English
        return False
    return language_code == 'en'

In [6]:
# Flag every cleaned comment with the language detector's verdict
df_comments['is_english'] = df_comments['cleaned_comment'].apply(is_english)

# Retain the English rows only; the helper flag is not needed in the result
df_english_comments = (
    df_comments[df_comments['is_english']]
    .drop(columns=['is_english'])
)

print(df_english_comments.head())

   Unnamed: 0         username  rating  \
0           0  1 Family Meeple     NaN   
2           2             1bez    10.0   
3           3             1x0r     NaN   
4           4             2bit     7.5   
5           5             2d20     9.0   

                                               value  boardgame_id  \
0  SLEEVED[IMG]https://cf.geekdo-static.com/mbs/m...        224517   
2  Great game, full controllo of your strategy th...        224517   
3                                      Location: MSK        224517   
4  Very clever game, enjoyable overall.  Plus poi...        224517   
5  Brilliant!  Fits right into my wheelhouse all ...        224517   

                                     cleaned_comment  
0             SLEEVED Deluxe Edition with Clay Coins  
2  Great game, full controllo of your strategy th...  
3                                      Location: MSK  
4  Very clever game, enjoyable overall. Plus poin...  
5  Brilliant! Fits right into my wheelhouse all a..

In [None]:
# Save the filtered English comments to a new CSV file.
# The output directory can be overridden with the BOARDGAMES_DATA_DIR
# environment variable; when it is unset, the original local path is used so
# existing setups keep working unchanged.
english_comments_file = os.path.join(
    os.environ.get('BOARDGAMES_DATA_DIR', '/Users/negarakhgar/Desktop/nlp project/data'),
    'english_boardgames_comments.csv',
)
# index=False: do not write the DataFrame index as an extra column
df_english_comments.to_csv(english_comments_file, index=False)

print(f"English comments extracted and saved to {english_comments_file}")