In [8]:
import os
import re

import contractions
import pandas as pd
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from tqdm import tqdm



In [2]:
# Read the preprocessed reviews and drop the "Unnamed: 0" and "nr" columns
# in one chained expression, then preview the first rows.
data = pd.read_csv("DANE/df_preprocessed_v2.csv").drop(columns=["Unnamed: 0", "nr"])
data.head()

Unnamed: 0,text,label,target
0,Talkin' Tacos & Bowls is my go-to spot for tas...,CG,1.0
1,I recently dined at this restaurant and was im...,CG,1.0
2,The visit to this charming vineyard was a deli...,CG,1.0
3,"I recently visited Berry Sweet, which opened i...",CG,1.0
4,I stumbled upon this gem of a restaurant and w...,CG,1.0


In [4]:
# Unique chars: concatenate every review into one string, split it into
# single characters and keep the distinct ones in order of first appearance.
unique_chars = pd.Series(list("".join(data["text"]))).unique()
print("Number of unique chars:", len(unique_chars))
print(unique_chars)

Number of unique chars: 123
['T' 'a' 'l' 'k' 'i' 'n' "'" ' ' 'c' 'o' 's' '&' 'B' 'w' 'm' 'y' 'g' '-'
 't' 'p' 'f' 'r' 'b' 'e' '!' 'h' ',' '.' 'C' 'z' 'v' 'd' '4' 'S' 'D' 'P'
 'I' 'u' 'M' 'H' 'q' 'j' 'O' 'x' 'A' '2' '1' '0' '\n' 'W' 'N' 'E' '$' 'J'
 'G' '/' 'F' 'L' '7' '5' '3' '6' '–' '?' 'R' 'K' 'Y' 'U' 'Z' 'V' 'Q' '"'
 '¡' ';' '(' ')' '9' ':' '8' 'X' 'é' 'á' 'í' 'ñ' 'ú' 'ó' '—' '#' '%' '™'
 'è' 'û' '*' 'ì' '’' '+' '🥪' '🥤' '🤤' 'â' '@' 'ł' 'ą' 'à' 'ü' '‘' 'ç' 'ê'
 '=' '_' '~' 'ä' 'ß' '|' '\xa0' '\r' 'ś' 'ù' ']' '\u200b' 'É' '`' '×']


# NER removal

In [23]:
# Load the spaCy pipeline "en_core_web_sm" once; `nlp` is the module-level
# model that remove_ner() calls for every text.
nlp = spacy.load("en_core_web_sm")



In [29]:
# Show the entities spaCy detects in one sample review (row label 3).
sentence = data.loc[3, "text"]
print(sentence)
doc = nlp(sentence)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

I recently visited Berry Sweet, which opened its doors on August 21, 2010. The first thing that caught my eye was the bright and clean interior, albeit reminiscent of a Phileo knock off. The renovated space had a modern feel, but I couldn't help but notice the limited toppings selection, which left much to be desired. 

Despite this drawback, I plan on giving Berry Sweet another chance during my next visit. The convenience of its location near my apartment is a major plus, and I'm hoping that the overall experience will improve. While the toppings selection may have been lacking, the potential for a better experience is there. 

Overall, Berry Sweet has some room for improvement, but I see potential in the place. I'm willing to give it another shot and hope for a more satisfying frozen yogurt experience next time.
Berry Sweet PERSON
August 21, 2010 DATE
first ORDINAL
Phileo PRODUCT
Berry Sweet PERSON
Berry Sweet PERSON


In [30]:
def remove_ner(text, nlp_pipeline=None):
    """
    Remove every named-entity span detected in ``text``.

    Entities are cut out by their character offsets, so only the exact spans
    the model tagged are removed. (A plain ``str.replace`` on the entity text
    would also delete the same characters inside unrelated words, e.g. the
    entity "one" would turn "someone" into "some".)

    :param text: Input string to strip entities from
    :param nlp_pipeline: Optional callable returning an object with ``.ents``
        (each entity exposing ``start_char`` / ``end_char``); defaults to the
        notebook-level ``nlp`` model
    :return: ``text`` with all detected entity spans removed
    """
    pipeline = nlp if nlp_pipeline is None else nlp_pipeline
    doc = pipeline(text)

    kept_parts = []
    cursor = 0
    # Walk the entities left to right, keeping only the text between them.
    for ent in sorted(doc.ents, key=lambda e: e.start_char):
        kept_parts.append(text[cursor:ent.start_char])
        cursor = ent.end_char
    kept_parts.append(text[cursor:])

    return "".join(kept_parts)

In [33]:
# Before/after comparison of entity removal on one sample review.
sentence = data.loc[3, "text"]
print(f"{sentence}\n{'=' * 50}")
print(remove_ner(sentence))

I recently visited Berry Sweet, which opened its doors on August 21, 2010. The first thing that caught my eye was the bright and clean interior, albeit reminiscent of a Phileo knock off. The renovated space had a modern feel, but I couldn't help but notice the limited toppings selection, which left much to be desired. 

Despite this drawback, I plan on giving Berry Sweet another chance during my next visit. The convenience of its location near my apartment is a major plus, and I'm hoping that the overall experience will improve. While the toppings selection may have been lacking, the potential for a better experience is there. 

Overall, Berry Sweet has some room for improvement, but I see potential in the place. I'm willing to give it another shot and hope for a more satisfying frozen yogurt experience next time.
I recently visited , which opened its doors on . The  thing that caught my eye was the bright and clean interior, albeit reminiscent of a  knock off. The renovated space had 

In [34]:
# Apply entity removal to the 'text' column of the dataframe, with a progress bar.
# `tqdm` is imported with the other dependencies at the top of the notebook.
tqdm.pandas()  # registers `progress_apply` on pandas objects
data['text_without_ner'] = data['text'].progress_apply(remove_ner)

100%|██████████| 19811/19811 [10:14<00:00, 32.22it/s]


# Basic cleaning

In [35]:
def clean_text(text: str) -> str:
    """
    Normalise a review into lowercase words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. map typographic apostrophes to ``'`` so contractions such as
         "don’t" are still recognised in step 5;
      3. turn whitespace characters (newlines, ``\\r``, non-breaking spaces)
         and en/em dashes into plain spaces, so the words they separate are
         not glued together when they are stripped;
      4. drop every character except ``a-z``, ``?``, ``!``, ``'`` and space;
      5. expand contractions word by word;
      6. remove possessive ``'s`` and collapse repeated whitespace.

    Note that ``?``, ``!`` and remaining apostrophes are kept in the output.

    :param text: An input string to be cleaned
    :return: Cleaned string
    """

    text = text.lower()

    # Curly quotes would be deleted by the character filter below, turning
    # "don’t" into "dont" before contractions.fix can expand it.
    text = re.sub("[\u2018\u2019]", "'", text)

    # Word separators that the filter would otherwise delete outright.
    text = re.sub("[\\s\u2013\u2014]", " ", text)

    undesired_chars = r"[^a-z\?\!\'\ ]"
    text = re.sub(undesired_chars, "", text)

    text = " ".join(contractions.fix(word) for word in text.split())

    # Order matters: strip possessives first, then tidy up the spacing.
    replacements = {
        r"'s\b": "",
        r"\s+": " ",
    }

    for pattern, substitute in replacements.items():
        text = re.sub(pattern, substitute, text)

    return text.strip()

In [20]:
# Run the cleaner on one sample review (row label 3) and display the result.
initial_sentence = data.loc[3, "text"]
print(initial_sentence)
example_sentence = clean_text(initial_sentence)
example_sentence

I recently visited Berry Sweet, which opened its doors on August 21, 2010. The first thing that caught my eye was the bright and clean interior, albeit reminiscent of a Phileo knock off. The renovated space had a modern feel, but I couldn't help but notice the limited toppings selection, which left much to be desired. 

Despite this drawback, I plan on giving Berry Sweet another chance during my next visit. The convenience of its location near my apartment is a major plus, and I'm hoping that the overall experience will improve. While the toppings selection may have been lacking, the potential for a better experience is there. 

Overall, Berry Sweet has some room for improvement, but I see potential in the place. I'm willing to give it another shot and hope for a more satisfying frozen yogurt experience next time.


'i recently visited berry sweet which opened its doors on august the first thing that caught my eye was the bright and clean interior albeit reminiscent of a phileo knock off the renovated space had a modern feel but i could not help but notice the limited toppings selection which left much to be desired despite this drawback i plan on giving berry sweet another chance during my next visit the convenience of its location near my apartment is a major plus and i am hoping that the overall experience will improve while the toppings selection may have been lacking the potential for a better experience is there overall berry sweet has some room for improvement but i see potential in the place i am willing to give it another shot and hope for a more satisfying frozen yogurt experience next time'

In [36]:
# Clean the entity-stripped text. `progress_apply` is already available:
# tqdm.pandas() was registered in the cell that built 'text_without_ner',
# so the import and registration are not repeated here.
data['cleaned_text'] = data['text_without_ner'].progress_apply(clean_text)

  0%|          | 0/19811 [00:00<?, ?it/s]

100%|██████████| 19811/19811 [00:11<00:00, 1743.86it/s]


In [37]:
# to_csv does not create missing directories; make sure the target exists so
# the long preprocessing run above is not lost to a failed write.
os.makedirs("DANE/PREPROCESSED", exist_ok=True)
data.to_csv("DANE/PREPROCESSED/yelp_multiple.csv")

# Applying the pipeline to the remaining datasets

In [38]:
# NOTE(review): this file is read from the working directory, not from DANE/,
# and rebinds `data` — confirm that is intended.
data = pd.read_csv("df_preprocessed_v1.csv")

# CSV datasets stored under DANE/.
amazon_single = pd.read_csv("DANE/reviews_generated1_cut.csv")
yelp_single = pd.read_csv("DANE/reviews_generated2_cut.csv")
amazon_ext = pd.read_csv("DANE/fake-reviews-dataset.csv")
yelp_ext = pd.read_csv("DANE/combat-ai-restaurants-test.csv")
general_gpt3 = pd.read_csv("DANE/reviews_generated4_cut.csv")
general_gpt4 = pd.read_csv("DANE/reviews_generated5_cut.csv")
llama3 = pd.read_csv("DANE/reviews_generated6_cut.csv")

# The anonymized restaurant reviews are stored as an Excel workbook.
human = pd.read_excel("DANE/restaurant_reviews_anonymized.xlsx")

  warn(msg)


In [39]:
# Drop the "Unnamed: 0" column (presumably a saved index) from each frame.
# The drop must stay in place: rebinding the loop variable would not change
# the frames the outer names refer to.
for df in (amazon_single, yelp_single, general_gpt3, general_gpt4, llama3):
    df.drop(columns="Unnamed: 0", inplace=True)

In [40]:
# Harmonise the frames so each one exposes a 'text' and a 'target' column.
amazon_single = amazon_single.rename(columns={"text_": "text"})
amazon_ext = amazon_ext.rename(columns={"text_": "text"})
amazon_ext['target'] = amazon_ext['label'].map({'CG': 1, 'OR': 0})
yelp_ext = yelp_ext.rename(columns={"label": "target"})

# .copy() makes `human` an independent frame, so the column assignments below
# (and in the processing loop) never write into a slice of the original.
human = human.loc[:, ['Review', 'Real']].copy()
human.columns = ["text", "target"]
# 'Real' is inverted to obtain 'target' (1 becomes 0 and vice versa).
human['target'] = 1 - human['target']

# Keep only reviews with at least 20 characters. The vectorised .str.len()
# replaces a row-wise apply; missing texts compare as False and are dropped
# instead of raising. .copy() avoids SettingWithCopyWarning when columns are
# added to this filtered frame later.
amazon_single = amazon_single[amazon_single["text"].str.len() >= 20].copy()

In [41]:
# Name -> dataframe registry of the datasets ("zbiory") to preprocess;
# the name is reused as the output file name.
zbiory = {
    "amazon_single": amazon_single,
    "yelp_single": yelp_single,
    "amazon_ext": amazon_ext,
    "yelp_ext": yelp_ext,
    "general_gpt3": general_gpt3,
    "general_gpt4": general_gpt4,
    "human": human,
    "llama3": llama3,
}

In [43]:
# to_csv does not create missing directories; create the target once up front
# so a long NER pass is not lost to a failed write.
os.makedirs("DANE/PREPROCESSED", exist_ok=True)

# Run the same two-step preprocessing on every dataset and save each result.
for name, df in zbiory.items():
    print("=" * 50)  # separator line made of 50 "=" characters
    print(f"PROCESSING DATASET {name}...")
    df['text_without_ner'] = df['text'].apply(remove_ner)
    df['cleaned_text'] = df['text_without_ner'].apply(clean_text)
    print(f"DATASET {name} CLEANED")
    df.to_csv(f"DANE/PREPROCESSED/{name}.csv")

PROCESSING DATASET amazon_single...
DATASET amazon_single CLEANED
PROCESSING DATASET yelp_single...
DATASET yelp_single CLEANED
PROCESSING DATASET amazon_ext...
DATASET amazon_ext CLEANED
PROCESSING DATASET yelp_ext...
DATASET yelp_ext CLEANED
PROCESSING DATASET general_gpt3...
DATASET general_gpt3 CLEANED
PROCESSING DATASET general_gpt4...
DATASET general_gpt4 CLEANED
PROCESSING DATASET human...
DATASET human CLEANED
PROCESSING DATASET llama3...
DATASET llama3 CLEANED
