## Data Preparation & Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords

In [2]:
# Location of the cleaned Google reviews export that the notebook works on
reviews_csv_path = "/content/google_reviews_cleaned.csv"
df = pd.read_csv(reviews_csv_path)

# Show the first rows as a quick sanity check of the loaded columns
df.head()

Unnamed: 0,Name,starRating,Komment,reviews.createTime,review_clean,sentiment,rating_num,YearMonth,text_len,word_count
0,Karl Frühauf,FIVE,Ich bin mit dem Service sehr zufrieden\n\n(Tra...,2025-06-27 07:34:26.963,ich bin mit dem service sehr zufrieden\n\ntran...,pos,5,2025-06,97,17
1,Judith Baum,FIVE,"gutes Preis - Leistungsverhältnis, der Kundend...",2025-06-24 11:21:21.697,gutes preis leistungsverhltnis der kundendien...,pos,5,2025-06,170,23
2,Matta Botros,FIVE,"preise ok,kunden freundlich,kommen immer entge...",2025-06-16 07:24:37.607,preise okkunden freundlichkommen immer entgege...,pos,5,2025-06,128,16
3,Iris Schneider,THREE,Ab und zu ist der Kundendienst nicht kompetent...,2025-06-09 06:12:36.403,ab und zu ist der kundendienst nicht kompetent...,neu,3,2025-06,705,116
4,Ralf Mader,FIVE,Gutes Transparentes Unternehmen\n\n(Translated...,2025-06-02 11:06:18.280,gutes transparentes unternehmen\n\ntranslated ...,pos,5,2025-06,78,9


In [3]:
def _avg_word_len(text):
    """Return the mean length of the whitespace-separated tokens in ``text``.

    Returns 0.0 when the string contains no tokens, and NaN when ``text`` is
    not a string (e.g. a missing review), so the value can be imputed later
    instead of raising inside ``apply``.
    """
    if not isinstance(text, str):
        return np.nan
    words = text.split()  # split once and reuse the result
    if not words:
        return 0.0
    return float(np.mean([len(w) for w in words]))


# Average word length per review (computed on the cleaned text)
df["avg_word_len"] = df["review_clean"].apply(_avg_word_len)

# Punctuation counts are taken from the raw review text ("Komment").
# NOTE(review): judging by the preview, "review_clean" has punctuation removed,
# so counting "!" / "?" there would give all-zero features -- confirm against
# the cleaning step.
df["num_exclaims"] = df["Komment"].str.count("!")

# str.count takes a regex; the raw string keeps "\?" a literal question mark
# without triggering Python's invalid-escape-sequence warning.
df["num_questions"] = df["Komment"].str.count(r"\?")

  df["num_questions"] = df["review_clean"].str.count("\?")


In [4]:
# Impute gaps in the numeric text features with each column's own median
numeric_cols = ["text_len", "word_count", "avg_word_len", "num_exclaims", "num_questions"]
column_medians = df[numeric_cols].median()  # one median per listed column
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

In [5]:
# Integer class codes for the models: neg -> 0, neu -> 1, pos -> 2
sentiment_mapping = dict(zip(("neg", "neu", "pos"), range(3)))
# Series.map leaves NaN for any sentiment value that is not a key of the mapping
df["sentiment_label"] = df["sentiment"].map(sentiment_mapping)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available (downloaded silently)
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words("german")) | set(stopwords.words("english"))

# Domain-specific words to ignore in addition to the NLTK lists
custom_stopwords = {
    "translated", "google", "maxenergy", "energie", "durchblicker", "wien",
    "the", "a", "to", "and", "by", "is", "with", "of", "for", "im", "ich", "uns", "wir", "leider", "unfortunately", "max energy", "und"
}

# The vectorizer removes stop words token by token, before n-grams are built,
# so a multi-word entry such as "max energy" can never match on its own.
# Add the individual tokens of every entry so such phrases are filtered too.
custom_stopword_tokens = {token for phrase in custom_stopwords for token in phrase.split()}
stop_words = stop_words | custom_stopwords | custom_stopword_tokens

tfidf = TfidfVectorizer(
    max_features=5000,  # keep at most 5000 vocabulary terms
    ngram_range=(1,2),  # unigrams + bigrams
    stop_words=sorted(stop_words)  # sorted list -> deterministic parameter value
)

# NOTE: this fits the vectorizer on the full corpus, which is fine for a quick
# shape check; a vectorizer used for modelling should be fitted on training
# rows only so held-out text does not influence the vocabulary or IDF weights.
X_text = tfidf.fit_transform(df["review_clean"])
print("TF-IDF shape:", X_text.shape)



TF-IDF shape: (2473, 5000)


In [8]:
from sklearn.model_selection import train_test_split

# Tabular feature columns, defined once and reused for every split
tab_feature_cols = ["text_len", "word_count", "avg_word_len", "num_exclaims", "num_questions"]

# 1. Hold out 15% of the rows as the test set
df_trainval, df_test = train_test_split(df, test_size=0.15, random_state=42)

# 2. Carve 15% of the remaining rows off as the validation set
df_train, df_val = train_test_split(df_trainval, test_size=0.15, random_state=42)

# The three splits must partition the original frame
assert len(df_train) + len(df_val) + len(df_test) == len(df)

# 3. Extract features and labels for each split
# Training set -- the TF-IDF vectorizer is (re)fitted here on the training
# text only, so validation/test reviews do not leak into the vocabulary or
# the IDF weights.
X_tab_train = df_train[tab_feature_cols].values
y_train = df_train["sentiment_label"].values
X_text_train = tfidf.fit_transform(df_train["review_clean"])

# Validation set -- transformed with the train-fitted vectorizer
X_tab_val = df_val[tab_feature_cols].values
y_val = df_val["sentiment_label"].values
X_text_val = tfidf.transform(df_val["review_clean"])

# Test set -- transformed with the train-fitted vectorizer
X_tab_test = df_test[tab_feature_cols].values
y_test = df_test["sentiment_label"].values
X_text_test = tfidf.transform(df_test["review_clean"])


print("Train shape (tabular):", X_tab_train.shape)
print("Validation shape (tabular):", X_tab_val.shape)
print("Test shape (tabular):", X_tab_test.shape)

print("Train shape (text):", X_text_train.shape)
print("Validation shape (text):", X_text_val.shape)
print("Test shape (text):", X_text_test.shape)

Train shape (tabular): (1786, 5)
Validation shape (tabular): (316, 5)
Test shape (tabular): (371, 5)
Train shape (text): (1786, 5000)
Validation shape (text): (316, 5000)
Test shape (text): (371, 5000)


Дані були коректно розділені на навчальну, валідаційну та тестову вибірки.

Табличні ознаки містять 5 числових характеристик тексту, що описують його довжину та пунктуацію.

Текстові ознаки представлені за допомогою TF-IDF векторизації з розмірністю 5000 ознак.

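Однакова кількість ознак у всіх вибірках є необхідною умовою для того, щоб модель, навчену на тренувальних даних, можна було застосувати до валідаційних і тестових даних; сама по собі вона ще не гарантує коректності оцінки.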

Отримані ознаки готові до використання в моделях машинного навчання для класифікації сентименту.

In [26]:
# Save the prepared splits to disk
import pickle
from scipy.sparse import save_npz
import os

# Output directory. Set the ML_OUTPUT_DIR environment variable to write
# somewhere else; the default keeps the original location.
output_dir = os.environ.get("ML_OUTPUT_DIR", "/Users/mariiakostenko/Downloads/ML")
os.makedirs(output_dir, exist_ok=True)

# Tabular features and labels are pickled as "<name>.pkl"
pickled_arrays = {
    "X_tab_train": X_tab_train,
    "X_tab_val": X_tab_val,
    "X_tab_test": X_tab_test,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
}
for name, array in pickled_arrays.items():
    with open(os.path.join(output_dir, f"{name}.pkl"), "wb") as f:
        pickle.dump(array, f)

# Text features (sparse matrices) are stored as "<name>.npz"
sparse_matrices = {
    "X_text_train": X_text_train,
    "X_text_val": X_text_val,
    "X_text_test": X_text_test,
}
for name, matrix in sparse_matrices.items():
    save_npz(os.path.join(output_dir, f"{name}.npz"), matrix)

print("Data splits saved successfully!")

Data splits saved successfully!


In [9]:
# Persist the held-out test DataFrame to disk (the row index is not written)
test_csv_path = "df_test.csv"
df_test.to_csv(test_csv_path, index=False)
print("df_test збережено у файл 'df_test.csv'")

df_test збережено у файл 'df_test.csv'
