In [None]:
# Code Cell 1: Revisiting Synthetic Dataset Generation (Same as Before)
import pandas as pd
import numpy as np

# 1. Seed NumPy's global RNG so every run reproduces the same dataset
np.random.seed(42)

# 2. How many rows the synthetic dataset will contain
n_samples = 500

# 3. Draw the numerical features
price = np.random.uniform(low=10, high=100, size=n_samples)  # floats drawn from [10, 100)
popularity = np.random.randint(low=0, high=1000, size=n_samples)  # integers 0..999 (upper bound is exclusive)

# 4. Assign each sample a product category; the category later drives its review text
categories = ['electronics', 'clothing', 'books', 'home_decor']
category_options = np.random.choice(a=categories, size=n_samples)

def generate_review_text(category):
    """Build a pseudo-review by sampling keywords tied to a product category.

    Args:
        category: Category label. Recognised labels are 'electronics',
            'clothing', 'books' and 'home_decor'; any other value falls back
            to a small generic vocabulary.

    Returns:
        A single string of space-separated keywords. The word count is drawn
        from np.random.randint(10, 30), i.e. 10 to 29 words, and words are
        sampled with replacement so repeats are expected.
    """
    fallback_keywords = ['product', 'good', 'nice', 'like', 'recommend']
    keywords_by_category = {
        'electronics': ['device', 'battery', 'screen', 'performance', 'camera', 'sound', 'quality', 'fast', 'recommend', 'great'],
        'clothing': ['fabric', 'fit', 'size', 'comfortable', 'style', 'color', 'soft', 'wear', 'love', 'perfect'],
        'books': ['story', 'characters', 'plot', 'reading', 'author', 'recommend', 'enjoyed', 'interesting', 'page', 'written'],
        'home_decor': ['decor', 'design', 'style', 'room', 'color', 'beautiful', 'quality', 'look', 'home', 'recommend'],
    }
    vocabulary = keywords_by_category.get(category, fallback_keywords)

    # Draw the length first, then the words, so the global RNG is consumed in a fixed order.
    word_count = np.random.randint(10, 30)
    sampled_words = np.random.choice(vocabulary, word_count)
    return ' '.join(sampled_words)

# One keyword-sampled review per sample, generated in sample order
review_text_data = list(map(generate_review_text, category_options))

# 5. Generate synthetic ratings (numerical target - for regression or classification example)
# (mean, standard deviation) of the normal distribution used for each category
rating_distributions = {
    'electronics': (4.0, 0.8),  # Electronics tend to have slightly higher ratings
    'clothing': (3.5, 1.0),
    'books': (4.2, 0.7),        # Books often get good ratings
    'home_decor': (3.8, 0.9),
}
default_rating_distribution = (3.7, 1.0)  # Used for any category not listed above

ratings = []
for cat in category_options:
    rating_mean, rating_std = rating_distributions.get(cat, default_rating_distribution)
    ratings.append(np.random.normal(rating_mean, rating_std))

# Force ratings onto the 1-5 scale, then keep a single decimal place
ratings = np.clip(ratings, 1, 5).round(1)

# 6. Create Pandas DataFrame
data = pd.DataFrame({
    'review_text': review_text_data,
    'price': price,
    'popularity': popularity,
    'category': category_options, # Category (optional - for classification example)
    'rating': ratings # Numerical target variable (e.g., for regression or classification)
})

# 7. Display first few rows of the DataFrame
print("Sample of Synthetic Dataset:")
print(data.head())

# 8. Display data types and summary statistics
print("\nData Types and Summary Statistics:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info() # Data types
print(data.describe()) # Summary statistics for numerical columns

Sample of Synthetic Dataset:
                                         review_text      price  popularity  \
0  quality color beautiful beautiful look home de...  43.708611         501   
1  reading enjoyed plot characters written enjoye...  95.564288         958   
2  written characters enjoyed characters plot enj...  75.879455         144   
3  recommend quality performance camera device de...  63.879264         200   
4  great camera camera sound quality fast fast ba...  24.041678         928   

      category  rating  
0   home_decor     3.7  
1        books     5.0  
2        books     3.8  
3  electronics     4.9  
4  electronics     4.0  

Data Types and Summary Statistics:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   review_text  500 non-null    object 
 1   price        500 non-null    float64
 2   popularity   500 non-null    int6

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Predictor columns: numeric ones are kept as a list, the text column as a
# single column name (plain string).
numerical_features = ['price', 'popularity']
text_features = 'review_text'

# Feature matrix holds the numeric columns followed by the review text;
# the rating column is the regression target.
feature_columns = [*numerical_features, text_features]
X = data[feature_columns]
y = data['rating']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation (test_size=0.2); the fixed
# random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Per-column preprocessing: standardise the numeric columns and turn the review
# text into bag-of-words counts (English stop words removed, vocabulary capped
# at 5000 terms).
# NOTE(review): assumes text_features is a single column name (a string), so
# the vectorizer receives a 1-D column of documents - confirm where it is defined.
column_steps = [
    ('scaler', StandardScaler(), numerical_features),
    ('bow', CountVectorizer(stop_words='english', max_features=5000), text_features),
]
preprocess_bow_scaler = ColumnTransformer(transformers=column_steps)

# Chain preprocessing with a random forest regressor; the fixed random_state
# keeps the fitted forest reproducible.
pipeline_steps = [
    ('preprocess', preprocess_bow_scaler),
    ('regressor', RandomForestRegressor(random_state=42)),
]
regression_pipeline = Pipeline(steps=pipeline_steps)

# Fit preprocessing and model together on the training split only
regression_pipeline.fit(X_train, y_train)

In [None]:
# Predict ratings for the held-out test rows; the pipeline applies its own
# preprocessing step before the regressor.
y_test_predicted = regression_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error between true and predicted test ratings (lower is better);
# left as the cell's last expression so the notebook displays the value.
mean_squared_error(y_test, y_test_predicted)

0.7653603899999996