In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Read the input CSV into `df`, the working frame the later steps transform.
# The path is relative, so it is resolved against the kernel's current
# working directory.
file_path = 'adjusted_features_speeddating.csv'
df = pd.read_csv(filepath_or_buffer=file_path)


In [3]:
# Step 1: Dummy Feature Creation
# ------------------------------
# Expand each listed categorical column into indicator columns and rebind
# `df` to the result. With drop_first=True a column with k levels yields
# k-1 indicator columns (the first level is dropped).
# Note: the source column is consumed by the encoding, so re-running this
# cell without reloading `df` fails because 'age_group' no longer exists.
print("\nCreating Dummy Features...")
categorical_cols = ['age_group']  # categorical columns to encode
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
print("Dummy features created.")



Creating Dummy Features...
Dummy features created.


In [4]:
# Step 2: Scale Standardization
# -----------------------------
# Standardize the numeric *feature* columns of `df` in place (zero mean,
# unit variance per column, as computed by StandardScaler).
#
# 'like' and 'match' are deliberately NOT scaled: Step 3 extracts them as the
# prediction targets, and standardizing them would write z-scores instead of
# the original target values into the saved y_*.csv files (and, assuming
# 'match' is a class label -- TODO confirm -- would turn it into a continuous
# float that classifiers cannot use as a label).
print("\nApplying Standard Scaling...")
scaler = StandardScaler()

# Identify numeric feature columns to scale (targets excluded, see above)
numeric_cols = ['funny_partner', 'attractive_partner', 'guess_prob_liked', 'expected_num_matches']

# NOTE(review): the scaler is fit on every row of `df`, before Step 3 performs
# the train/test split, so test-row statistics influence the transform.
# Consider fitting on the training rows only and applying transform() to the
# test rows if a leakage-free evaluation is required.
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
print("Scaling applied to numeric features.")


Applying Standard Scaling...
Scaling applied to numeric features.


In [5]:
# Step 3: Data Splitting
# ----------------------
# Build one feature matrix and two target series, make an 80/20 hold-out
# split for each target, and write every resulting piece to its own CSV.
print("\nSplitting Data into Training and Testing Sets...")

# Features are all columns except the two targets
X = df.drop(columns=['like', 'match'])
y_like = df['like']
y_match = df['match']

# Hold-out split for the 'like' target
X_train_like, X_test_like, y_train_like, y_test_like = train_test_split(
    X, y_like, test_size=0.2, random_state=42
)

# Hold-out split for the 'match' target (same test fraction and seed)
X_train_match, X_test_match, y_train_match, y_test_match = train_test_split(
    X, y_match, test_size=0.2, random_state=42
)

# Persist each split without the index column, in the same order as before
outputs = [
    (X_train_like, 'X_train_like.csv'),
    (X_test_like, 'X_test_like.csv'),
    (y_train_like, 'y_train_like.csv'),
    (y_test_like, 'y_test_like.csv'),
    (X_train_match, 'X_train_match.csv'),
    (X_test_match, 'X_test_match.csv'),
    (y_train_match, 'y_train_match.csv'),
    (y_test_match, 'y_test_match.csv'),
]
for split_part, csv_name in outputs:
    split_part.to_csv(csv_name, index=False)

print("Data preprocessing and splitting complete. Datasets saved as CSV files.")


Splitting Data into Training and Testing Sets...
Data preprocessing and splitting complete. Datasets saved as CSV files.
