In [1]:
import pandas as pd
import numpy as np

In [2]:
import numpy as np
import pandas as pd

np.random.seed(42)

n = 1200

# Columns
age = np.random.randint(18, 61, size=n)
gender = np.random.choice(["Male", "Female"], size=n, p=[0.55, 0.45])
education = np.random.choice(["school", "ug", "pg"], size=n, p=[0.35, 0.45, 0.20])

# Reviews ONLY like: good / poor / average / excellent / bad
review_levels = np.array(["excellent", "good", "average", "poor", "bad"])
review_probs  = np.array([0.18, 0.32, 0.20, 0.18, 0.12])
review = np.random.choice(review_levels, size=n, p=review_probs)

# Create a learnable target (purchased) using a simple probability model
review_score = pd.Series(review).map({
    "bad": -1.0, "poor": -0.6, "average": 0.0, "good": 0.6, "excellent": 1.0
}).to_numpy()

edu_score = pd.Series(education).map({"school": -0.2, "ug": 0.0, "pg": 0.2}).to_numpy()
gender_score = (pd.Series(gender) == "Female").astype(int).to_numpy() * 0.05
age_score = (age - 35) / 20.0  # roughly centered

# Logistic-like probability (so Logistic Regression can learn it)
logit = -0.3 + 1.4*review_score + 0.7*edu_score + 0.25*age_score + gender_score
prob = 1 / (1 + np.exp(-logit))
purchased = (np.random.rand(n) < prob).astype(int)

df = pd.DataFrame({
    "age": age,
    "gender": gender,
    "review": review,
    "education": education,
    "purchased": purchased
})

df.head()


Unnamed: 0,age,gender,review,education,purchased
0,56,Female,poor,school,1
1,46,Male,good,ug,1
2,32,Female,average,pg,0
3,60,Female,excellent,pg,1
4,25,Male,excellent,school,1


In [3]:
df.shape

(1200, 5)

In [5]:
df = df.iloc[:, 2:]

In [6]:
df


Unnamed: 0,review,education,purchased
0,poor,school,1
1,good,ug,1
2,average,pg,0
3,excellent,pg,1
4,excellent,school,1
...,...,...,...
1195,average,school,1
1196,excellent,school,1
1197,excellent,pg,1
1198,poor,ug,1


In [8]:
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .25, random_state = 41, stratify = y)

In [12]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories= [[ 'bad', 'poor', 'average', 'good', 'excellent'], [["school", "ug", "pg"]]])




In [15]:
from sklearn.preprocessing import OrdinalEncoder
# Re-initialize enc with corrected categories for education
enc = OrdinalEncoder(categories=[['bad', 'poor', 'average', 'good', 'excellent'], ['school', 'ug', 'pg']])
x_train = enc.fit_transform(x_train)

In [16]:
x_test = enc.transform(x_test)

In [17]:
x_train

array([[1., 2.],
       [3., 1.],
       [3., 0.],
       ...,
       [1., 1.],
       [3., 2.],
       [2., 1.]])

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [20]:
y_train

array([1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,