<a href="https://colab.research.google.com/github/mishraparth/3d-globe/blob/main/MY_ML_Notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
import joblib

In [60]:
data = {
    'age': [25, 32, 47, 19, 50],      # Numeric Feature
    'salary': [50000, 60000, 75000, 30000, 90000],  # Numeric Feature
    'city': ['New York', 'Paris', 'London', 'New York', 'Paris'],  # Categorical (Ordinal Encoding)
    'job_role': ['Engineer', 'Doctor', 'Artist', 'Engineer', 'Doctor'],  # Categorical (OneHot Encoding)
    'target_price': [250000, 300000, 500000, 180000, 600000]  # Target Variable
}
df = pd.DataFrame(data)


In [61]:
df

Unnamed: 0,age,salary,city,job_role,target_price
0,25,50000,New York,Engineer,250000
1,32,60000,Paris,Doctor,300000
2,47,75000,London,Artist,500000
3,19,30000,New York,Engineer,180000
4,50,90000,Paris,Doctor,600000


In [62]:
# 2️⃣ 🔍 **Separate Features & Target Variable**
X = df.drop(columns=['target_price'])  # Features
y = df['target_price']  # Target Variable


In [63]:
X

Unnamed: 0,age,salary,city,job_role
0,25,50000,New York,Engineer
1,32,60000,Paris,Doctor
2,47,75000,London,Artist
3,19,30000,New York,Engineer
4,50,90000,Paris,Doctor


In [64]:
y

Unnamed: 0,target_price
0,250000
1,300000
2,500000
3,180000
4,600000


# 3️⃣ 🎯 Apply Encoding on Categorical Features

In [65]:
# 🔹 Ordinal Encoding for 'city'
ordinal_encoder = OrdinalEncoder(categories=[['New York', 'London', 'Paris']])  # Define custom order
X['city_encoded'] = ordinal_encoder.fit_transform(X[['city']])

In [66]:
X

Unnamed: 0,age,salary,city,job_role,city_encoded
0,25,50000,New York,Engineer,0.0
1,32,60000,Paris,Doctor,2.0
2,47,75000,London,Artist,1.0
3,19,30000,New York,Engineer,0.0
4,50,90000,Paris,Doctor,2.0


In [67]:
# 🔹 OneHot Encoding for 'job_role'
onehot_encoder = OneHotEncoder()
job_role_encoded = onehot_encoder.fit_transform(X[['job_role']])
job_role_df = pd.DataFrame(job_role_encoded.toarray(), columns=onehot_encoder.get_feature_names_out(['job_role']))

In [68]:
X

Unnamed: 0,age,salary,city,job_role,city_encoded
0,25,50000,New York,Engineer,0.0
1,32,60000,Paris,Doctor,2.0
2,47,75000,London,Artist,1.0
3,19,30000,New York,Engineer,0.0
4,50,90000,Paris,Doctor,2.0


In [69]:
job_role_df

Unnamed: 0,job_role_Artist,job_role_Doctor,job_role_Engineer
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [70]:
# 🔹 Merge Encoded Data & Drop Original Categorical Columns
# Ensure OneHot Encoding is done before dropping columns
X = pd.concat([X, job_role_df], axis=1)
X = X.drop(columns=['city', 'job_role'])

In [71]:
X

Unnamed: 0,age,salary,city_encoded,job_role_Artist,job_role_Doctor,job_role_Engineer
0,25,50000,0.0,0.0,0.0,1.0
1,32,60000,2.0,0.0,1.0,0.0
2,47,75000,1.0,1.0,0.0,0.0
3,19,30000,0.0,0.0,0.0,1.0
4,50,90000,2.0,0.0,1.0,0.0


In [72]:
# 4️⃣ ✨ **Apply Standardization & Normalization**
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

In [73]:
X

Unnamed: 0,age,salary,city_encoded,job_role_Artist,job_role_Doctor,job_role_Engineer
0,25,50000,0.0,0.0,0.0,1.0
1,32,60000,2.0,0.0,1.0,0.0
2,47,75000,1.0,1.0,0.0,0.0
3,19,30000,0.0,0.0,0.0,1.0
4,50,90000,2.0,0.0,1.0,0.0


In [74]:
y

Unnamed: 0,target_price
0,250000
1,300000
2,500000
3,180000
4,600000


In [75]:
X[['age', 'salary']] = scaler_standard.fit_transform(X[['age', 'salary']])  # Standardization
X[['city_encoded']] = scaler_minmax.fit_transform(X[['city_encoded']])  # Normalization

In [76]:
X

Unnamed: 0,age,salary,city_encoded,job_role_Artist,job_role_Doctor,job_role_Engineer
0,-0.792766,-0.534207,0.0,0.0,0.0,1.0
1,-0.214707,-0.048564,1.0,0.0,1.0,0.0
2,1.023989,0.6799,0.5,1.0,0.0,0.0
3,-1.288245,-1.505493,0.0,0.0,0.0,1.0
4,1.271729,1.408365,1.0,0.0,1.0,0.0


In [77]:
# 5️⃣ 🤖 **Train the Machine Learning Model**
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor()
model.fit(X_train, y_train)

# 6️⃣ 💾 **Save Model & Preprocessors**


In [78]:
joblib.dump(model, "trained_model.pkl")
joblib.dump(ordinal_encoder, "ordinal_encoder.pkl")
joblib.dump(onehot_encoder, "onehot_encoder.pkl")
joblib.dump(scaler_standard, "scaler_standard.pkl")
joblib.dump(scaler_minmax, "scaler_minmax.pkl")

['scaler_minmax.pkl']

In [79]:
X_test

Unnamed: 0,age,salary,city_encoded,job_role_Artist,job_role_Doctor,job_role_Engineer
1,-0.214707,-0.048564,1.0,0.0,1.0,0.0


In [80]:
X_train

Unnamed: 0,age,salary,city_encoded,job_role_Artist,job_role_Doctor,job_role_Engineer
4,1.271729,1.408365,1.0,0.0,1.0,0.0
2,1.023989,0.6799,0.5,1.0,0.0,0.0
0,-0.792766,-0.534207,0.0,0.0,0.0,1.0
3,-1.288245,-1.505493,0.0,0.0,0.0,1.0


In [81]:
y_test

Unnamed: 0,target_price
1,300000


In [82]:
y_train

Unnamed: 0,target_price
4,600000
2,500000
0,250000
3,180000


# 7️⃣ 🔮 **Predict New Data**

In [90]:
def predict_new(age, salary, city, job_role):
    # Load Saved Model & Preprocessors
    model = joblib.load("trained_model.pkl")
    ordinal_encoder = joblib.load("ordinal_encoder.pkl")
    onehot_encoder = joblib.load("onehot_encoder.pkl")
    scaler_standard = joblib.load("scaler_standard.pkl")
    scaler_minmax = joblib.load("scaler_minmax.pkl")

    # ✅ Step 1: Convert inputs to DataFrame
    new_data = pd.DataFrame([[age, salary, city, job_role]], columns=['age', 'salary', 'city', 'job_role'])

    # ✅ Step 2: Ordinal Encoding for 'city'
    new_data['city_encoded'] = ordinal_encoder.transform(new_data[['city']])

    # ✅ Step 3: OneHot Encoding for 'job_role'
    job_role_encoded = onehot_encoder.transform(new_data[['job_role']])  # Sparse matrix
    job_role_encoded = job_role_encoded.toarray()  # Convert to dense array
    job_role_df = pd.DataFrame(job_role_encoded, columns=onehot_encoder.get_feature_names_out(['job_role']))

    # ✅ Step 4: Drop original categorical columns
    new_data = new_data.drop(columns=['city', 'job_role'])

    # ✅ Step 5: Combine encoded job_role
    new_data = pd.concat([new_data.reset_index(drop=True), job_role_df], axis=1)

    # ✅ Step 6: Apply scaling
    new_data[['age', 'salary']] = scaler_standard.transform(new_data[['age', 'salary']])
    new_data[['city_encoded']] = scaler_minmax.transform(new_data[['city_encoded']])

    # ✅ Step 7: Predict
    prediction = model.predict(new_data)
    return prediction[0]

# 🔥 Test it
new_pred = predict_new(30, 65000, "London", "Doctor")
print(f"💡 Predicted Price: ${new_pred:.2f}")


💡 Predicted Price: $438600.00
