In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load dataset
#file_path = "C:\Users\satyamanishankar\handson\code\data\Predict-The-Data-Scientists-Salary-In-India_Train_Dataset.csv"
df = pd.read_csv(r"C:\Users\satyamanishankar\handson\code\data\Predict-The-Data-Scientists-Salary-In-India_Train_Dataset.csv")

# Drop unnecessary columns
df.drop(columns=["Unnamed: 0", "job_description", "key_skills"], inplace=True)

# Split 'experience' into min and max experience
df[['min_experience', 'max_experience']] = df['experience'].str.extract(r'(\d+)-(\d+)').astype(float)
df.drop(columns=["experience"], inplace=True)

# Fill missing values in job_type with "Unknown"
df["job_type"].fillna("Unknown", inplace=True)

# Label Encoding for categorical variables
le = LabelEncoder()
df["location"] = le.fit_transform(df["location"])
df["job_type"] = le.fit_transform(df["job_type"])
df["job_desig"] = le.fit_transform(df["job_desig"])  # Encode job designation
df["salary"] = le.fit_transform(df["salary"])  # Encode target variable

# Split into features (X) and target (y)
X = df.drop(columns=["salary"])
y = df["salary"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers (Reduced estimators for faster execution)
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=50),
    "Extra Trees": ExtraTreesClassifier(n_estimators=50),
    "AdaBoost": AdaBoostClassifier(n_estimators=50),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=50)
}

# Evaluate models using train-test split
best_score = 0
best_model = None
best_clf = None

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)  # Train model
    score = clf.score(X_test, y_test)  # Test accuracy
    print(f"{name} Accuracy: {score:.4f}")

    if score > best_score:
        best_score = score
        best_model = name
        best_clf = clf

print(f"\nBest Model: {best_model} with Accuracy: {best_score:.4f}")

# --- Final Model Selection & Prediction ---
y_pred = best_clf.predict(X_test)  # Predict on test data

# Evaluate final model
final_accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy on Test Data: {final_accuracy:.4f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["job_type"].fillna("Unknown", inplace=True)


Decision Tree Accuracy: 0.3368
Random Forest Accuracy: 0.3567
Extra Trees Accuracy: 0.3436
AdaBoost Accuracy: 0.3694
Gradient Boosting Accuracy: 0.4257

Best Model: Gradient Boosting with Accuracy: 0.4257
Final Model Accuracy on Test Data: 0.4257
