In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
import numpy as np
import joblib


In [2]:

df = pd.read_csv("career_dataset.csv")

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CandidateID           200 non-null    int64  
 1   Name                  200 non-null    object 
 2   Age                   200 non-null    int64  
 3   Education             200 non-null    object 
 4   Skills                200 non-null    object 
 5   Interests             200 non-null    object 
 6   Recommended_Career    200 non-null    object 
 7   Recommendation_Score  200 non-null    float64
dtypes: float64(1), int64(2), object(5)
memory usage: 12.6+ KB


Unnamed: 0,CandidateID,Name,Age,Education,Skills,Interests,Recommended_Career,Recommendation_Score
0,1,John Doe,28,Bachelor's,Python;Data Analysis;Machine Learning,Technology;Data Science,Data Scientist,0.95
1,2,Jane Smith,32,Master's,Java;System Design;Cloud Computing,Software Development;AI,Software Engineer,0.9
2,3,Bob Johnson,24,Bachelor's,Graphic Design;UI/UX;Adobe Creative Suite,Arts;Digital Media,UX Designer,0.88
3,4,Emily Davis,26,Bachelor's,Python;Deep Learning;Statistics,Healthcare;AI,AI Researcher,0.93
4,5,Michael Brown,30,Master's,Project Management;Communication;Agile,Business;Management,Project Manager,0.87


In [3]:
df.describe()
print(df.columns.tolist())


['CandidateID', 'Name', 'Age', 'Education', 'Skills', 'Interests', 'Recommended_Career', 'Recommendation_Score']


In [4]:
df = df.drop(columns=["CandidateID", "Name", "Recommendation_Score"])
# ✅ Normalize Age
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['Age'] = scaler.fit_transform(df[['Age']])

# ✅ Create Age Groups for categorical pattern analysis
df['AgeGroup'] = pd.cut(df['Age'], bins=[-np.inf, -1, 0, 1, np.inf], labels=['<20', '20-25', '26-30', '30+'])

# ✅ One-hot encode AgeGroup and Education
df = pd.get_dummies(df, columns=['AgeGroup', 'Education'])

# ✅ Proceed with the cleaned data for training/testing



In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split

# ✅ Make sure column names don't have extra spaces
df.columns = df.columns.str.strip()

# ✅ Convert 'Skills' and 'Interests' to list format
df["Skills"] = df["Skills"].astype(str).apply(lambda x: [i.strip() for i in x.split(';')])
df["Interests"] = df["Interests"].astype(str).apply(lambda x: [i.strip() for i in x.split(';')])

# ✅ MultiLabelBinarizer for one-hot encoding
mlb_skills = MultiLabelBinarizer()
mlb_interests = MultiLabelBinarizer()

skills_encoded = pd.DataFrame(
    mlb_skills.fit_transform(df["Skills"]),
    columns=["skill_" + col for col in mlb_skills.classes_]
)

interests_encoded = pd.DataFrame(
    mlb_interests.fit_transform(df["Interests"]),
    columns=["interest_" + col for col in mlb_interests.classes_]
)

# ✅ Merge encoded features with original DataFrame
df = pd.concat([df, skills_encoded, interests_encoded], axis=1)

# ✅ Drop original text-based columns
df.drop(columns=["Skills", "Interests"], inplace=True)

# ✅ Remove conflicting combinations
#group_cols = ["Education"] + list(skills_encoded.columns) + list(interests_encoded.columns)
group_cols = ["Education"] + list(skills_encoded.columns) + list(interests_encoded.columns)
education_cols = [col for col in df.columns if col.startswith('Education_')]
skill_cols = list(skills_encoded.columns)
interest_cols = list(interests_encoded.columns)
group_cols = education_cols + skill_cols + interest_cols

grouped = df.groupby(group_cols)["Recommended_Career"].nunique()
conflicting_combinations = grouped[grouped > 1].index
mask = df.set_index(group_cols).index.isin(conflicting_combinations)
df_cleaned = df[~mask].reset_index(drop=True)

# ✅ Encode target label
label_encoder = LabelEncoder()
df_cleaned["Recommended_Career"] = label_encoder.fit_transform(df_cleaned["Recommended_Career"])

# ✅ Separate features and target
#X = df_cleaned.drop(columns=["Recommended_Career", "Name", "CandidateID"], errors='ignore')
#y = df_cleaned["Recommended_Career"]
X = df_cleaned.drop(columns=['CandidateID', 'Name', 'Recommended_Career', 'Recommendation_Score'], errors='ignore')

y = df_cleaned['Recommended_Career']




# ✅ Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
 #✅ Step 10: Handle imbalance using SMOTE
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# ✅ Step 11: Train RandomForestClassifier
# After splitting X and y and training model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"🎯 Model Accuracy  {accuracy:.2f}")

# Save model, encoders, and feature names
joblib.dump(model, "career_model.joblib")
joblib.dump(mlb_skills, "mlb_skills.joblib")
joblib.dump(mlb_interests, "mlb_interests.joblib")
joblib.dump(label_encoder, "label_encoder.joblib")
joblib.dump(X.columns.tolist(), "feature_names.joblib")   # ✅ Save columns




🎯 Model Accuracy  0.77


['feature_names.joblib']