In [13]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'Career_Guidance_Dataset.csv'  # Replace with your dataset path
df = pd.read_csv(file_path)

print("Dataset loaded successfully.")

# Step 1: Initial dataset inspection
print("\nInitial Dataset Info:")
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(df.head())

# Name of the label column the classifier learns to predict.
target_column = 'Recommended_Path'  # Replace with your actual target column

# Record which columns are numeric in the RAW data, before any encoding.
# Selecting numeric dtypes after encoding would also pick up the label-encoded
# and one-hot integer columns and standardise them, so the integer codes would
# no longer be the values the model actually sees.
numerical_columns = [
    col for col in df.select_dtypes(include='number').columns
    if col != target_column
]

# Step 2: Handle multi-label columns
# Each of these holds a ', '-separated list of labels; every distinct label
# becomes its own 0/1 indicator column named "<column>_<label>".
# NOTE(review): 'Suitable_Careers', 'Suggested_Courses', 'Skills_To_Develop' and
# 'Challenges_To_Overcome' read like advice outputs rather than student
# attributes. If they are derived from the target they leak it into the
# features and inflate accuracy - confirm before trusting the score.
multi_label_columns = [
    'Favorite_Subjects', 'Weak_Subjects', 'Skills', 'Interests', 
    'Suitable_Careers', 'Suggested_Courses', 'Skills_To_Develop', 'Challenges_To_Overcome'
]
for col in multi_label_columns:
    print(f"\nProcessing multi-label column: {col}")
    mlb = MultiLabelBinarizer()
    # Non-string cells (e.g. NaN) are treated as "no labels".
    label_lists = df[col].apply(lambda x: x.split(', ') if isinstance(x, str) else [])
    transformed = mlb.fit_transform(label_lists)
    for i, label in enumerate(mlb.classes_):
        df[f"{col}_{label}"] = transformed[:, i]
    df = df.drop(columns=[col])
print("Multi-label columns processed successfully.")

# Step 3: Encode categorical columns
categorical_columns = [
    'Gender', 'Personality_Type', 'Preferred_Career_Type',
    'Parent_Guidance', 'Peer_Influence', 'Financial_Affordability'
]
# Keep each fitted encoder so the string -> integer mapping can be looked up
# (encoder.classes_) when building inputs for prediction later.
label_encoders = {}
for col in categorical_columns:
    print(f"\nEncoding categorical column: {col}")
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder
print("Categorical columns encoded successfully.")

# Step 4: Convert boolean columns to integers
boolean_columns = ['Hands_On_Learning', 'Long_Term_Education_Plan', 
                   'Scholarship_Eligibility', 'Availability_Of_Courses']
print("\nConverting boolean columns to integers.")
for col in boolean_columns:
    df[col] = df[col].astype(int)
print("Boolean columns converted successfully.")

# Step 5: Prepare features (X) and target (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Debug: Check for non-numeric data in features
print("\nChecking for non-numeric data in features (X).")
non_numeric_columns = X.select_dtypes(include=['object']).columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns in X: {non_numeric_columns}")
    print(f"Sample problematic data:\n{X[non_numeric_columns].head()}")
else:
    print("All features are numeric.")

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on copies so the column assignments below never write through to `X`.
X_train = X_train.copy()
X_test = X_test.copy()
print("\nData split successfully.")

# Step 7: Scale numerical columns
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no statistics of the held-out data influence the preprocessing.
scaler = StandardScaler()
if numerical_columns:
    X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
    X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])
print("\nNumerical columns scaled successfully.")

# Step 8: Train the Random Forest model
print("\nTraining the Random Forest model.")
try:
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    print("Model trained successfully.")
except Exception as e:
    print(f"Error training the model: {e}")
    # Re-raise: continuing would leave `model` undefined (or stale from an
    # earlier run) for the evaluation below and for any cell that saves it.
    raise

# Step 9: Evaluate the model
print("\nEvaluating the model.")
try:
    y_pred = model.predict(X_test)
    accuracy = (y_pred == y_test).mean()
    print(f"Model accuracy: {accuracy}")
except Exception as e:
    print(f"Error evaluating the model: {e}")
    raise


Dataset loaded successfully.

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Gender                    2000 non-null   object 
 1   Academic_Performance      2000 non-null   float64
 2   Favorite_Subjects         2000 non-null   object 
 3   Weak_Subjects             2000 non-null   object 
 4   Skills                    2000 non-null   object 
 5   Interests                 2000 non-null   object 
 6   Personality_Type          2000 non-null   object 
 7   Preferred_Career_Type     2000 non-null   object 
 8   Hands_On_Learning         2000 non-null   bool   
 9   Long_Term_Education_Plan  2000 non-null   bool   
 10  Financial_Affordability   2000 non-null   object 
 11  Scholarship_Eligibility   2000 non-null   bool   
 12  Technical_Aptitude        2000 non-null   float64
 13  Analytical_

In [15]:
import joblib

# Path of the file the fitted classifier is written to (relative to the working directory)
model_filename = "career_guidance_model.pkl"

# Serialise the `model` object currently in the kernel to that file.
# `model` is not created in this cell; it must already exist from an earlier cell.
joblib.dump(model, model_filename)

print(f"Model saved successfully as '{model_filename}'")


Model saved successfully as 'career_guidance_model.pkl'


In [4]:
import pandas as pd
import joblib
import numpy as np

# Load the trained model
# NOTE: joblib files are pickle-based and can run arbitrary code on load;
# only load model files you created yourself or otherwise trust.
model_path = 'career_guidance_model.pkl'  # Ensure this matches your model path
model = joblib.load(model_path)

# Load the feature names used during training
trained_features = model.feature_names_in_

# Values for the sample test case, keyed by feature name.
# Every trained feature that is not listed here keeps the default 0.
# NOTE(review): LabelEncoder numbers classes in sorted order, so with the values
# 'Female'/'Male' the code 0 means Female and 1 means Male. Check the codes
# below against the encoders (and any scaling) used when the model was trained.
test_case_values = {
    'Gender': 0,  # Example encoded value (see note above)
    'Personality_Type': 2,  # Example encoded value
    'Preferred_Career_Type': 1,  # Example encoded value
    'Parent_Guidance': 1,  # Example encoded value
    'Peer_Influence': 0,  # Example encoded value
    'Financial_Affordability': 1,  # Example encoded value
    'Hands_On_Learning': 1,  # Example boolean feature
    'Long_Term_Education_Plan': 0,  # Example boolean feature
    'Scholarship_Eligibility': 1,  # Example boolean feature
    'Availability_Of_Courses': 0,  # Example boolean feature
    'Academic_Performance': 0.85,  # Example scaled value
    'Technical_Aptitude': 0.78,  # Example scaled value
    'Analytical_Reasoning': 0.80,  # Example scaled value
    'Creativity_Level': 0.65,  # Example scaled value
    'Distance_To_College': 0.1,  # Example scaled value
    # For multi-label columns, provide appropriate binary indicators
    # Replace these with the actual feature names from your dataset
    'Favorite_Subjects_Biology': 1,
    'Skills_Coding': 1,
    'Challenges_To_Overcome_Lack of Focus': 1,
}

# Assigning to a column name the model was not trained on would silently add a
# new column and only fail later inside predict(); reject such names up front.
known_features = set(trained_features)
unknown_features = [name for name in test_case_values if name not in known_features]
if unknown_features:
    raise KeyError(f"Features not seen during training: {unknown_features}")

# Create a sample test case with all features
# Initialize all features with zeros, then fill in the specific values
test_case = pd.DataFrame(np.zeros((1, len(trained_features))), columns=trained_features)
for name, value in test_case_values.items():
    test_case[name] = value

# Make predictions
predicted_path = model.predict(test_case)
print(f"Predicted Career Path: {predicted_path[0]}")
predicted_path


Predicted Career Path: Diploma


array(['Diploma'], dtype=object)

In [13]:
import pandas as pd

# Load the dataset
file_path = 'Career_Guidance_Dataset.csv'  # Replace with the correct path
df = pd.read_csv(file_path)

# Map each column name to the array of distinct non-null values found in it
unique_values = {name: df[name].dropna().unique() for name in df.columns}

# Display unique values for each column
print("Unique Values for Each Column:")
for column, values in unique_values.items():
    print(f"{column}: {values}")


Unique Values for Each Column:
Gender: ['Male' 'Female']
Academic_Performance: [67.7 73.6 72.3 80.6 62.2 75.5 87.  86.5 70.9 83.5 89.  83.  74.8 87.9
 88.7 86.2 66.1 72.8 87.5 87.8 62.6 74.9 78.6 71.6 73.4 89.1 69.7 62.3
 87.4 64.4 93.1 93.3 71.8 78.7 94.5 79.  74.6 88.2 77.2 91.8 62.7 77.1
 69.  68.8 86.4 73.3 84.1 86.9 61.1 84.7 82.4 89.4 94.7 90.5 85.6 81.9
 85.5 79.5 91.2 89.9 74.2 85.2 65.  89.5 91.1 68.2 76.7 79.2 89.8 74.5
 82.2 91.6 62.5 74.4 75.7 61.5 75.2 72.9 95.  64.5 81.1 65.3 80.7 73.9
 77.9 75.8 63.6 77.  70.7 93.2 84.3 93.6 68.6 83.7 91.  81.8 87.1 92.4
 61.7 75.3 93.5 70.4 61.9 76.4 62.9 93.7 71.9 81.6 79.7 86.7 72.6 70.6
 78.8 82.  80.  69.3 65.1 76.9 90.2 77.5 90.  63.1 71.7 75.4 65.4 76.6
 75.1 69.5 77.8 81.4 92.1 91.9 84.5 80.4 68.3 63.9 82.7 83.4 66.2 66.9
 77.3 92.3 70.1 90.8 63.2 80.8 60.6 63.8 83.6 94.2 74.3 80.2 68.7 84.4
 79.9 60.3 66.8 81.2 72.1 70.5 72.4 89.2 61.3 82.8 73.  88.8 92.7 81.3
 64.9 75.9 70.2 73.8 82.3 65.2 91.5 64.1 67.  67.8 67.9 68.  67.4 74.

In [20]:
import pandas as pd

# Re-read the dataset from the CSV file into `df`, replacing any `df` already in the kernel.
df = pd.read_csv('Career_Guidance_Dataset.csv')

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

# Read the raw data
df = pd.read_csv('Career_Guidance_Dataset.csv')

# Column the classifier is trained to predict
target = 'Recommended_Path'

# Input columns, in the exact order they are handed to the classifier.
# NOTE(review): 'Suggested_Courses' and 'Skills_To_Develop' read like advice
# outputs rather than student attributes - confirm they do not leak the target.
features = [
    'Gender',
    'Academic_Performance',
    'Favorite_Subjects',
    'Weak_Subjects',
    'Skills',
    'Interests',
    'Personality_Type',
    'Preferred_Career_Type',
    'Hands_On_Learning',
    'Long_Term_Education_Plan',
    'Financial_Affordability',
    'Scholarship_Eligibility',
    'Technical_Aptitude',
    'Analytical_Reasoning',
    'Creativity_Level',
    'Parent_Guidance',
    'Peer_Influence',
    'Distance_To_College',
    'Availability_Of_Courses',
    'Suggested_Courses',
    'Skills_To_Develop',
]

# Define preprocessing function
def preprocess_data(df):
    """Return an encoded, scaled copy of the career-guidance frame.

    Three transformations are applied, each to a fixed list of columns:

    * string columns are replaced by integer codes from a ``LabelEncoder``
      fitted on that column (the whole cell text is one category, so a
      value such as ``"English, Chemistry"`` gets a single code);
    * the four boolean columns are cast to ``int``;
    * the five continuous columns are standardised with a ``StandardScaler``
      fitted on every row of ``df``.

    The input frame is not modified; all work happens on a copy, so calling
    this on a slice of another frame cannot write through to it or raise
    pandas' chained-assignment warning.

    NOTE(review): the scaler sees every row passed in, so calling this before
    a train/test split lets test-set statistics into the preprocessing. The
    fitted encoders and scaler are not returned, so the same mapping cannot
    be re-applied to new inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns replaced by their numeric form;
        columns not listed are carried over unchanged.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``df``.
    """
    # Copy first so the caller's frame stays untouched.
    df = df.copy()

    # Label encode categorical columns
    label_columns = ['Gender', 'Favorite_Subjects', 'Weak_Subjects', 'Skills', 'Interests', 
                     'Personality_Type', 'Preferred_Career_Type', 'Financial_Affordability', 
                     'Parent_Guidance', 'Peer_Influence', 'Suggested_Courses', 'Skills_To_Develop']
    for col in label_columns:
        # A fresh encoder per column: each column has its own class set.
        df[col] = LabelEncoder().fit_transform(df[col])

    # For boolean columns, convert them to 0/1
    boolean_columns = ['Hands_On_Learning', 'Long_Term_Education_Plan',
                       'Scholarship_Eligibility', 'Availability_Of_Courses']
    for col in boolean_columns:
        df[col] = df[col].astype(int)

    # Scale numerical columns
    numerical_columns = ['Academic_Performance', 'Technical_Aptitude', 'Analytical_Reasoning', 
                         'Creativity_Level', 'Distance_To_College']
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

# Preprocess the data
df = preprocess_data(df)

# Define the features and target variables
X = df[features]
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model (Random Forest in this case)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")

# Save the trained model
# NOTE: this writes to the same path the earlier joblib.dump cell uses, so it
# replaces that model. The two models are trained on different feature sets,
# so code written against the other model's feature names will not work with
# this file.
with open('career_guidance_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Example prediction
# Select the first test row as a one-row DataFrame (double brackets) rather than
# a bare NumPy array: the model was fitted with column names, and passing an
# unnamed array makes scikit-learn warn that feature names are missing.
example_input = X_test.iloc[[0]]
predicted_career_path = model.predict(example_input)
print(f"Predicted Career Path: {predicted_career_path[0]}")


Model accuracy: 0.8925
Predicted Career Path: PUC




In [32]:
import pandas as pd

# Step 1: Load the dataset
# `df` is a module-level frame; the scoring step later in this cell adds a
# 'Similarity_Score' column to it.
df = pd.read_csv("Career_Guidance_Dataset.csv")

# Step 2: Define a basic similarity function
def calculate_similarity(row, user_input):
    """Score how closely one dataset record matches the user's answers.

    Every key of ``user_input`` that is also present in ``row`` contributes:

    * string answer - adds 1 when every comma-separated item of the answer
      appears among the comma-separated items of the record's value
      (case-insensitive, surrounding spaces ignored). An exact match still
      counts, and an answer such as ``'Math'`` now also matches a record
      holding ``'English, Math'``;
    * any other answer - subtracts the absolute difference between the
      record's value and the answer.

    Keys missing from ``row`` are ignored. Membership is checked on ``row``
    itself, so the function does not depend on any global DataFrame.

    NOTE(review): numeric differences are used raw, so a column with a wide
    range outweighs both narrow-range columns and the +1 categorical matches.
    Consider normalising numeric columns before scoring.

    Parameters
    ----------
    row : pandas.Series or mapping
        One record, indexable by column name.
    user_input : dict
        Column name -> desired value (str for categorical, number otherwise).

    Returns
    -------
    int or float
        Higher means more similar; the value can be negative.
    """
    def _items(text):
        # Split a ', '-style list into a set of normalised labels.
        return {part.strip().lower() for part in str(text).split(',')}

    score = 0
    for col, wanted in user_input.items():
        if col not in row:  # for a Series this tests the index labels
            continue
        actual = row[col]
        if isinstance(wanted, str):  # For categorical fields
            if _items(wanted) <= _items(actual):
                score += 1
        else:  # For numerical fields
            score -= abs(actual - wanted)  # Penalize based on distance
    return score

# Step 3: Get user input
# Strings are compared as categories; numbers are compared by distance.
user_input = dict(
    Gender='Male',
    Academic_Performance=85,
    Technical_Aptitude=90,
    Favorite_Subjects='Math',
    Weak_Subjects='History',
    Skills='Problem Solving',
    Interests='Technology',
    Personality_Type='Analytical',
    Preferred_Career_Type='Engineering',
    Financial_Affordability='High',
    Parent_Guidance='Supportive',
    Peer_Influence='Positive',
    Hands_On_Learning=1,
    Long_Term_Education_Plan=1,
    Scholarship_Eligibility=0,
    Availability_Of_Courses=1,
    Distance_To_College=10,
)

# Step 4: Calculate similarity scores for all rows
# apply() passes each row as the first argument and `user_input` as the second.
df['Similarity_Score'] = df.apply(calculate_similarity, axis=1, args=(user_input,))

# Step 5: Find the closest match
# idxmax() gives the index label of the highest score.
best_index = df['Similarity_Score'].idxmax()
closest_match = df.loc[best_index]

# Step 6: Extract recommendations
recommendation_fields = (
    'Recommended_Path',
    'Suitable_Careers',
    'Suggested_Courses',
    'Skills_To_Develop',
    'Challenges_To_Overcome',
)
recommendations = {field: closest_match[field] for field in recommendation_fields}

# Display the results
print("Closest Match Found:")
print(closest_match)
print("\nRecommendations:")
print(recommendations)


Closest Match Found:
Gender                                                     Male
Academic_Performance                                       85.0
Favorite_Subjects                            English, Chemistry
Weak_Subjects                                           History
Skills                                  Coding, Problem Solving
Interests                                          Reading, Art
Personality_Type                                      Extrovert
Preferred_Career_Type                                   Neutral
Hands_On_Learning                                          True
Long_Term_Education_Plan                                   True
Financial_Affordability                                     Low
Scholarship_Eligibility                                    True
Technical_Aptitude                                          8.8
Analytical_Reasoning                                        7.2
Creativity_Level                                            9.9
Parent_Guidance    