In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
"""
PathFinder AI - Phase 2: KNN Career Recommender
Dataset: AI Career Recommendation System
"""

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
import joblib
import json

# Phase banner, then load Dataset 1
print("="*70, "PHASE 2: KNN CAREER RECOMMENDER", "="*70, sep="\n")

# Update this path to your dataset location
df = pd.read_csv(
    '/kaggle/input/ai-based-career-recommendation-system/AI-based Career Recommendation System.csv'
)

# Quick look at what was loaded: row count, column names and the first three rows.
print(
    f"\nDataset loaded: {len(df)} records",
    f"Columns: {df.columns.tolist()}",
    "\nFirst 3 rows:",
    df.head(3),
    sep="\n",
)

# Data Preprocessing
print("\n--- Preprocessing Skills Data ---")


def parse_semicolon_list(value):
    """Split one semicolon-separated cell into a list of clean items.

    Parameters:
    value: raw cell value (a string such as 'Python;Data Analysis', or a
        missing value)

    Returns:
    list of items with surrounding whitespace removed. A missing cell gives an
    empty list rather than the literal item 'nan', and blank items produced by
    doubled or trailing semicolons are dropped, so neither can end up as a
    skill in the encoder.
    """
    if pd.isna(value):
        return []
    return [item.strip() for item in str(value).split(';') if item.strip()]


# Parse skills (semicolon-separated)
df['Skills_List'] = df['Skills'].apply(parse_semicolon_list)

# Parse interests (semicolon-separated)
df['Interests_List'] = df['Interests'].apply(parse_semicolon_list)

print(f"Sample parsed skills: {df['Skills_List'].iloc[0]}")
print(f"Sample parsed interests: {df['Interests_List'].iloc[0]}")

# Create all unique skills
all_skills = set()
for skills in df['Skills_List']:
    all_skills.update(skills)

print(f"\nTotal unique skills in database: {len(all_skills)}")

# Collect the distinct skills seen for each career. Sets de-duplicate as rows
# are read, so no separate clean-up pass is needed afterwards.
career_skill_sets = {}
for career, skills in zip(df['Recommended_Career'], df['Skills_List']):
    if pd.isna(career):
        # A row without a career label would otherwise be written under a
        # "NaN" key; the groupby-based career matrix built later drops such
        # rows as well, so skipping here keeps the two in agreement.
        continue
    career_skill_sets.setdefault(career, set()).update(skills)

# Save skills database
# Every skill list is sorted: iteration order of a set of strings changes
# between interpreter sessions, and sorting keeps the JSON file identical
# from one run to the next.
skills_database = {
    'all_skills': sorted(all_skills),
    'career_skills_mapping': {
        career_name: sorted(skill_set)
        for career_name, skill_set in career_skill_sets.items()
    }
}

with open('skills_database.json', 'w') as f:
    json.dump(skills_database, f, indent=4)

print("Skills database saved as 'skills_database.json'")

# One-Hot Encode Skills
print("\n--- Creating Skill Vectors ---")

# Learn the skill vocabulary from every candidate's skill list and encode
# those same lists as a binary indicator matrix in one step.
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(df['Skills_List'])

print(
    f"Skill vector dimensions: {skills_encoded.shape}",
    f"Total skills tracked: {len(mlb.classes_)}",
    sep="\n",
)

# Save the MultiLabelBinarizer
mlb_artifact_path = 'skills_mlb.pkl'
joblib.dump(mlb, mlb_artifact_path)
print("MultiLabelBinarizer saved as 'skills_mlb.pkl'")

# Create Career-Skills Matrix
print("\n--- Building Career-Skills Matrix ---")

# One row per career, holding the union of every skill listed by the candidates
# recommended for that career. The union is sorted because the iteration order
# of a set of strings changes between interpreter sessions; without sorting,
# the pickled reference table and any preview of a career's first few skills
# would differ from run to run.
career_skills = df.groupby('Recommended_Career')['Skills_List'].apply(
    lambda skill_lists: sorted({skill for skills in skill_lists for skill in skills})
).reset_index()

career_skills.columns = ['Career', 'Skills']

print(f"\nUnique careers: {len(career_skills)}")
print("\nCareer-Skills mapping (first 5):")
print(career_skills.head())

# Encode career skills with the binarizer fitted on the candidate skill lists,
# so career vectors and user vectors share the same columns.
career_skills_encoded = mlb.transform(career_skills['Skills'])

# Train KNN Model
print("\n--- Training KNN Model ---")

# Top 5 or less if fewer careers
default_neighbor_count = min(5, len(career_skills))

knn_model = NearestNeighbors(
    n_neighbors=default_neighbor_count,
    metric='cosine',
    algorithm='brute'
)
knn_model.fit(career_skills_encoded)

print(f"KNN model trained with {len(career_skills)} careers")

# Save KNN model and career reference
for artifact, artifact_path in (
    (knn_model, 'knn_career_model.pkl'),
    (career_skills, 'career_reference.pkl'),
):
    joblib.dump(artifact, artifact_path)

print(
    "KNN model saved as 'knn_career_model.pkl'",
    "Career reference saved as 'career_reference.pkl'",
    sep="\n",
)

# Test the model: banner announcing the recommender smoke test
print("\n" + "="*70, "TESTING KNN CAREER RECOMMENDER", "="*70, sep="\n")

def recommend_careers(user_skills, top_k=5, *, knn=None, mlb=None, career_ref=None):
    """
    Recommend careers based on user skills

    Parameters:
    user_skills: list of skills (e.g., ['Python', 'Machine Learning']). A single
        semicolon-separated string ('Python;Machine Learning', the format used
        by the dataset) is accepted too. Skills are stripped of surrounding
        whitespace, and blanks and duplicates are dropped, mirroring how the
        training data was parsed.
    top_k: number of recommendations; capped at the number of careers in the
        reference table. A value below 1 yields an empty list.
    knn: optional fitted neighbour model exposing kneighbors(X, n_neighbors=...);
        loaded from 'knn_career_model.pkl' when omitted
    mlb: optional fitted skill binarizer exposing transform(...);
        loaded from 'skills_mlb.pkl' when omitted
    career_ref: optional DataFrame with 'Career' and 'Skills' columns whose row
        order matches the rows the model was fitted on; loaded from
        'career_reference.pkl' when omitted

    Returns:
    list of dicts, nearest career first, each with keys 'career',
    'similarity_score' (percentage as a plain float, rounded to 2 decimals),
    'matching_skills' (sorted skills shared by the user and the career) and
    'required_skills' (every skill recorded for the career)
    """
    # Load models. Passing already-loaded objects avoids re-reading three files
    # on every call. NOTE: joblib.load unpickles its input, so it must only be
    # pointed at artifacts produced by a trusted run of this notebook.
    if knn is None:
        knn = joblib.load('knn_career_model.pkl')
    if mlb is None:
        mlb = joblib.load('skills_mlb.pkl')
    if career_ref is None:
        career_ref = joblib.load('career_reference.pkl')

    # A bare string would otherwise be treated as a sequence of single
    # characters, none of which is a known skill.
    if isinstance(user_skills, str):
        user_skills = user_skills.split(';')

    # Normalise the input the same way the training data was normalised.
    cleaned_skills = []
    for skill in user_skills:
        skill = str(skill).strip()
        if skill and skill not in cleaned_skills:
            cleaned_skills.append(skill)

    # Nothing to return when no neighbours are requested (or none exist); the
    # neighbour search itself rejects a non-positive neighbour count.
    n_neighbors = min(top_k, len(career_ref))
    if n_neighbors < 1:
        return []

    # Encode user skills
    user_skills_encoded = mlb.transform([cleaned_skills])

    # Find nearest careers
    distances, indices = knn.kneighbors(user_skills_encoded, n_neighbors=n_neighbors)

    # Prepare results
    user_skill_set = set(cleaned_skills)
    recommendations = []
    for dist, idx in zip(distances[0], indices[0]):
        career_row = career_ref.iloc[idx]
        required_skills = list(career_row['Skills'])

        recommendations.append({
            'career': career_row['Career'],
            # Convert distance to similarity; float() keeps NumPy scalars out
            # of the returned structure.
            'similarity_score': round((1 - float(dist)) * 100, 2),
            # Sorted so the overlap is reported in a stable order.
            'matching_skills': sorted(user_skill_set & set(required_skills)),
            'required_skills': required_skills
        })

    return recommendations

# Smoke-test the recommender with a sample user
test_skills = ['Python', 'Data Analysis', 'Machine Learning']
print(f"\nTest User Skills: {test_skills}")

results = recommend_careers(test_skills, top_k=5)

print("\n--- Top 5 Career Recommendations ---")
for i, rec in enumerate(results, 1):
    # One four-line summary per recommendation; only the first 5 required
    # skills are shown.
    print(
        f"\n{i}. {rec['career']}",
        f"   Similarity: {rec['similarity_score']}%",
        f"   Matching Skills: {rec['matching_skills']}",
        f"   All Required Skills: {rec['required_skills'][:5]}...",
        sep="\n",
    )

# Closing banner followed by the list of files written by this phase.
closing_lines = (
    "\n" + "="*70,
    "KNN CAREER RECOMMENDER - COMPLETE ✓",
    "="*70,
    "\nSaved artifacts:",
    "  - knn_career_model.pkl",
    "  - skills_mlb.pkl",
    "  - career_reference.pkl",
    "  - skills_database.json",
)
for closing_line in closing_lines:
    print(closing_line)

PHASE 2: KNN CAREER RECOMMENDER

Dataset loaded: 200 records
Columns: ['CandidateID', 'Name', 'Age', 'Education', 'Skills', 'Interests', 'Recommended_Career', 'Recommendation_Score']

First 3 rows:
   CandidateID         Name  Age   Education  \
0            1     John Doe   28  Bachelor's   
1            2   Jane Smith   32    Master's   
2            3  Bob Johnson   24  Bachelor's   

                                      Skills                Interests  \
0      Python;Data Analysis;Machine Learning  Technology;Data Science   
1         Java;System Design;Cloud Computing  Software Development;AI   
2  Graphic Design;UI/UX;Adobe Creative Suite       Arts;Digital Media   

  Recommended_Career  Recommendation_Score  
0     Data Scientist                  0.95  
1  Software Engineer                  0.90  
2        UX Designer                  0.88  

--- Preprocessing Skills Data ---
Sample parsed skills: ['Python', 'Data Analysis', 'Machine Learning']
Sample parsed interests: ['Tech