## Mock Candidate and Job Data

In [5]:
import pandas as pd
import random
from faker import Faker
from collections import defaultdict
fake = Faker()

# Configuration: sizes of the synthetic data sets.
NUM_CANDIDATES = 500
NUM_JOBS = 30
TEAM_SIZES = [3, 4, 5, 6]

# Role catalogue. Each entry maps a role title to its sampling weight
# (the weights sum to 1.0) and the core skills every candidate in that
# role receives. Rows are (title, weight, core skills).
ROLES = {
    title: {'weight': share, 'skills': core_skills}
    for title, share, core_skills in (
        ('Backend Engineer', 0.25,
         ['Python', 'Java', 'C++', 'SQL', 'API Design', 'Microservices']),
        ('Frontend Engineer', 0.2,
         ['JavaScript', 'React', 'TypeScript', 'HTML/CSS', 'Redux']),
        ('Full-Stack Engineer', 0.15,
         ['JavaScript', 'Python', 'React', 'Django', 'Node.js']),
        ('DevOps Engineer', 0.1,
         ['AWS', 'Docker', 'Kubernetes', 'CI/CD', 'Terraform']),
        ('Data Engineer', 0.08,
         ['Python', 'SQL', 'Spark', 'ETL', 'Data Pipelines']),
        ('Product Manager', 0.07,
         ['Product Strategy', 'Agile', 'Scrum', 'Market Research']),
        ('UX Designer', 0.05,
         ['Figma', 'User Research', 'Prototyping', 'UI/UX']),
        ('QA Engineer', 0.05,
         ['Testing', 'Automation', 'Selenium', 'JIRA']),
        ('Data Scientist', 0.05,
         ['Python', 'Machine Learning', 'Statistics', 'Pandas']),
    )
}

# Generate candidates
# Each synthetic candidate gets one weighted-random role, that role's core
# skills, and one to three extra skills drawn from a shared pool.
role_names = list(ROLES.keys())
role_weights = [spec['weight'] for spec in ROLES.values()]
extra_skill_pool = [
    'Git', 'REST', 'GraphQL', 'NoSQL', 'PostgreSQL',
    'Azure', 'GCP', 'Jenkins', 'Kafka', 'Redis'
]

candidates = []
for offset in range(NUM_CANDIDATES):
    (role,) = random.choices(role_names, weights=role_weights, k=1)

    base_skills = ROLES[role]['skills']
    # Never offer an extra skill the role already includes.
    available_extras = [s for s in extra_skill_pool if s not in base_skills]
    extra_count = random.randint(1, 3)
    extra_skills = random.sample(available_extras, extra_count)

    profile = {
        'candidate_id': 1000 + offset,
        'name': fake.name(),
        'role': role,
        'skills': ','.join(base_skills + extra_skills),
        'experience': random.randint(1, 15),
        'location': fake.city(),
        'current_company': fake.company(),
        'salary_expectation': random.randint(80, 220) * 1000
    }
    candidates.append(profile)

candidates_df = pd.DataFrame(candidates)

# Generate team-based job postings
# Project types with typical team compositions (role -> headcount).
# Defined once: the table does not change between postings.
project_types = [
    ('Web Application', {
        'Backend Engineer': 2,
        'Frontend Engineer': 2,
        'DevOps Engineer': 1,
        'Product Manager': 1
    }),
    ('Mobile App', {
        'Backend Engineer': 1,
        'Frontend Engineer': 3,
        'UX Designer': 1
    }),
    ('Data Platform', {
        'Data Engineer': 2,
        'Data Scientist': 2,
        'DevOps Engineer': 1
    }),
    ('Enterprise System', {
        'Backend Engineer': 3,
        'QA Engineer': 1,
        'DevOps Engineer': 1
    })
]

team_jobs = []
for i in range(NUM_JOBS):
    project_name, team_composition = random.choice(project_types)
    required_roles = list(team_composition.keys())

    # Gather required skills, de-duplicated in first-seen order.
    # A set would also de-duplicate, but its iteration order is arbitrary
    # (and varies between interpreter runs because of string hash
    # randomisation), which made the saved 'required_skills' column
    # differ from run to run for identical inputs.
    required_skills = list(dict.fromkeys(
        skill
        for role_name in required_roles
        for skill in ROLES[role_name]['skills']
    ))

    team_jobs.append({
        'job_id': f"T{100+i}",
        'project_name': project_name,
        'project_description': fake.text(),
        'required_roles': ','.join(required_roles),
        # Encoded as "Role:count,Role:count,..."
        'team_composition': ','.join(f"{k}:{v}" for k, v in team_composition.items()),
        'required_skills': ','.join(required_skills),
        'company': fake.company(),
        'location': fake.city(),
        'budget': f"${random.randint(500, 1200)}k",
        'duration': f"{random.randint(3, 12)} months"
    })

jobs_df = pd.DataFrame(team_jobs)

# Save data: write both generated tables to CSV in the working directory,
# without the DataFrame index column.
for frame, csv_path in (
    (candidates_df, 'team_candidates.csv'),
    (jobs_df, 'team_jobs.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class TeamRecommender:
    """Recommend a team of candidates for a team-based job posting.

    Candidate ``skills`` strings (comma-separated) are vectorised with
    TF-IDF. For every role a job asks for, the candidates holding that
    role are ranked by cosine similarity between their skill vector and
    the job's ``required_skills`` vector, and the best ones are picked.
    """

    def __init__(self, candidates_df, jobs_df):
        """Fit the skill vectoriser and index candidates by role.

        Args:
            candidates_df: DataFrame with at least the columns ``skills``
                (comma-separated string) and ``role``; rows are returned
                as dicts in recommendations.
            jobs_df: DataFrame with at least the columns ``job_id``,
                ``team_composition`` ("Role:count,Role:count,...") and
                ``required_skills`` (comma-separated string).
        """
        self.candidates = candidates_df
        self.jobs = jobs_df

        # token_pattern is unused when a custom tokenizer is supplied;
        # passing None avoids scikit-learn's warning about that.
        self.tfidf = TfidfVectorizer(
            tokenizer=lambda x: x.split(','),
            token_pattern=None,
        )
        self.skill_matrix = self.tfidf.fit_transform(self.candidates['skills'])

        # role -> positional row indices into candidates / skill_matrix.
        self.role_index = defaultdict(list)
        for idx, role in enumerate(self.candidates['role']):
            self.role_index[role].append(idx)

    def recommend_team(self, job_id, max_candidates_per_role=5):
        """Pick the best-matching candidates for each role of a job.

        Args:
            job_id: Value to look up in the ``job_id`` column of the jobs
                table.
            max_candidates_per_role: Upper bound on how many candidates may
                be selected for a single role. Zero or a negative value
                selects nobody.

        Returns:
            A dict with keys ``team`` (list of candidate dicts, each with an
            added ``role_match_score``), ``team_score`` (weighted blend of
            average match score, skill coverage and diversity; 0 when the
            team is empty) and ``job_details`` (the job row as a dict).

        Raises:
            IndexError: If no job with ``job_id`` exists.
        """
        matches = self.jobs[self.jobs['job_id'] == job_id]
        if matches.empty:
            # Same exception type the bare .iloc[0] used to raise, but with
            # a message that names the actual problem.
            raise IndexError(f"No job found with job_id {job_id!r}")
        job = matches.iloc[0]

        required_roles = {}
        for role_part in job['team_composition'].split(','):
            role, count = role_part.split(':')
            required_roles[role] = int(count)

        job_vector = self.tfidf.transform([job['required_skills']])

        # A cap of 0 must mean "nobody": slicing with [-0:] would instead
        # return the whole array, and negative caps would slice arbitrarily.
        per_role_cap = max(max_candidates_per_role, 0)

        team = []
        for role, count in required_roles.items():
            candidate_indices = self.role_index.get(role, [])
            limit = min(count, per_role_cap)
            if not candidate_indices or limit <= 0:
                continue

            role_skill_matrix = self.skill_matrix[candidate_indices]
            similarities = cosine_similarity(job_vector, role_skill_matrix).flatten()

            # Best matches first, truncated to the number actually needed.
            top_indices = np.argsort(similarities)[::-1][:limit]

            for idx in top_indices:
                # idx is relative to this role's subset; map it back to the
                # candidate's row position in the full table.
                candidate_idx = candidate_indices[idx]
                candidate = self.candidates.iloc[candidate_idx].to_dict()
                candidate['role_match_score'] = similarities[idx]
                team.append(candidate)

        if team:
            avg_score = sum(m['role_match_score'] for m in team) / len(team)
            coverage = self._calculate_skill_coverage(job['required_skills'], team)
            diversity = self._calculate_team_diversity(team)
            team_score = 0.5*avg_score + 0.3*coverage + 0.2*diversity
        else:
            team_score = 0

        return {
            'team': team,
            'team_score': team_score,
            'job_details': job.to_dict()
        }

    def _calculate_skill_coverage(self, required_skills, team):
        """Return the fraction of required skills held by at least one member.

        Args:
            required_skills: Comma-separated skill string.
            team: Iterable of member dicts with a comma-separated ``skills``
                entry.

        Skills are compared as exact strings after splitting on ','.
        """
        required = set(required_skills.split(','))
        covered = set()
        for member in team:
            covered.update(member['skills'].split(','))
        return len(required & covered) / len(required)

    def _calculate_team_diversity(self, team):
        """Return a diversity score for the team.

        The score is 0.6 times the share of distinct ``current_company``
        values plus 0.4 times the share of the three experience bands
        (Junior < 3 years, Mid < 7 years, Senior otherwise) represented.
        An empty team scores 0.0.
        """
        if not team:
            # Avoid dividing by len(team) == 0.
            return 0.0

        companies = len(set(m['current_company'] for m in team))
        exp_levels = set()
        for m in team:
            if m['experience'] < 3:
                exp_levels.add('Junior')
            elif m['experience'] < 7:
                exp_levels.add('Mid')
            else:
                exp_levels.add('Senior')
        return 0.6*(companies/len(team)) + 0.4*(len(exp_levels)/3)

# Build the recommender over the generated candidate and job tables.
recommender = TeamRecommender(candidates_df=candidates_df, jobs_df=jobs_df)

In [10]:
# Pick one job at random and print the team recommended for it.
job_id = jobs_df.sample(1)['job_id'].values[0]
recommendation = recommender.recommend_team(job_id)

job_details = recommendation['job_details']
team_members = recommendation['team']

print(f"\nRecommended Team for Job {job_id}: {job_details['project_name']}")
print(f"Team Score: {recommendation['team_score']:.2f}")
print(f"Required Roles: {job_details['team_composition']}")
print("\nTeam Members:")
for person in team_members:
    print(f"- {person['role']}: {person['name']} ({person['current_company']})")
    print(f"  Skills: {person['skills']}")
    print(f"  Experience: {person['experience']} years, Match Score: {person['role_match_score']:.2f}")

# Recompute the two component metrics that feed into the team score.
print("\nTeam Quality Metrics:")
skill_coverage = recommender._calculate_skill_coverage(job_details['required_skills'], team_members)
print(f"Skill Coverage: {skill_coverage:.2f}")
diversity_score = recommender._calculate_team_diversity(team_members)
print(f"Diversity Score: {diversity_score:.2f}")


Recommended Team for Job T128: Data Platform
Team Score: 0.74
Required Roles: Data Engineer:2,Data Scientist:2,DevOps Engineer:1

Team Members:
- Data Engineer: Jonathan Lopez (Holmes-Ball)
  Skills: Python,SQL,Spark,ETL,Data Pipelines,Jenkins
  Experience: 13 years, Match Score: 0.52
- Data Engineer: Sarah Brown (Stevens-Barajas)
  Skills: Python,SQL,Spark,ETL,Data Pipelines,Jenkins
  Experience: 12 years, Match Score: 0.52
- Data Scientist: Jessica Wood (Jackson, Hunt and Garcia)
  Skills: Python,Machine Learning,Statistics,Pandas,GCP
  Experience: 12 years, Match Score: 0.53
- Data Scientist: Carl Beltran (Boyd LLC)
  Skills: Python,Machine Learning,Statistics,Pandas,Redis
  Experience: 13 years, Match Score: 0.53
- DevOps Engineer: Kenneth Moody (Curry Inc)
  Skills: AWS,Docker,Kubernetes,CI/CD,Terraform,Jenkins
  Experience: 1 years, Match Score: 0.60

Team Quality Metrics:
Skill Coverage: 1.00
Diversity Score: 0.87
