In [10]:
import pandas as pd

# Load data
workers = pd.read_csv('workers.csv')
job_locations = pd.read_csv('job_locations.csv')

# Example preprocessing steps
# Convert categorical variables to numerical ones (e.g., skill set, language proficiency).
#
# Worker skills and required job skills are encoded against ONE shared category
# vocabulary. Encoding each column on its own assigns codes from that column's
# own set of labels, so the same skill could receive different integers in the
# two frames and comparing the encoded columns would be meaningless.
skill_categories = (
    pd.concat([workers['skill_set'], job_locations['required_skill_set']], ignore_index=True)
    .astype('category')
    .cat.categories
)
skill_dtype = pd.CategoricalDtype(categories=skill_categories)

# Missing values are encoded as -1 by `.cat.codes`.
workers['skill_set'] = workers['skill_set'].astype(skill_dtype).cat.codes
job_locations['required_skill_set'] = job_locations['required_skill_set'].astype(skill_dtype).cat.codes

# Language proficiency only exists on the worker side, so it keeps its own vocabulary.
workers['language_proficiency'] = workers['language_proficiency'].astype('category').cat.codes
# # Calculate proximity (assuming lat/lon data for simplicity)
# def calculate_distance(lat1, lon1, lat2, lon2):
#     from geopy.distance import geodesic
#     return geodesic((lat1, lon1), (lat2, lon2)).km

# job_locations['proximity'] = job_locations.apply(
#     lambda row: calculate_distance(row['lat'], row['lon'], workers['lat'], workers['lon']), axis=1
# )


In [11]:
# Assemble the matching feature table.
# The table starts from the worker ids; every other column is aligned to that
# index, so the worker in row i is paired with the job location in row i.
features = workers[['worker_id']].copy()
features = features.assign(
    job_location_id=job_locations['job_location_id'],
    # 1 when the worker's encoded skill equals the job's required skill, else 0
    skill_match=(workers['skill_set'] == job_locations['required_skill_set']).astype(int),
    # proximity is not included: the distance calculation is currently disabled
    cost=workers['wage_rate'],
)
print(features)

# Further feature columns can be added to the assign() call above


   worker_id  job_location_id  skill_match  cost
0          1              101         True    25
1          2              102         True    30
2          3              103         True    28
3          4              104         True    35
4          5              105         True    26


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assume you have a label indicating successful matches
# features['label'] = ... # Your label data here
# Until such a label exists, `skill_match` is used as the stand-in target.

# Split data
# Identifier columns are arbitrary keys rather than predictive signals; leaving
# them in the inputs lets the forest memorise individual rows, so they are
# excluded together with the target itself.
ID_COLUMNS = ['worker_id', 'job_location_id']
TARGET_COLUMN = 'skill_match'
X = features.drop(columns=[TARGET_COLUMN, *ID_COLUMNS])
y = features[TARGET_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model (fixed random_state keeps the fitted forest reproducible)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model on the held-out 20%
# NOTE: if the target contains a single class, accuracy is trivially 1.0 and
# says nothing about model quality.
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))


Accuracy: 1.0


In [6]:
def calculate_productivity(experience_level, past_performance, health_status):
    """Return a worker's productivity score, rounded to two decimals.

    The score is the experience multiplier times ``past_performance`` times
    the health multiplier.

    Args:
        experience_level: One of 'Beginner', 'Intermediate', 'Advanced'.
        past_performance: Numeric performance rating.
        health_status: One of 'Healthy', 'Minor Issues', 'Major Issues'.

    Raises:
        KeyError: If ``experience_level`` or ``health_status`` is not one of
            the labels listed above.
    """
    experience_multipliers = {'Beginner': 0.8, 'Intermediate': 1.0, 'Advanced': 1.2}
    health_multipliers = {'Healthy': 1.0, 'Minor Issues': 0.8, 'Major Issues': 0.5}

    experience_factor = experience_multipliers[experience_level]
    health_factor = health_multipliers[health_status]

    # Multiply left to right: experience, then performance, then health.
    raw_score = experience_factor * past_performance * health_factor
    return round(raw_score, 2)

Generate workers data (generated_workers.csv)

In [16]:
import pandas as pd
from faker import Faker
import random

# Seed both random sources so that re-running the notebook regenerates the same
# synthetic dataset instead of a different one on every run.
SEED = 42
random.seed(SEED)  # drives random.choice / random.randint
Faker.seed(SEED)   # drives the names produced by Faker

fake = Faker()

# Define the number of records
num_records = 2000

# Predefined lists for random choices
skill_sets = ['Plumbing', 'Electrical', 'Carpentry', 'Painting', 'Landscaping', 'HVAC', 'Roofing', 'Drywall', 'Flooring', 'Heavy Machinery Operation']
experience_levels = ['Beginner', 'Intermediate', 'Advanced']
health_statuses = ['Healthy', 'Minor Issues', 'Major Issues']
job_sites = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# training_statuses = ['Completed', 'In Progress', 'Not Started']
# work_conditions = ['Indoor', 'Outdoor', 'High-altitude', 'Underwater']

def generate_record(worker_id):
    """Build one synthetic worker row as a dict keyed by column name.

    Args:
        worker_id: Identifier stored in the 'worker_id' field.

    Returns:
        A dict with the keys 'worker_id', 'name', 'skill_set',
        'experience_level', 'availability', 'health_status',
        'past_performance', 'job_sites' and 'productivity_score'.
    """
    # Draw the three inputs of the productivity score first so the score can
    # be derived from the same values that are stored in the row.
    level = random.choice(experience_levels)
    performance = random.randint(1, 10)  # 1 to 10 scale, both ends inclusive
    health = random.choice(health_statuses)

    row = {'worker_id': worker_id}
    row['name'] = fake.name()
    row['skill_set'] = random.choice(skill_sets)
    row['experience_level'] = level
    row['availability'] = random.choice(['Available', 'Unavailable'])
    row['health_status'] = health
    row['past_performance'] = performance
    # 'wage_rate' is currently not generated:
    # row['wage_rate'] = round(random.uniform(15, 50), 2)  # hourly rate in dollars
    row['job_sites'] = random.choice(job_sites)
    row['productivity_score'] = calculate_productivity(level, performance, health)
    return row

# Build one row per worker; ids run from 1 to num_records inclusive
records = list(map(generate_record, range(1, num_records + 1)))

# Tabulate the rows
df = pd.DataFrame(records)

# Persist the table without the DataFrame index column
df.to_csv('generated_workers.csv', index=False)

In [17]:
import random
import csv

import statistics

# Skills used as keys for the per-site day estimates
skill_set = ['Plumbing', 'Electrical', 'Carpentry', 'Painting', 'Landscaping', 'HVAC', 'Roofing', 'Drywall', 'Flooring', 'Heavy Machinery Operation']

# One random day count (1-100 inclusive) per skill
skill_days_dict = dict((skill, random.randint(1, 100)) for skill in skill_set)

job_sites = list('ABCDEFGHIJ')  # site labels 'A' through 'J'
safety_requirements = ['low', 'med', 'high']


def _make_job_site_record(site):
    """Return the record for one job site with freshly drawn day counts."""
    days_by_skill = {skill: random.randint(1, 100) for skill in skill_set}
    skill_rows = [{'skill': name, 'days': count} for name, count in days_by_skill.items()]

    average_days = statistics.mean(list(days_by_skill.values()))

    return {
        'job_site': site,
        'skill_days_dict': skill_rows,  # list of {'skill', 'days'} dicts drawn for this site
        'safety_requirements': random.choice(safety_requirements),
        # "work done" is defined here as 100 minus the mean day count
        'percentage_work_done': 100 - average_days,
    }


records = [_make_job_site_record(site) for site in job_sites]

def _format_skill_days(skill_rows):
    """Flatten [{'skill', 'days'}, ...] into 'Skill: days; Skill: days' text."""
    return '; '.join(f"{entry['skill']}: {entry['days']}" for entry in skill_rows)


with open('job_site.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Header row, including the percentage column
    writer.writerow(['job_site', 'skill_days_dict', 'safety_requirements', 'percentage_work_done'])
    # One row per job site; the percentage is written with two decimals
    writer.writerows(
        [
            site_record['job_site'],
            _format_skill_days(site_record['skill_days_dict']),
            site_record['safety_requirements'],
            f"{site_record['percentage_work_done']:.2f}",
        ]
        for site_record in records
    )

In [18]:
import pandas as pd
import os

# Create a directory to store the ranks folder
os.makedirs('ranks', exist_ok=True)

# Read the generated workers data from the CSV file
df = pd.read_csv('generated_workers.csv')

# Group the workers by skill set
grouped = df.groupby('skill_set')

# Write one ranking file per skill: ranks/<skill>/skill_rank.csv.
# The loop variable is deliberately not named `skill_set`: that name holds the
# list of all skills in the job-site generation cell, and rebinding it here to
# a single string would silently replace that list for any cell run afterwards.
for skill_name, skill_group in grouped:
    # Highest productivity first; a stable sort keeps tied workers in their
    # original file order so the ranking is the same on every run.
    ranked_group = skill_group.sort_values(
        'productivity_score', ascending=False, kind='mergesort'
    )

    # Create a separate folder for each skill
    folder_path = os.path.join('ranks', skill_name)
    os.makedirs(folder_path, exist_ok=True)

    # Save the ranked group to a CSV file in the corresponding folder
    file_path = os.path.join(folder_path, 'skill_rank.csv')
    ranked_group.to_csv(file_path, index=False)


In [27]:
import pandas as pd


# Load the job-site table from job_site.csv. Without this explicit load the cell
# only works when `df_job_sites` happens to be left over in the kernel, and it
# fails with a NameError on a fresh Restart & Run All.
df_job_sites = pd.read_csv('job_site.csv')


def _parse_skill_days(encoded):
    """Parse 'Skill A: 3; Skill B: 7' into [('Skill A', 3), ('Skill B', 7)].

    Empty segments (for example after a trailing ';') are skipped, and each
    segment is split on its LAST ':' so a skill name may itself contain a colon.
    """
    parsed = []
    for segment in encoded.split(';'):
        if not segment.strip():
            continue
        skill, days = segment.rsplit(':', 1)
        parsed.append((skill.strip(), int(days.strip())))
    return parsed


# Rows of [job site, skill, percentage of that site's total days]
skill_percentages = []

for _index, row in df_job_sites.iterrows():
    # Parse the string once and reuse it for both the total and the shares
    skill_days = _parse_skill_days(row['skill_days_dict'])
    total_days_current_job_site = sum(days for _skill, days in skill_days)

    for skill, days in skill_days:
        # Share of this site's total days taken by the skill; 0 when the total is 0
        skill_percentage = (days / total_days_current_job_site) * 100 if total_days_current_job_site > 0 else 0

        # The site label comes from the 'job_site' column
        skill_percentages.append([row['job_site'], skill, skill_percentage])

# Convert the list to a DataFrame
df_skill_percentages = pd.DataFrame(skill_percentages, columns=['Job Site ID', 'Skill', 'Percentage'])

print(df_skill_percentages)

   Job Site ID                      Skill  Percentage
0            A                   Plumbing    2.255639
1            A                 Electrical    4.010025
2            A                  Carpentry   14.786967
3            A                   Painting   13.784461
4            A                Landscaping    2.506266
..         ...                        ...         ...
95           J                       HVAC    5.222437
96           J                    Roofing   10.831721
97           J                    Drywall    3.288201
98           J                   Flooring   18.762089
99           J  Heavy Machinery Operation   11.218569

[100 rows x 3 columns]
