In [1]:
import pandas as pd
import random
import statistics

# Number of students per group; passed to grouping() as `group_size`.
GROUP_SIZE = 5
# Skills counted by the scoring functions: a group earns one point for each
# of these that at least one of its members lists.
required_skills = {'leadership', 'technical', 'creative', 'research', 'communication'}

In [2]:
# Load the student roster. Later cells read the 'name' and 'skills' columns
# plus every column whose header starts with "project".
df = pd.read_csv('students2.csv')

In [4]:
# Project rating columns are identified by name: every column whose header
# starts with "project" is treated as one project's ratings.
project_columns = [col for col in df.columns if col.startswith("project")]

In [5]:
# Process the DataFrame into a list of student dictionaries, one per row:
#   'name'             - the row's name value, unchanged
#   'skills'           - list of lower-cased, stripped skill names
#   'projects_ratings' - {project column: numeric rating}
students = []
for _, row in df.iterrows():
    # Skills arrive as one comma-separated string. An empty cell is read by
    # pandas as NaN; treat it as "no skills" instead of the literal skill 'nan',
    # and drop empty tokens produced by stray or trailing commas.
    raw_skills = row['skills']
    if pd.isna(raw_skills):
        skills = []
    else:
        skills = [s.strip().lower() for s in str(raw_skills).split(',') if s.strip()]

    # Project ratings: each project column should contain a number (1 to 5).
    # Anything missing or unparsable falls back to 0.
    projects_ratings = {}
    for project in project_columns:
        try:
            rating = float(row[project])
        except (TypeError, ValueError):
            rating = 0
        # float(NaN) succeeds, so a missing cell would otherwise slip through
        # and turn every standard deviation computed downstream into NaN.
        if rating != rating:
            rating = 0
        projects_ratings[project] = rating

    students.append({
        'name': row['name'],
        'skills': skills,
        'projects_ratings': projects_ratings
    })

In [6]:
def diversity_score_group(group, project_columns):
    """
    Scores how diverse a group is, in both skills and project preferences.

    The result is the number of entries of the module-level `required_skills`
    that at least one member lists, plus, for every project in
    `project_columns`, the sample standard deviation of the members' ratings
    for that project. A project contributes 0 when the group has fewer than
    two members, because a standard deviation needs at least two values.
    """
    # Pool every skill listed by any member of the group.
    pooled_skills = set().union(*(member['skills'] for member in group))
    skills_component = len(pooled_skills.intersection(required_skills))

    def rating_spread(project):
        # Spread of the members' ratings for one project (0 if undefined).
        values = [member['projects_ratings'][project] for member in group]
        return statistics.stdev(values) if len(values) > 1 else 0

    return skills_component + sum(rating_spread(project) for project in project_columns)

In [15]:
def grouping(students, group_size, project_columns, alpha=1.0):
    """
    Greedily splits students into groups of at most `group_size` members,
    favouring similar project ratings and varied skills.

    Students are visited in random order. Each one joins the not-yet-full
    group whose composite score (see `composite_score_group`) increases the
    most when that student is added; ties go to the lowest-numbered group.

    Args:
        students: list of student dicts as consumed by `composite_score_group`.
            The list itself is not modified.
        group_size: maximum number of members per group (must be >= 1).
        project_columns: project names passed through to the scoring function.
        alpha: rating-spread penalty weight passed through to the scoring function.

    Returns:
        A list of groups, each a list of student dicts. The number of groups is
        len(students) / group_size rounded up, so every student gets a seat even
        when the roster size is not a multiple of `group_size`; in that case some
        groups end up with fewer than `group_size` members.

    Raises:
        ValueError: if `group_size` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    # Ceiling division: rounding down would leave the leftover students with
    # no open group to join.
    n_groups = -(-len(students) // group_size)
    groups = [[] for _ in range(n_groups)]

    # Randomize the visiting order on a copy so the caller's list keeps its order.
    visit_order = list(students)
    random.shuffle(visit_order)

    # Assign each student to the open group with the best composite score improvement.
    for student in visit_order:
        best_group_index = None
        best_increase = -float('inf')

        for i, members in enumerate(groups):
            # Only consider groups that are not yet full.
            if len(members) >= group_size:
                continue

            current_score = composite_score_group(members, project_columns, alpha)
            new_score = composite_score_group(members + [student], project_columns, alpha)
            score_increase = new_score - current_score

            # NaN (e.g. produced by a NaN rating) compares False against
            # everything; rank it last instead of leaving the student unplaced.
            if score_increase != score_increase:
                score_increase = -float('inf')

            if best_group_index is None or score_increase > best_increase:
                best_increase = score_increase
                best_group_index = i

        groups[best_group_index].append(student)

    return groups


In [14]:
def composite_score_group(group, project_columns, alpha=1.0):
    """
    Scores a group so that broad skill coverage raises the score and
    disagreement on project ratings lowers it.

    The score is the number of entries of the module-level `required_skills`
    listed by at least one member, minus `alpha` times the summed sample
    standard deviations of the members' ratings over `project_columns`.
    Projects add nothing to the penalty when the group has fewer than two
    members, since a standard deviation is undefined there.
    """
    # Skill coverage: distinct skills across all members, restricted to the required ones.
    pooled_skills = {skill for member in group for skill in member['skills']}
    skill_points = len(pooled_skills.intersection(required_skills))

    def rating_spread(project):
        # Standard deviation of one project's ratings, or 0 when it cannot be computed.
        values = [member['projects_ratings'][project] for member in group]
        if len(values) < 2:
            return 0
        return statistics.stdev(values)

    # Lower spread means more similar preferences, so the spread is subtracted.
    spread_penalty = sum(map(rating_spread, project_columns))
    return skill_points - alpha * spread_penalty

In [13]:
# Build the groups from a copy so the `students` list keeps its original order.
groups = grouping(students.copy(), GROUP_SIZE, project_columns, alpha=1.0)

# Report each group's composite score, followed by its members and their skills.
for idx, group in enumerate(groups, 1):
    score = composite_score_group(group, project_columns, alpha=1.0)
    print(f"Group {idx} (Composite Score: {score:.2f}):")
    for student in group:
        print(f"  - {student['name']} | Skills: {', '.join(student['skills'])}")
    print()

Group 1 (Composite Score: -4.46):
  - Fiona | Skills: technical, research
  - Tina | Skills: communication, leadership
  - Alice | Skills: leadership, communication
  - Ian | Skills: technical, communication
  - Bob | Skills: technical, research

Group 2 (Composite Score: -3.39):
  - Julia | Skills: creative, leadership
  - Charlie | Skills: creative, technical
  - George | Skills: creative, communication
  - Evan | Skills: leadership, creative
  - Diana | Skills: research, communication

Group 3 (Composite Score: -3.03):
  - Nina | Skills: research, creative
  - Mike | Skills: leadership, technical
  - Hannah | Skills: leadership, research
  - Steve | Skills: technical, creative
  - Oliver | Skills: communication, leadership

Group 4 (Composite Score: -3.74):
  - Rachel | Skills: leadership, research
  - Quentin | Skills: creative, communication
  - Laura | Skills: creative, communication
  - Kevin | Skills: technical, research
  - Paula | Skills: technical, research

