``` {r}
# Install workflowr only when it is missing, so that re-knitting the document
# does not reinstall the package every time.
if (!requireNamespace("workflowr", quietly = TRUE)) {
  install.packages("workflowr")
}
```

# Representing a tree with dictionaries

- <https://blog.finxter.com/5-best-ways-to-construct-and-manage-a-tree-in-python/>
- <https://builtin.com/articles/tree-python> (this one is more complex)
- <https://bigtree.readthedocs.io/en/0.14.8/> (there is this package to
  create trees, but maybe it is too complex for us)

Pouly, Marc. "Estimating Text Similarity based on Semantic Concept
Embeddings." arXiv preprint arXiv:2401.04422 (2024).

``` {python}
import torch
import einops
import math


from transformers import AutoModel
# Load the Jina AI embeddings model; calculate_similarity() reads this global.
# trust_remote_code=True lets transformers execute the custom model code
# shipped in the Hugging Face repository.


model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)

# Small example taxonomy used to try out the metrics below.
# Nested dicts are categories; string values are characteristics (leaves).
taxonomy_tree = {
    '1': {
        '2': {
            'A': 'Lake',
            'B': 'River'
        },
        'C': 'House',
        '3': {
            '4': {
                'D': 'Mountain',
                'E': 'Everest',
                'F': 'Volcano'
            }
        }
    }
}


# Function to extract leaf nodes
def get_leaf_nodes(taxonomy):
    """
    Collect every leaf (non-dict value) of a nested-dict taxonomy.

    Parameters:
    taxonomy (dict): Nested dictionaries; any value that is not a dict is a leaf.

    Returns:
    dict: Maps each leaf's own key to its value, in traversal order. When a
    leaf's key is already taken by a leaf from another branch, the later leaf
    is stored under its full "/"-joined path instead, so that it does not
    overwrite the earlier one.
    """
    leaves = {}

    def traverse(node, path):
        if isinstance(node, dict):
            for k, v in node.items():
                traverse(v, path + [k])
            return
        key = path[-1]
        if key in leaves:
            # Same leaf key in another branch: disambiguate with the full path
            # rather than silently dropping the earlier leaf.
            key = "/".join(str(part) for part in path)
        leaves[key] = node

    traverse(taxonomy, [])
    return leaves

# Cosine similarity between two texts, based on the Jina AI embeddings model
def calculate_similarity(text1, text2):
    """
    Return the cosine similarity between the embeddings of two texts.

    Parameters:
    text1: First text to embed.
    text2: Second text to embed.

    Returns:
    float: Cosine similarity of the two embedding vectors.
    """
    # A single encode call produces both embeddings
    vectors = model.encode([text1, text2])
    first = torch.tensor(vectors[0])
    second = torch.tensor(vectors[1])
    score = torch.nn.functional.cosine_similarity(first, second, dim=0)
    return score.item()

# Robustness R(T) of a taxonomy
def calculate_r_t(taxonomy):
    """
    Compute the robustness score R(T) of a taxonomy.

    The leaf values are split, in traversal order, into consecutive groups of
    two (the last group has one member when the leaf count is odd). For each
    group the threshold is the smallest pairwise similarity inside the group
    (0 when the group has no pair). A leaf outside the group is an intruder
    when its similarity to the group's first member exceeds that threshold.

    Parameters:
    taxonomy (dict): Nested-dict taxonomy.

    Returns:
    float: Mean over all groups of 1 - n_ic / (n_gc * (n_ac - n_gc)), where a
    group scores 0 when that denominator is not positive; 0 if there are no
    leaves.
    """
    leaf_names = list(get_leaf_nodes(taxonomy).values())
    n_ac = len(leaf_names)
    groups = [leaf_names[start:start + 2] for start in range(0, n_ac, 2)]
    if not groups:
        return 0

    scores = []
    for members in groups:
        # Similarity of every unordered pair inside the group
        pair_sims = [
            calculate_similarity(left, right)
            for idx, left in enumerate(members)
            for right in members[idx + 1:]
        ]
        threshold = min(pair_sims) if pair_sims else 0

        # Outsiders that are closer to the first member than the threshold
        n_ic = sum(
            1
            for candidate in leaf_names
            if candidate not in members
            and calculate_similarity(candidate, members[0]) > threshold
        )

        n_gc = len(members)
        denominator = n_gc * (n_ac - n_gc)
        if denominator > 0:
            scores.append(1 - (n_ic / denominator))
        else:
            scores.append(0)

    return sum(scores) / len(groups)
  

def extract_ncat(taxonomy):
    """
    Count the categories (dict nodes) of a taxonomy.

    The root dict is not counted, and neither is the first dict found below
    it in pre-order. Every other dict adds one to the count and has its keys
    printed.

    Parameters:
    taxonomy (dict): Nested-dict taxonomy.

    Returns:
    int: Number of counted categories.
    """
    if not isinstance(taxonomy, dict):
        return 0

    ncat = 0
    skipped_first = False
    # Explicit stack; children are pushed reversed so they pop in dict order
    pending = list(reversed(list(taxonomy.values())))
    while pending:
        node = pending.pop()
        if not isinstance(node, dict):
            continue
        if skipped_first:
            ncat += 1
            print(f"Found category: {list(node.keys())}")
        else:
            skipped_first = True
        pending.extend(reversed(list(node.values())))
    return ncat



def extract_nchar(taxonomy):
    """
    Count the characteristics (non-dict leaves) of a taxonomy.

    Parameters:
    taxonomy (dict): Nested-dict taxonomy.

    Returns:
    int: Number of leaves found anywhere in the tree.
    """
    def leaf_total(node):
        # A non-dict node is one characteristic; a dict contributes its children
        if not isinstance(node, dict):
            return 1
        return sum(leaf_total(child) for child in node.values())

    return leaf_total(taxonomy)

def extract_depths_cat(taxonomy):
    """
    List the depth of every category (dict node), root included at depth 0.

    Parameters:
    taxonomy (dict): Nested-dict taxonomy.

    Returns:
    list: Depths in pre-order (a category before its children).
    """
    def walk(node, level):
        if not isinstance(node, dict):
            return
        yield level
        for child in node.values():
            yield from walk(child, level + 1)

    return list(walk(taxonomy, 0))
  
  
def extract_depths_char(taxonomy):
    """
    List the depth of every characteristic (non-dict leaf); the root is depth 0.

    Parameters:
    taxonomy (dict): Nested-dict taxonomy.

    Returns:
    list: Leaf depths in traversal (dict) order.
    """
    depths_char = []
    # Explicit stack; children are pushed reversed so they pop in dict order
    pending = [(taxonomy, 0)]
    while pending:
        node, depth = pending.pop()
        if isinstance(node, dict):
            children = list(node.values())
            pending.extend((child, depth + 1) for child in reversed(children))
        else:
            depths_char.append(depth)
    return depths_char



import math

def calculate_conciseness(ncat, nchar, depths_cat, depths_char):
    """
    Calculate the conciseness of the taxonomy using the proposed formula.

    C(T) = 1 / (1 + ln(S - 1)), where S is the sum of 1/depth over all
    categories and characteristics with depth > 0.

    Parameters:
    ncat (int): The number of categories; category depths are ignored when it is 0.
    nchar (int): The number of characteristics; their depths are ignored when it is 0.
    depths_cat (list): A list of depths for categories.
    depths_char (list): A list of depths for characteristics.

    Returns:
    float: The conciseness value of the taxonomy, or 0 when the formula is
    undefined (S - 1 <= 0, or a zero denominator).
    """
    # Only include depths greater than 0 to avoid division by zero
    sum_cat = sum(1 / d for d in depths_cat if d > 0) if ncat > 0 else 0
    sum_char = sum(1 / d for d in depths_char if d > 0) if nchar > 0 else 0
    total_sum = sum_cat + sum_char

    # math.log raises ValueError for arguments <= 0, so the guard must be on
    # total_sum - 1 (the actual argument), not on total_sum.
    log_argument = total_sum - 1
    if log_argument <= 0:
        return 0

    denominator = 1 + math.log(log_argument)
    if denominator == 0:
        return 0  # Formula is undefined here; avoid ZeroDivisionError

    return 1 / denominator

  
# Structural metrics of the example taxonomy.
# extract_ncat also prints the keys of each category it counts.
ncat = extract_ncat(taxonomy_tree)
nchar = extract_nchar(taxonomy_tree)
depths_cat = extract_depths_cat(taxonomy_tree)
depths_char = extract_depths_char(taxonomy_tree)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

# Calculate R(T) for the given taxonomy
# (calls the embedding model once per compared pair of leaves)
leaves=get_leaf_nodes(taxonomy_tree)
print(leaves)
robustness_value = calculate_r_t(taxonomy_tree)
print(f"Robustness R(T): {robustness_value:.4f}")
# Conciseness is derived from the counts and depths computed above
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')

```

## 1st paper: A software cost estimation taxonomy for global software development projects

``` {python}
# Taxonomy of the 1st paper (cost estimation for global software development).
# Nested dicts are categories; string values are the characteristics whose
# text is sent to the embedding model, so the leaf texts must be spelled
# correctly ("Producte requirement.Other" fixed to "Product requirement.Other").
new_taxonomy = {
    'Cost estimation for GSD': {
        'Cost estimation context': {
            'Planning': {
                "Conceptualization": "Conceptualization",
                "Feasibility study": "Feasibility study",
                "Preliminary planning": "Preliminary planning",
                "Detail Planning": "Detail planning",
                "Execution": "Execution",
                "Commissioning": "Commissioning"
            },
            'Project activities': {
                "System investigation": "System investigation",
                "Analysis": "Analysis",
                "Design": "Design",
                "Implementation": "Implementation",
                "Testing": "Testing",
                "Maintenance": "Maintenance",
                "Other Project Activities": "Project Activities.Other"
            },
            'Project domain': {
                "SE": "Systems Engineering",
                "Research & Dev": {
                    "Telecommunication": "Telecommunication"
                },
                "Finance": "Finance",
                "Healthcare": "Healthcare",
                "Other Project Domain": "Project Domain.Other"
            },
            'Project setting': {
                "Close onshore": "Close onshore",
                "Distant onshore": "Distant onshore",
                "Near offshore": "Near offshore",
                "Far offshore": "Far offshore"
            },
            'Planning approaches': {
                "Constructive Cost Model": "Constructive Cost Model",
                "Capability Maturity Model Integration": "Capability Maturity Model Integration",
                "Agile": "Agile",
                "Delphi": "Delphi",
                "GA": "Genetic Algorithms",
                "CBR": "Case-Based Reasoning",
                "Fuzzy similar": "Fuzzy similar",
                "Other planning approaches": "Planning Approaches.other"
            },
            'Number of sites': {
                "Value of number of sites": "Number of sites.Value"
            },
            'Team size': {
                "No of team members": "Number of team members"
            }
        },
        'Estimation technique': {
            'Estimation technique': {
                "Expert judgment": "Expert judgment",
                "Machine learning": "Machine learning",
                "Non-machine learning": "Non-machine learning"
            },
            'Use technique': {
                "Individual": "Individual",
                "Group-based estimation": "Group-based estimation"
            }
        },
        'Cost estimate': {
            'Estimated cost': {
                "Estimate value": "Estimated value"
            },
            'Actual cost': {
                "Value": "Actual cost.Value"
            },
            'Estimation dimension': {
                "Effort hours": "Effort hours",
                "Staff/cost": "Staff/cost",
                "Hardware": "Hardware",
                "Risk": "Risk",
                "Portfolio": "Portfolio"
            },
            'Accuracy measure': {
                "Baseline comparison": "Baseline comparison",
                "Variation reduction": "Variation reduction",
                "Sensitivity analysis": "Sensitivity analysis"
            }
        },
        'Cost estimators': {
            'Product size': {
                "Size report": "Size report",
                "Statistics analysis": "Statistics analysis"
            },
            # NOTE: 'Team experience' and 'Team structure' both contain a leaf
            # keyed "Considered"; only the leaf values tell them apart.
            'Team experience': {
                "Considered": "Team experience.Considered",
                "Not considered": "Team experience.Not considered"
            },
            'Team structure': {
                "Considered": "Team structure.Considered",
                "Not Considered": "Team structure.Not considered"
            },
            'Product requirement': {
                "Performance": "Performance",
                "Security": "Security",
                "Availability": "Availability",
                "Reliability": "Reliability",
                "Maintainability": "Maintainability",
                "Other requirement": "Product requirement.Other"
            },
            'Distributed teams distances': {
                "Geographical distance": "Geographical distance",
                "Temporal distance": "Temporal distance",
                "Socio-cultural distance": "Socio-cultural distance"
            }
        }
    }
}

# Keep a named reference; new_taxonomy is reassigned by the later chunks.
bajta_tax = new_taxonomy
```

```{python, eval=FALSE}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness = calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
```


## 2nd paper: A taxonomy of web effort predictors
    ```{python}
    # Taxonomy of the 2nd paper (web effort predictors).
    # Nested dicts are categories; string values are the characteristics.
    new_taxonomy = {
        'Web Predictor': {
            'Size Metric': {
                'Length': {
                            'Web page count': 'Web page count',
                            'Media count': 'Media count',
                            'New media count': 'New media count',
                            'New Web page count': 'New Web page count',
                            'Link count': 'Link count',
                            'Program count': 'Program count',
                            'Reused component count': 'Reused component count',
                            'Lines of code': 'Lines of code',
                            'Reused program count': 'Reused program count',
                            'Reused media count': 'Reused media count',
                            'Web page allocation': 'Web page allocation',
                            'Reused lines of code': 'Reused lines of code',
                            'Media allocation': 'Media allocation',
                            'Reused media allocation': 'Reused media allocation',
                            'Entity count': 'Entity count',
                            'Attribute count': 'Attribute count',
                            'Component count': 'Component count',
                            'Statement count': 'Statement count',
                            'Node count': 'Node count',
                            'Collection slot size': 'Collection slot size',
                            'Component granularity level': 'Component granularity level',
                            'Slot granularity level': 'Slot granularity level',
                            'Model node size': 'Model node size',
                            'Cluster node size': 'Cluster node size',
                            'Node slot size': 'Node slot size',
                            'Publishing model unit count': 'Publishing model unit count',
                            'Model slot size': 'Model slot size',
                            'Association slot size': 'Association slot size',
                            'Client script count': 'Client script count',
                            'Server script count': 'Server script count',
                            'Information slot count': 'Information slot count',
                            'Association center slot count': 'Association center slot count',
                            'Collection center slot count': 'Collection center slot count',
                            'Component slot count': 'Component slot count',
                            'Semantic association count': 'Semantic association count',
                            'Segment count': 'Segment count',
                            'Slot count': 'Slot count',
                            'Cluster slot count': 'Cluster slot count',
                            'Cluster count': 'Cluster count',
                            'Publishing unit count': 'Publishing unit count',
                            'Section count': 'Section count',
                            'Inner/sub concern count': 'Inner/sub concern count',
                            'Indifferent concern count': 'Indifferent concern count',
                            'Module point cut count': 'Module point cut count',
                            'Module count': 'Module count',
                            'Module attribute count': 'Module attribute count',
                            'Operation count': 'Operation count',
                            'Comment count': 'Comment count',
                            'Reused comment count': 'Reused comment count',
                            'Media duration': 'Media duration',
                            'Diffusion cut count': 'Diffusion cut count',
                            'Concern module count': 'Concern module count',
                            'Concern operation count': 'Concern operation count',
                            'Anchor count': 'Anchor count'},
                'Functionality': {
                            'High feature count': 'High feature count',
                            'Low feature count': 'Low feature count',
                            'Reused high feature count': 'Reused high feature count',
                            'Reused low feature count': 'Reused low feature count',
                            'Web objects': 'Web objects',
                            'Common Software Measurement International Consortium': 'Common Software Measurement International Consortium',
                            'International Function Point Users Group': 'International Function Point Users Group',
                            'Object-Oriented Heuristic Function Points': 'Object-Oriented Heuristic Function Points',
                            'Object-Oriented Function Points': 'Object-Oriented Function Points',
                            'Use case count': 'Use case count',
                            'Feature count': 'Feature count',
                            'Data Web points': 'Data Web points'},

                'Object-oriented': {
                            'Cohesion': 'Cohesion',
                            'Class coupling': 'Class coupling',
                            'Concern coupling': 'Concern coupling'},

                'Complexity': {
                            'Connectivity density': 'Connectivity density',
                            'Cyclomatic complexity': 'Cyclomatic complexity',
                            'Model collection complexity': 'Model collection complexity',
                            'Model association complexity': 'Model association complexity',
                            'Model link complexity': 'Model link complexity',
                            'Page complexity': 'Page complexity',
                            'Component complexity': 'Component complexity',
                            'Total complexity': 'Total complexity',
                            'Adaptation complexity': 'Adaptation complexity',
                            'New complexity': 'New complexity',
                            'Data usage complexity': 'Data usage complexity',
                            'Data flow complexity': 'Data flow complexity',
                            'Cohesion complexity': 'Cohesion complexity',
                            'Interface complexity': 'Interface complexity',
                            'Control flow complexity': 'Control flow complexity',
                            'Class complexity': 'Class complexity',
                            'Layout complexity': 'Layout complexity',
                            'Input complexity': 'Input complexity',
                            'Output complexity': 'Output complexity'}
                            },
            'Cost Driver': {
              'Product':{
                'Type of product': 'Product.Type',
                'Stratum': 'Stratum',
                'Compactness': 'Compactness',
                'Structure': 'Structure',
                'Architecture': 'Architecture',
                'Integration with legacy systems': 'Integration with legacy systems',
                'Concurrency level': 'Concurrency level',
                'Processing requirements': 'Processing requirements',
                'Database size': 'Database size',
                'Requirements volatility level': 'Requirements volatility level',
                'Requirements novelty level': 'Requirements novelty level',
                'Reliability level': 'Reliability level',
                'Maintainability level': 'Maintainability level',
                'Time efficiency level': 'Time efficiency level',
                'Memory efficiency level': 'Memory efficiency level',
                'Portability level': 'Portability level',
                'Scalability level': 'Scalability level',
                'Quality level': 'Quality level',
                'Usability level': 'Usability level',
                'Readability level': 'Readability level',
                'Security level': 'Security level',
                'Installability level': 'Installability level',
                'Modularity level': 'Modularity level',
                'Flexibility level': 'Flexibility level',
                'Testability level': 'Testability level',
                'Accessibility level': 'Accessibility level',
                'Trainability level': 'Trainability level',
                'Innovation level': 'Innovation level',
                'Technical factors': 'Technical factors',
                'Storage constraint': 'Storage constraint',
                'Reusability level': 'Reusability level',
                'Robustness level': 'Robustness level',
                'Design volatility': 'Design volatility',
                'Experience level': 'Experience level',
                'Requirements clarity level': 'Requirements clarity level'},
            'Client': {
                'Availability level': 'Availability level',
                'IT literacy': 'IT literacy',
                'Mapped workflows': 'Mapped workflows',
                'Personality of client': 'Client.Personality'},

            'Development Company': {
                'SPI program': 'SPI program',
                'Metrics’ program': 'Metrics’ program',
                'Number of projects in parallel': 'Number of projects in parallel',
                'Software reuse': 'Software reuse'},
            'Project': {
                'Documentation level': 'Documentation level',
                'Number of programming languages': 'Number of programming languages',
                'Type of project': 'Project.Type',
                'Process efficiency level': 'Process efficiency level',
                'Project management level': 'Project management level',
                'Infrastructure': 'Infrastructure',
                'Development restriction': 'Development restriction',
                'Time restriction': 'Time restriction',
                'Risk level': 'Risk level',
                'Rapid app development': 'Rapid app development',
                'Operational mode': 'Operational mode',
                'Resource level': 'Resource level',
                'Lessons learned repository': 'Lessons learned repository'},
            'Team': {
                'Domain experience level': 'Domain experience level',
                'Team size': 'Team size',
                'Deployment platform experience level': 'Deployment platform experience level',
                'Team capability': 'Team capability',
                'Programming language experience level': 'Programming language experience level',
                'Tool experience level': 'Tool experience level',
                'Communication level': 'Communication level',
                'Software development experience': 'Software development experience',
                'Work Team level': 'Work Team level',
                'Stability level': 'Stability level',
                'Motivation level': 'Motivation level',
                'Focus factor': 'Focus factor',
                # NOTE(review): a second, identical 'Tool experience level'
                # entry stood here. A duplicate key in a dict literal is
                # silently collapsed, so it was removed; check the paper in
                # case a different predictor was intended.
                'OO experience level': 'OO experience level',
                'In-house experience': 'In-house experience'},
            'Technology': {
                'Authoring tool type': 'Authoring tool type',
                'Productivity level': 'Productivity level',
                'Novelty level': 'Novelty level',
                'Platform volatility level': 'Platform volatility level',
                'Difficulty level': 'Difficulty level',
                'Platform support level': 'Platform support level'}}

    }
    }

    # Keep a named reference; new_taxonomy is reassigned by the later chunks.
    britto1_tax=new_taxonomy

    ```

```{python, eval=FALSE}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness = calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
```



## 3rd paper: A specialized global software engineering taxonomy for effort estimation
    ```{python}

    # Taxonomy of the 3rd paper (global software engineering, effort estimation).
    # Nested dicts are categories; string values are the characteristics.
    # NOTE: 'Site' and 'Relationship' repeat the same four leaves ("Location",
    # "Legal Entity", "Geographic Distance", "Temporal Distance") with
    # identical keys and values.
    new_taxonomy = {
        'GSE': {
            'Project': {
                'Site': {
                    "Location": "Location",
                    "Legal Entity": "Legal Entity",
                    "Geographic Distance": "Geographic Distance",
                    "Temporal Distance": "Temporal Distance",
                    "Estimation stage": {
                        "Early Estimation stage": "Estimation stage.Early",
                        "Early & Late Estimation stage": "Estimation stage.Early & Late",
                        "Late Estimation stage": "Estimation stage.Late"
                    },
                    "Estimation process role": {
                        "Estimator": "Estimator",
                        "Estimator & Provider": "Estimator & Provider",
                        "Provider": "Provider"
                    }
                },
                'Relationship': {
                    "Location": "Location",
                    "Legal Entity": "Legal Entity",
                    "Geographic Distance": "Geographic Distance",
                    "Temporal Distance": "Temporal Distance",
                    "Estimation process architectural model": {
                        "Centralized": "Centralized",
                        "Distributed": "Distributed",
                        "Semi-distributed": "Semi-distributed"
                    }
                }
            }
        }
    }

    # Keep a named reference; new_taxonomy is reassigned by the later chunks.
    britto2_tax=new_taxonomy

    ```

```{python, eval=FALSE}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness = calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
```


## 4th paper: A taxonomy of Approaches and Methods for Software Effort Estimation
    ```{python}
    # Taxonomy of the 4th paper (approaches and methods for software effort
    # estimation). Nested dicts are categories; string values are the
    # characteristics.
    new_taxonomy = {
        'Software estimation': {
            'Basic Estimating Methods': {
                "Algorithmic": {
                    "Constructive Cost Model": "Constructive Cost Model",
                    "Software Life Cycle Management": "Software Life Cycle Management",
                    "Software Evaluation and Estimation for Risk": "Software Evaluation and Estimation for Risk"
                },
                "Non-Algorithmic": {
                    "Expert Judgment": "Expert Judgment",  # Corrected spelling
                    "Analogy-Based": "Analogy-Based"
                }
            },
            'Combined Estimating Methods': {
                "Basic-Combination": "Basic-Combination",
                # NOTE(review): "Legal Entity" and the category name
                # "Estimation process architectural model" also appear in the
                # 3rd paper's taxonomy and look like copy-paste leftovers
                # here; confirm against the paper.
                "Legal Entity": "Legal Entity",
                "Estimation process architectural model": {
                    "Fuzzy Logic": "Fuzzy Logic",
                    "Artificial Neural Networks": "Artificial Neural Networks",
                    "Computational Intelligence": {  # Corrected spelling
                        "swarm": "swarm",
                        "evolutionary": "evolutionary"
                    }
                },
                "AI-Combined hybrid": "AI-Combined hybrid"
            }
        }
    }

    # Keep a named reference; new_taxonomy is reassigned by the later chunks.
    dashti_tax=new_taxonomy

    ```

```{python, eval=FALSE}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness = calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
```


## 5th paper: Towards a Taxonomy of Hypermedia and Web Application Size Metrics
    ```{python}

    # Taxonomy of the 5th paper (hypermedia and Web application size metrics).
    # Nested dicts are categories; string values are the characteristics whose
    # text is sent to the embedding model ("Program/Sript" fixed to
    # "Program/Script").
    new_taxonomy = {
      "Hypermedia and Web Application Size Metrics":{
        "Motivation":{"Motivation":"Motivation"},
        "Harvesting time":{
          "Early":"Early size metric",
          "Late":"Late size metric"},
        "Metric foundation":{
          "Problem-oriented metric":"Problem-oriented metric",
          "Solution-oriented metric":"Solution-oriented metric"},
        "Class":{
          "Length":"Length",
          "Functionality":"Functionality",
          "Complexity":"Complexity"},
        "Entity":{
          "Web hypermedia application":"Web hypermedia application",
          "Web software application":"Web software application",
          "Web application":"Web application",
          "Media":"Media",
          "Program/Script":"Program/Script"},
        "Measurement Scale":{
          "Nominal":"Nominal",
          "Ordinal":"Ordinal",
          "Interval":"Interval",
          "Ratio":"Ratio",
          "Absolute":"Absolute"},
        "Computation":{
          "Direct":"Direct",
          "Indirect":"Indirect"},
        "Validation":{
          "Validated Empirically":"Validated Empirically",
          "Validated Theoretically":"Validated Theoretically",
          "Both Empirically and Theoretically":"Validation.Both",
          "No Validation":"Validation.None"},
        "Model dependency":{
          "Specific":"Specific",
          "Nonspecific":"Nonspecific"}
    }
    }

    # Keep a named reference; new_taxonomy is reassigned by the later chunks.
    mendes_tax=new_taxonomy

    ```

```{python, eval=FALSE}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness = calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
```


## 6th paper: An Effort Estimation Taxonomy for Agile Software Development
    ```{python}
    # Effort-estimation taxonomy for agile software development, stored as a
    # nested dict: every dict-valued key is a category, every string value is
    # a leaf (characteristic).  Leaf values are the texts that get embedded,
    # so generic names carry their category as a prefix (e.g. "Type.Single").
    new_taxonomy = {
        'Effort Estimation in ASD': {
            'Estimation context': {
                "Planning level": {
                    "Release Planning level": "Planning level.Release",
                    "Sprint Planning level": "Planning level.Sprint",
                    "Daily Planning level": "Planning level.Daily",
                    "Bidding Planning level": "Planning level.Bidding"
                },
                "Estimated activities": {
                    "Analysis": "Analysis",
                    "Design": "Design",
                    "Implementation": "Implementation",
                    "Testing": "Testing",
                    "Maintenance": "Maintenance",
                    "All estimated activities": "Estimated activities.All"
                },
                "Agile methods": {
                    "Extreme Programming": "Extreme Programming",
                    "Scrum": "Scrum",
                    "Customized Extreme Programming": "Customized Extreme Programming",
                    "Customized Scrum": "Customized Scrum",
                    "Dynamic Systems Development Method": "Dynamic Systems Development Method",
                    "Crystal": "Crystal",
                    "Feature-Driven Development": "Feature-Driven Development",
                    "Kanban": "Kanban"
                },
                "Project domain": {
                    "Communications industry": "Communications industry",
                    "Transportation": "Transportation",
                    "Financial": "Financial",
                    "Education": "Education",
                    "Health": "Health",
                    "Retail/Wholesale": "Retail/Wholesale",
                    "Manufacturing": "Manufacturing",
                    "Government/Military": "Government/Military",
                    "Other project domain": "Project domain.Other"
                },
                "Project setting": {
                    "Co-located Project setting": "Project setting.Co-located",
                    "Distributed: Close Onshore": "Distributed: Close Onshore",
                    "Distributed: Distant Onshore": "Distributed: Distant Onshore",
                    "Distributed: Near Offshore": "Distributed: Near Offshore",
                    "Distributed: Far Offshore": "Distributed: Far Offshore"
                },
                "Estimation entity": {
                    "User story Estimation entity": "User story",
                    "Task Estimation entity": "Task",
                    "Use case Estimation entity": "Use case",
                    "Other Estimation entity": "Estimation entity.Other"
                },
                "Number of entities estimated": {
                    "Number of entities estimated": "Number of entities estimated"
                },
                "Team size": {
                    "No. of team members": "Team size.Value"
                }
            },
            'Estimation technique': {
                "Estimation Techniques": {
                    "Planning Poker": "Planning Poker",
                    "Expert Judgement": "Expert Judgement",
                    "Analogy": "Analogy",
                    "Use case points method": "Use case points method",
                    "Other estimation technique": "Estimation technique.Other"
                },
                "Type": {
                    "Single type": "Type.Single",
                    "Group type": "Type.Group"
                }
            },
            'Effort predictors': {
                "Size": {
                    "Story points": "Story points",
                    "Use case points": "Use case points",
                    "Function points": "Function points",
                    "Other Effort predictors": "Other Effort predictors",
                    "Not used Effort predictors": "Not used Effort predictors",
                    "Considered without any metric": "Considered without any metric"
                },
                "Team's prior experience": {
                    "Considered Team's prior experience": "Team's prior experience.Considered",
                    "Not Considered Team's prior experience": "Team's prior experience.Not Considered"
                },
                "Team's skill level": {
                    "Considered Team's skill level": "Team's skill level.Considered",
                    "Not Considered Team's skill level": "Team's skill level.Not Considered"
                },
                "Non functional requirements": {
                    "Performance": "Performance",
                    "Security": "Security",
                    "Availability": "Availability",
                    "Reliability": "Reliability",
                    "Maintainability": "Maintainability",
                    "Other Non functional requirements": "Non functional requirements.Other",
                    "Not considered Non functional requirements": "Non functional requirements.Not considered"
                },
                "Distributed teams' issues": {
                    "Considered Distributed teams": "Distributed teams.Considered",
                    "Not Considered Distributed teams": "Distributed teams.Not Considered",
                    "Not applicable Distributed teams": "Distributed teams.Not applicable"
                },
                "Customer Communication": {
                    "Considered Customer Communication": "Customer Communication.Considered",
                    "Not Considered Customer Communication": "Customer Communication.Not Considered"
                }
            },
            'Effort estimate': {
                "Estimated effort": {
                    "Estimate value(s)": "Estimate value(s)"
                },
                "Actual effort": {
                    "Actual effort Value": "Actual effort.Value"
                },
                "Type": {
                    "Point Type": "Point Type",
                    "Three point Type": "Three point Type",
                    "Distribution Type": "Distribution Type",
                    "Other Type": "Other Type"
                },
                "Unit": {
                    "Hours/days": "Hours/days",
                    "Pair days": "Pair/days",
                    "Ideal hours": "Ideal hours",
                    "Other Unit": "Unit.Other"
                },
                "Accuracy Level": {
                    "Accuracy Level Value": "Accuracy Level.Value"
                },
                "Accuracy measure": {
                    "Mean Magnitude of Relative Error": "Mean Magnitude of Relative Error",
                    "Median Magnitude of Relative Error": "Median Magnitude of Relative Error",
                    "Bias of Relative Error": "Bias of Relative Error",
                    "Other Accuracy measure": "Accuracy measure.Other",
                    "Not used Accuracy measure": "Accuracy measure.Not used"
                }
            }
        }
    }

    # Keep a dedicated name for this taxonomy: `new_taxonomy` is a scratch
    # name, while the comparison chunks refer to `usman_tax`.
    usman_tax=new_taxonomy

```{python, eval=FALSE}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness = calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
```


    ```{python}
    import pandas as pd
    import numpy as np
    from sklearn.manifold import TSNE
    from sklearn.preprocessing import LabelEncoder
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D  # Import for 3D plotting
    from transformers import AutoTokenizer, AutoModel
    import torch
    import matplotlib
    # Clear the current figure, then apply the shared look for the plots
    # below (any style name known to matplotlib can be substituted).
    plt.clf()
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams.update({'font.family': 'serif'})

    # Extracting categories sets
    def extract_intermediate_elements(taxonomy, result=None):
        """Collect every key of a nested dict whose value is itself a dict.

        Args:
            taxonomy: Nested dictionary to walk.
            result: Optional set to add the keys to; a new set is created
                when omitted.  A supplied set is modified in place.

        Returns:
            The set holding all dict-valued keys found at any depth.
        """
        found = set() if result is None else result

        # Explicit work list instead of recursion; the result is a set, so
        # the visiting order does not matter.
        pending = [taxonomy]
        while pending:
            current = pending.pop()
            for name, child in current.items():
                if isinstance(child, dict):
                    found.add(name)
                    pending.append(child)

        return found

    # Category (dict-valued key) names of every taxonomy, keyed by paper label.
    sets = {
        'Bajta': extract_intermediate_elements(bajta_tax),
        'Britto_2017': extract_intermediate_elements(britto1_tax),
        'Britto_2016': extract_intermediate_elements(britto2_tax),
        'Dashti': extract_intermediate_elements(dashti_tax),
        'Mendes': extract_intermediate_elements(mendes_tax),
        'Usman': extract_intermediate_elements(usman_tax)
    }

    # Per-paper names bound to the very same set objects.
    bajta_tax_categories = sets['Bajta']
    britto1_tax_categories = sets['Britto_2017']
    britto2_tax_categories = sets['Britto_2016']
    dashti_tax_categories = sets['Dashti']
    mendes_tax_categories = sets['Mendes']
    usman_tax_categories = sets['Usman']

    # Show each set as a one-entry dict: {label: categories}.
    for _paper, _categories in sets.items():
        print({_paper: _categories})


    # Extract characteristics (Execute only one of the two, or characteristics set or categories set)

    def extract_leaf_elements(nested_dict):
        """Return the set of all non-dict values found in a nested dictionary.

        Note that the leaf *values* are collected, not the keys that point
        to them.
        """
        collected = set()
        stack = [nested_dict]
        while stack:
            for child in stack.pop().values():
                if isinstance(child, dict):
                    # Sub-dictionary: descend into it later
                    stack.append(child)
                else:
                    # Anything else is a leaf value
                    collected.add(child)
        return collected

    # Leaf values (characteristics) of every taxonomy, keyed by paper label.
    # NOTE: this rebinds `sets`; run either this part or the category part
    # above, depending on what should be plotted.
    sets = {
        'Bajta': extract_leaf_elements(bajta_tax),
        'Britto_2017': extract_leaf_elements(britto1_tax),
        'Britto_2016': extract_leaf_elements(britto2_tax),
        'Dashti': extract_leaf_elements(dashti_tax),
        'Mendes': extract_leaf_elements(mendes_tax),
        'Usman': extract_leaf_elements(usman_tax)
    }

    # Per-paper names bound to the very same set objects.
    bajta_tax_leaves = sets['Bajta']
    britto1_tax_leaves = sets['Britto_2017']
    britto2_tax_leaves = sets['Britto_2016']
    dashti_tax_leaves = sets['Dashti']
    mendes_tax_leaves = sets['Mendes']
    usman_tax_leaves = sets['Usman']

    # Step 2: Flatten the sets into parallel word / label lists and a dataframe
    _pairs = [(_label, _word) for _label, _members in sets.items() for _word in _members]
    words = [_word for _, _word in _pairs]
    labels = [_label for _label, _ in _pairs]

    df = pd.DataFrame({'Word': words, 'Set': labels})

    # Step 3: Load the pre-trained model and tokenizer unless both names
    # already exist in this namespace
    model_name = "jinaai/jina-embeddings-v3"
    if 'model' in locals() and 'tokenizer' in locals():
        print("Model and tokenizer are already loaded.")
    else:
        print("Loading model and tokenizer...")
        model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Step 4: Get the embeddings for each word
    def get_embeddings(word):
        """Embed one text with the module-level ``tokenizer`` and ``model``.

        The model's ``last_hidden_state`` is averaged over dim 1 (presumably
        the token axis), squeezed, and returned as a NumPy array.
        """
        encoded = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():  # inference only
            hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().numpy()

    embeddings = np.array([get_embeddings(word) for word in df['Word']])

    # Step 5: Perform t-SNE (now in 2D)
    # scikit-learn requires perplexity < n_samples; keep 30 whenever the data
    # allows it and shrink it only for very small inputs.
    tsne = TSNE(n_components=2, perplexity=min(30, len(embeddings) - 1), random_state=5)
    embeddings_2d = tsne.fit_transform(embeddings)

    # Step 6: Convert string labels to numeric labels for coloring
    label_encoder = LabelEncoder()
    numeric_labels = label_encoder.fit_transform(labels)

    # Step 7: Create the 2D scatter plot
    fig, ax = plt.subplots(figsize=(10, 7))

    # Plot the 2D scatter with the numeric labels for colors
    scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                         c=numeric_labels, cmap='Set1', s=100)

    # Annotate each point with the word (small offset so the text does not
    # sit on top of the marker)
    for i, word in enumerate(df['Word']):
        ax.text(embeddings_2d[i, 0] + 0.1, embeddings_2d[i, 1] + 0.1, word, fontsize=9)

    # Step 8: Add labels and title
    ax.set_title("2D t-SNE Visualization of Word Embeddings")
    ax.set_xlabel("t-SNE Dimension 1")
    ax.set_ylabel("t-SNE Dimension 2")

    # Step 9: Move the legend outside of the plot
    # The handle colors are taken from the scatter itself (same colormap and
    # normalization), so every legend entry matches the points it describes.
    # Index i is the numeric code LabelEncoder assigned to legend_labels[i].
    legend_labels = label_encoder.classes_
    handles = [plt.Line2D([0], [0], marker='o', color='w',
                          markerfacecolor=scatter.to_rgba(i), markersize=5)
               for i in range(len(legend_labels))]
    ax.legend(handles, legend_labels, title="Set", loc="center left", bbox_to_anchor=(1.05, 0.5), borderaxespad=0.)

    # Step 10: Show the plot
    plt.tight_layout()  # Ensures proper spacing with the legend outside
    plt.savefig('word_embeddings.png', dpi=300, bbox_inches='tight')
    plt.show()

# K-means Plot

``` {python}
import random
import umap.umap_ as umap
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib.lines import Line2D  # Add this import at the top of your code
# Plot configuration
colorstyle = "Set2"
marker_styles = ['o', '^', 's', 'p', '*', 'D']

# Seed every random number generator used below with the same value
seed = 5
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Clear the current figure and apply the shared look
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid')  # Any available style name works here
plt.rcParams.update({'font.family': 'serif'})


# Step 2: Flatten the sets into parallel word / label lists (words lowercased)
_pairs = [(_label, _word.lower()) for _label, _members in sets.items() for _word in _members]
words = [_word for _, _word in _pairs]
labels = [_label for _label, _ in _pairs]

# Create a dataframe
df = pd.DataFrame({'Word': words, 'Set': labels})

# Step 3: Load the pre-trained model and tokenizer unless both names already
# exist in this namespace
model_name = "jinaai/jina-embeddings-v3"
if 'model' in locals() and 'tokenizer' in locals():
    print("Model and tokenizer are already loaded.")
else:
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 4: Get the embeddings for each word
def get_embeddings(word):
    """Embed one text with the module-level ``tokenizer`` and ``model``.

    The model's ``last_hidden_state`` is averaged over dim 1 (presumably the
    token axis), squeezed, and returned as a NumPy array.
    """
    encoded = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

embeddings = np.array([get_embeddings(word) for word in df['Word']])

# Step 5: Perform 2D UMAP
umap_model = umap.UMAP(n_components=2, random_state=5)
embeddings_2d = umap_model.fit_transform(embeddings)

# Step 6: Create a color map that reflects the set labels
unique_labels = list(df['Set'].unique())  # Get the unique set labels
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in matplotlib
# 3.7 and removed in 3.9; it takes the same (name, lut) arguments.
cmap = plt.get_cmap(colorstyle, len(unique_labels))  # Colormap with one color per label

# Step 7: Run K-means on UMAP embeddings
num_clusters = len(unique_labels)  # Set number of clusters to match unique labels
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=5)
kmeans_labels = kmeans.fit_predict(embeddings_2d)

# Step 8: Name each cluster after the `top_n` words closest to its centroid
top_n = 3  # How many words to display for each cluster
cluster_names = []

for i in range(num_clusters):
    # Full-dimensional embeddings of the words assigned to this cluster
    cluster_indices = np.where(kmeans_labels == i)[0]
    cluster_embeddings = embeddings[cluster_indices]

    # Centroid of the cluster in the original embedding space
    cluster_centroid = np.mean(cluster_embeddings, axis=0).reshape(1, -1)

    # Cosine similarity of the centroid to ALL words (not only this
    # cluster's members), so a name may come from a neighbouring cluster
    similarities = cosine_similarity(cluster_centroid, embeddings).flatten()

    # Indices of the `top_n` most similar words, most similar first
    closest_word_indices = np.argsort(similarities)[-top_n:][::-1]

    # Store the corresponding words as the cluster name
    closest_words = df['Word'].iloc[closest_word_indices].tolist()
    cluster_names.append(closest_words)

# Step 9: Plot the points, shade each K-means cluster and annotate it
# Exactly one figure is created here; a second plt.figure() call would
# leave an empty extra figure behind.
plt.figure(figsize=(10, 7))
color_map = {label: cmap(i) for i, label in enumerate(unique_labels)}

# Marker styles, cycled over the labels
marker_styles = ['o', '^', 's', 'p', '*', 'D']  # Add more marker styles if needed

for i, label in enumerate(unique_labels):
    # Rows of the dataframe (and of embeddings_2d) that belong to this label
    in_set = (df['Set'] == label).to_numpy()

    # Plot with a different marker and color for each label
    plt.scatter(embeddings_2d[in_set, 0],
                embeddings_2d[in_set, 1],
                c=[color_map[label]] * int(in_set.sum()),
                s=80,
                label=label,
                marker=marker_styles[i % len(marker_styles)], alpha=0.6)  # Use modulo to cycle through marker styles


# Draw convex hulls around each cluster and annotate with cluster names
for i in range(num_clusters):
    cluster_points = embeddings_2d[kmeans_labels == i]

    if len(cluster_points) >= 3:  # ConvexHull requires at least 3 points
        hull = ConvexHull(cluster_points)
        hull_points = cluster_points[hull.vertices]
        plt.fill(hull_points[:, 0], hull_points[:, 1], alpha=0.2,
                 color=cmap(i), label=f'Cluster {i+1}')

    # Place the cluster name at the centroid of its 2D points
    cluster_centroid_2d = np.mean(cluster_points, axis=0)
    # One word per line, upper-cased
    cluster_name_text = '\n'.join(cluster_names[i]).upper()

    plt.text(cluster_centroid_2d[0], cluster_centroid_2d[1], cluster_name_text,
             fontsize=8, ha='center', color='black')

# Step 10: Titles and a custom legend showing color and shape for each label
plt.title("2D UMAP Visualization of Word Embeddings with K-means Clusters")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")

legend_elements = [Line2D([0], [0], marker=marker_styles[i % len(marker_styles)], color='w',
                          markerfacecolor=color_map[label], markersize=10, label=label)
                   for i, label in enumerate(unique_labels)]
plt.legend(
    handles=legend_elements,
    title="Literature",
    loc="lower center",
    bbox_to_anchor=(0.5, -0.2),  # Position it just below the plot
    ncol=len(unique_labels),      # Arrange legend items in a single row
    frameon=False                 # Optional: Remove legend box frame
)
# Adjust layout to ensure the legend is not clipped
plt.tight_layout()

# Step 11: Save the plot in high resolution
plt.savefig('word_embeddings_kmeans.png', dpi=600, bbox_inches='tight')

# Show the plot
plt.show()
```

``` {python}
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Clear the current figure and apply the shared look
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid')  # Any available style name works here
plt.rcParams.update({'font.family': 'serif'})

# Lowercased copy of every word set, keyed by the same labels as `sets`
word_sets = {}
for _label, _members in sets.items():
    word_sets[_label] = {_entry.lower() for _entry in _members}

# Load model and tokenizer unless both names already exist in this namespace
model_name = "jinaai/jina-embeddings-v3"
if 'model' in locals() and 'tokenizer' in locals():
    print("Model and tokenizer are already loaded.")
else:
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to get embedding for a word
def get_embedding(word):
    """Embed one text with the module-level ``tokenizer`` and ``model``.

    Returns the mean of ``last_hidden_state`` over dim 1 as a NumPy array.
    The result is not squeezed, so it keeps its leading axis; callers take
    element ``[0]`` to get the vector.
    """
    inputs = tokenizer(word, return_tensors="pt")
    # Inference only: without no_grad every call would record an autograd
    # graph that is thrown away immediately.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Collect (label, word) pairs, then embed every word in that same order
_labelled_words = [(_label, _word) for _label, _members in word_sets.items() for _word in _members]
labels = [_label for _label, _ in _labelled_words]
words = [_word for _, _word in _labelled_words]
embeddings = [get_embedding(_word) for _word in words]

# One row per (word, label); get_embedding keeps a leading axis, hence [0]
embedding_df = pd.DataFrame({
    "Word": words,
    "Label": labels,
    "Embedding": [emb[0] for emb in embeddings]
})

# Words as rows, set labels as columns, embedding vectors as cell values
pivoted_df = embedding_df.pivot(index="Word", columns="Label", values="Embedding")


def _vector_to_text(cell):
    """Render an embedding array as its list string; pass anything else through."""
    if isinstance(cell, np.ndarray):
        return str(cell.tolist())
    return cell


# Convert the vectors to strings for display purposes only
pivoted_df = pivoted_df.applymap(_vector_to_text)

# Display the pivoted DataFrame
print(pivoted_df)
```

# 3D PLOT

``` {python}
import plotly.express as px
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
import umap.umap_ as umap
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Clear the current figure and apply the shared look
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid')  # Any available style name works here
plt.rcParams.update({'font.family': 'serif'})

# Step 2: Flatten the sets into parallel word / label lists (sets is already defined)
_pairs = [(_label, _word) for _label, _members in sets.items() for _word in _members]
words = [_word for _, _word in _pairs]
labels = [_label for _label, _ in _pairs]

# Create a dataframe
df = pd.DataFrame({'Word': words, 'Set': labels})

# Step 3: Load the pre-trained model and tokenizer unless both names already
# exist in this namespace
model_name = "jinaai/jina-embeddings-v3"
if 'model' in locals() and 'tokenizer' in locals():
    print("Model and tokenizer are already loaded.")
else:
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 4: Get the embeddings for each word
def get_embeddings(word):
    """Embed one text with the module-level ``tokenizer`` and ``model``.

    The model's ``last_hidden_state`` is averaged over dim 1 (presumably the
    token axis), squeezed, and returned as a NumPy array.
    """
    encoded = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

embeddings = np.array([get_embeddings(word) for word in df['Word']])

# Step 5: Perform 3D UMAP (with 3 components)
umap_model = umap.UMAP(n_components=3, random_state=5)
embeddings_3d = umap_model.fit_transform(embeddings)

# Step 6: Convert string labels to numeric labels
# (not used by the Plotly figure below, which colors by the string labels)
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(labels)

# Step 7: Create the interactive 3D plot with Plotly
fig = px.scatter_3d(df, x=embeddings_3d[:, 0], y=embeddings_3d[:, 1], z=embeddings_3d[:, 2],
                    color=labels, text=words,
                    labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2', 'z': 'UMAP Dimension 3'},
                    title="3D UMAP Visualization of Word Embeddings")

# Customize the layout for better viewing
fig.update_traces(marker=dict(size=5, opacity=0.8), selector=dict(mode='markers+text'))
fig.update_layout(scene=dict(xaxis_title='UMAP Dimension 1',
                             yaxis_title='UMAP Dimension 2',
                             zaxis_title='UMAP Dimension 3'))

# Save the Plotly figure itself.  plt.savefig would only write the current
# (empty) matplotlib figure, because this plot is not drawn with matplotlib.
# Static export needs Plotly's image-export backend (kaleido); if it is not
# available, report that and still show the interactive plot.
try:
    fig.write_image('3d_word_embedding.png', scale=3)
except (ValueError, RuntimeError) as export_error:
    print(f"Could not save 3d_word_embedding.png: {export_error}")

# Show the interactive plot
fig.show()
```

# Another table showing the common words between papers (a bit harder to read). This is the defining chunk: it must be executed before the following chunks of code

``` {python}
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Clear the current figure and apply the shared look
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid')  # Any available style name works here
plt.rcParams.update({'font.family': 'serif'})

# The word sets come from `sets`, defined in an earlier chunk.


# Load the pre-trained model and tokenizer unless both names already exist
# in this namespace
model_name = "jinaai/jina-embeddings-v3"
if 'model' in locals() and 'tokenizer' in locals():
    print("Model and tokenizer are already loaded.")
else:
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to normalize text to lowercase
def normalize_words(words):
    """Return the distinct lowercase forms of ``words`` as a set."""
    lowered = set()
    for term in words:
        lowered.add(term.lower())
    return lowered

# Lowercased version of every word set, under the same set names
normalized_sets = {}
for set_name, word_set in sets.items():
    normalized_sets[set_name] = normalize_words(word_set)

# Function to get embeddings for a list of words
def get_embeddings(words):
    """Embed a collection of texts in one padded batch.

    Args:
        words: Iterable of strings; it is converted to a list, so the rows
            of the result follow its iteration order.

    Returns:
        A torch tensor with one mean-pooled embedding per input text.
    """
    inputs = tokenizer(list(words), padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    # Masked mean pooling: texts shorter than the longest one in the batch
    # are padded, and a plain mean over dim 1 would average those padding
    # positions in, making an embedding depend on its batch mates.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero
    return (hidden * mask).sum(dim=1) / token_counts

# Embeddings of each normalized set, keyed by set name
embeddings = {
    set_name: get_embeddings(word_set)
    for set_name, word_set in normalized_sets.items()
}

# Create a function to calculate the semantic similarity between sets
def compute_similarity(set1, set2):
    """Return the pairwise cosine-similarity matrix between two named sets.

    ``set1`` and ``set2`` are keys of the module-level ``embeddings`` dict.
    Entry ``[i, j]`` of the result compares embedding ``i`` of ``set1`` with
    embedding ``j`` of ``set2``.
    """
    return cosine_similarity(embeddings[set1], embeddings[set2])

# Similarity matrix for every ordered pair of distinct sets
similarity_results = {
    (set1, set2): compute_similarity(set1, set2)
    for set1 in normalized_sets.keys()
    for set2 in normalized_sets.keys()
    if set1 != set2
}

# Long-format table: one record per word pair and its cosine similarity.
# Row i / column j of a matrix follow the iteration order of the sets,
# which is the order in which their words were embedded.
similarity_table = [
    {
        "Set 1": set1,
        "Word 1": word1,
        "Set 2": set2,
        "Word 2": word2,
        "Cosine Similarity": sim_matrix[i, j]
    }
    for (set1, set2), sim_matrix in similarity_results.items()
    for i, word1 in enumerate(normalized_sets[set1])
    for j, word2 in enumerate(normalized_sets[set2])
]

# Convert the table to a DataFrame for better display
similarity_df = pd.DataFrame(similarity_table)

# Keep only the pairs with a strictly positive cosine similarity
similarity_df_filtered = similarity_df[similarity_df['Cosine Similarity'] > 0]

# Empty set-by-set table that will hold the retained word pairs
common_words_table = pd.DataFrame(index=sets.keys(), columns=sets.keys(), dtype=object)

# Append every retained pair to the cell of its (set 1, set 2) combination
for _, row in similarity_df_filtered.iterrows():
    source_set, target_set = row['Set 1'], row['Set 2']
    pair_text = f"{row['Word 1']} - {row['Word 2']}"

    existing = common_words_table.at[source_set, target_set]
    if pd.isna(existing):
        # First pair for this cell
        common_words_table.at[source_set, target_set] = pair_text
    else:
        common_words_table.at[source_set, target_set] = existing + f", {pair_text}"

# Display the table showing the common word pairs
print(common_words_table)
```

# Making a color table

``` {python}
import pandas as pd
from pandas.io.formats.style import Styler

# Step 1: Define the color map for each set
set_colors = {
    "Bajta": "yellow",
    "Britto_2016": "blue",
    "Britto_2017": "green",
    "Dashti": "red",
    "Mendes": "purple",
    "Usman": "orange"
}

# HTML legend: a bold heading followed by one colored bullet per set
legend_html = "<div style='font-weight: bold; margin-bottom: 10px;'>Legend:</div>" + "".join(
    f"<div><span style='color:{color};'>●</span> {set_name}</div>"
    for set_name, color in set_colors.items()
)


# Step 2: One row per set, with the set's own words and the related words
common_words_table = pd.DataFrame(index=sets.keys(), columns=["Words", "Relations"])

# Step 3: Append every retained pair to the row of its first set
for _, row in similarity_df_filtered.iterrows():
    set1, word1 = row['Set 1'], row['Word 1']
    set2, word2 = row['Set 2'], row['Word 2']

    # Wrap each word in a <span> carrying the color of its set
    # (word1_colored is built but the table stores the plain word1)
    word1_colored = f'<span style="color:{set_colors[set1]}">{word1}</span>'
    word2_colored = f'<span style="color:{set_colors[set2]}">{word2}</span>'

    if pd.isna(common_words_table.at[set1, 'Words']):
        # First pair for this set
        common_words_table.at[set1, 'Words'] = word1
        common_words_table.at[set1, 'Relations'] = word2_colored
    else:
        common_words_table.at[set1, 'Words'] += f", {word1}"
        common_words_table.at[set1, 'Relations'] += f", {word2_colored}"
# Step 4: Use `map` to apply the coloring function
def colorize_words(word):
    """Return the CSS color declaration for a cell that starts with a colored span.

    Args:
        word: Cell value.  Only strings that begin with
            ``<span style="color:...">`` produce a style.

    Returns:
        ``"color: <value>"`` using the color of the first span, or ``''``
        for any other value (including non-strings).
    """
    import re  # local import: keeps this helper self-contained

    if not isinstance(word, str):
        return ''
    # Capture only the color value.  Splitting the whole cell on ':' would
    # also return the rest of the markup (`red">word</span>...`), which is
    # not valid CSS.
    found = re.match(r'<span style="color:([^";]+)', word)
    return f"color: {found.group(1).strip()}" if found else ''

# Step 5: Apply the styling to the DataFrame
styled_table = common_words_table.style.applymap(colorize_words, subset=["Words", "Relations"])

# Display the styled table (if using Jupyter or IPython environment)
styled_table


# Render the styled table to HTML and put the legend in front of it
html_output = styled_table.to_html()
html_output = legend_html + "<br>" + html_output

# Specify the filename where you want to save the table
file_path = 'colored_table.html'

# Write the HTML content to the file.  The legend contains a non-ASCII
# bullet ("●"), so the encoding is fixed to UTF-8 instead of relying on the
# platform default, which cannot always encode that character.
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(html_output)

print(f"Styled table saved to {file_path}")
```

# New table

``` {python}
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles.colors import Color
from bs4 import BeautifulSoup

from collections import defaultdict

# Hex colour used for each taxonomy (set)
set_colors = {
    "Bajta": "#66c2a5",
    "Britto_2016": "#fc8d62",
    "Britto_2017": "#8da0cb",
    "Dashti": "#e78ac3",
    "Mendes": "#a6d854",
    "Usman": "#ff7f00",
}


# word -> names of the sets in which the word took part in a sufficiently similar pair
grouped_words = defaultdict(set)

# Quick look at the distinct "Word 1" values that belong to the Bajta set
similarity_df_filtered.loc[similarity_df_filtered["Set 1"] == "Bajta", "Word 1"].unique()

# Minimum cosine similarity a pair needs in order to be kept
similarity=0.9
for index, row in similarity_df_filtered.iterrows():
    # Skip pairs that do not reach the threshold
    if not (row['Cosine Similarity'] >= similarity):
        continue

    word1, word2 = row['Word 1'], row['Word 2']
    set1, set2 = row['Set 1'], row['Set 2']

    # Record, for each word of the pair, the set it comes from
    grouped_words[word1].add(set1)
    grouped_words[word2].add(set2)

import re  # used below to title-case only the visible text of each <span>

# Prepare the data for the final table: one record per word, holding the word
# coloured once per set it belongs to and the coloured names of those sets
table_rows = []
for word, word_sets in grouped_words.items():
    # Sort the set names: a Python set has no stable order, and the "Taxonomies"
    # string is the groupby key below, so equal memberships must yield equal strings.
    # (The loop variable is deliberately not called `sets`, which is the name of the
    # global dictionary of word sets used by other chunks.)
    ordered_sets = sorted(word_sets)

    color_coded_words = [
        f'<span style="color:{set_colors[set_name]}">{word}</span>'
        for set_name in ordered_sets
    ]
    sets_str = ", ".join(
        f'<span style="color:{set_colors[set_name]}">{set_name}</span>'
        for set_name in ordered_sets
    )
    table_rows.append({
        "Categories": ", ".join(color_coded_words),  # Word repeated once per set colour
        "Taxonomies": sets_str  # Color-coded sets
    })

# Convert to a DataFrame
final_table = pd.DataFrame(table_rows)


def _title_case_span_text(span_html):
    """Title-case the text between tags and leave the markup itself untouched."""
    return re.sub(r'>([^<>]*)<', lambda match: f'>{match.group(1).title()}<', span_html)


# Collapse the table by the "Taxonomies" column, merging words that share the same sets.
# Each word becomes its own centred <div>; entries are sorted on their markup string.
collapsed_table = final_table.groupby('Taxonomies', as_index=False).agg({
    'Categories': lambda x: ''.join(
        f'<div style="text-align:center;">{_title_case_span_text(word)}</div>'
        for word in sorted(x.str.cat(sep=', ').split(', '))
    )
})


# Title placed above the table; note that it contains the non-ASCII sign "≥"
table_title = f"<h1>Grouped Words with Color-Coded Taxonomies (Similarity ≥ {similarity})</h1>"

collapsed_html_output_table = collapsed_table.to_html(escape=False)  # escape=False allows HTML in the table

# Declare the charset so a browser decodes the UTF-8 file (and its "≥") correctly
collapsed_html_output = '<meta charset="utf-8">' + table_title + collapsed_html_output_table


# Save the collapsed HTML output to a file.
# The encoding is explicit because the platform default (e.g. cp1252) may not be able
# to encode "≥", in which case the write would raise UnicodeEncodeError.
collapsed_file_path = 'collapsed_grouped_words_table_colored.html'
with open(collapsed_file_path, 'w', encoding='utf-8') as file:
    file.write(collapsed_html_output)

print(f"Collapsed styled table saved to {collapsed_file_path}")

# Save the Excel file with colors
excel_file_path = 'collapsed_grouped_words_table_colored.xlsx'
wb = Workbook()
ws = wb.active
ws.title = "Colored Table"

# Write the header (column order of collapsed_table)
header = list(collapsed_table.columns)
ws.append(header)


def _color_from_style(style):
    """Return the value of the `color` declaration of an inline style, or None."""
    for declaration in style.split(';'):
        prop, _, value = declaration.partition(':')
        # Compare case-insensitively: the markup may have been title-cased upstream
        if prop.strip().lower() == 'color':
            return value.strip() or None
    return None


def _excel_color(css_color):
    """Turn '#rrggbb' into the bare hex form openpyxl expects; None if not a hex colour."""
    if not css_color:
        return None
    hex_digits = css_color.strip().lstrip('#')
    if len(hex_digits) == 6 and all(ch in '0123456789abcdefABCDEF' for ch in hex_digits):
        return hex_digits.upper()
    return None


# Write the rows with formatting
for row in collapsed_table.itertuples(index=False):
    # Each word sits in its own <div>; its colour is on the <span> nested inside
    soup = BeautifulSoup(row.Categories, "html.parser")
    formatted_words = []
    for div in soup.find_all('div'):
        span = div.find('span')
        styled_tag = span if span is not None else div
        formatted_words.append((div.get_text().strip(), _color_from_style(styled_tag.get('style', ''))))

    # Taxonomies as plain text (markup removed)
    taxonomies_text = BeautifulSoup(row.Taxonomies, "html.parser").get_text()

    # Write every value under its own header instead of relying on a fixed position
    row_num = ws.max_row + 1
    for col_num, column_name in enumerate(header, start=1):
        cell = ws.cell(row=row_num, column=col_num)
        if column_name == "Categories":
            # Keep all words (one per line) rather than only the last one
            cell.value = "\n".join(word for word, _ in formatted_words)
            cell.alignment = Alignment(horizontal="center", wrap_text=True)
            # A cell has a single font, so a colour is applied only when all words share it
            cell_colors = {
                excel_color
                for excel_color in (_excel_color(color) for _, color in formatted_words)
                if excel_color
            }
            if len(cell_colors) == 1:
                cell.font = Font(color=cell_colors.pop())
        elif column_name == "Taxonomies":
            cell.value = taxonomies_text
        else:
            cell.value = getattr(row, column_name, None)

# Save Excel file
wb.save(excel_file_path)
print(f"Excel file saved to {excel_file_path}")
```

# Colorless table

``` {python}
import pandas as pd
from openpyxl import Workbook

# One record per row of the similarity table: the capitalised first word and the
# taxonomy (set) it belongs to; no similarity threshold and no colour coding here
table_rows = [
    {"Categories": row['Word 1'].capitalize(), "Taxonomies": row['Set 1']}
    for _, row in similarity_df_filtered.iterrows()
]

# Convert to a DataFrame
final_table = pd.DataFrame(table_rows)


def _join_unique_sorted(words):
    """Drop duplicate words, sort them alphabetically and join them with ', '."""
    return ', '.join(sorted(set(words)))


# Collapse to one row per taxonomy, with its distinct words in a single string
collapsed_table = final_table.groupby('Taxonomies', as_index=False).agg(
    {'Categories': _join_unique_sorted}
)

# Now we need to group the words in "Categories" by their starting letter
def group_by_first_letter(categories):
    """
    Group a ', '-separated list of words by their upper-cased first letter.

    Parameters:
        categories: String of words separated by ', '.

    Returns:
        An HTML string with one ``<b>LETTER:</b> word, word<br><br>`` section per
        letter, letters and words both in sorted order. Empty entries are skipped,
        so an empty input yields an empty string.
    """
    grouped = {}
    for word in categories.split(', '):
        if not word:
            # ''.split(', ') yields [''], which has no first letter to index
            continue
        grouped.setdefault(word[0].upper(), []).append(word)

    # One bold-letter section per group, each followed by a blank line
    return "".join(
        f"<b>{letter}:</b> {', '.join(sorted(words))}<br><br>"
        for letter, words in sorted(grouped.items())
    )

# Apply the grouping function to the "Categories" column
collapsed_table['Categories'] = collapsed_table['Categories'].apply(group_by_first_letter)

# Render the table; escape=False keeps the <b>/<br> markup produced above as HTML
collapsed_html_output_table = collapsed_table.to_html(escape=False)

# Save the collapsed HTML output to a file; UTF-8 is requested explicitly so the
# result does not depend on the platform's default encoding
collapsed_file_path = 'collapsed_grouped_words_table_grouped_bold.html'
with open(collapsed_file_path, 'w', encoding='utf-8') as file:
    file.write(collapsed_html_output_table)

print(f"Collapsed grouped table saved to {collapsed_file_path}")

# Save the Excel file (plain, no color formatting)
excel_file_path = 'collapsed_grouped_words_table_grouped_bold.xlsx'
wb = Workbook()
ws = wb.active
ws.title = "Grouped Table"

# Write the header (column order of collapsed_table)
header = list(collapsed_table.columns)
ws.append(header)

# Write the rows in the same column order as the header. The groupby above puts
# "Taxonomies" first, so a hard-coded [Categories, Taxonomies] order would place
# every value under the wrong heading.
# NOTE(review): "Categories" still holds the <b>/<br> markup built for the HTML table.
for row in collapsed_table.itertuples(index=False):
    ws_row = list(row)
    ws.append(ws_row)

# Save Excel file
wb.save(excel_file_path)
print(f"Excel file saved to {excel_file_path}")
```

# Improving the color table

```{python, eval=FALSE}
import pandas as pd
import numpy as np
from matplotlib import cm

# Define the colormap
viridis_colormap = cm.Set2(np.linspace(0, 1, 6))  # Get 6 distinct colors from the colormap

# Create a color mapping based on the viridis colormap
color_mapping = {
    "Bajta": f'rgb({int(viridis_colormap[0, 0]*255)}, {int(viridis_colormap[0, 1]*255)}, {int(viridis_colormap[0, 2]*255)})',
    "Britto_2016": f'rgb({int(viridis_colormap[1, 0]*255)}, {int(viridis_colormap[1, 1]*255)}, {int(viridis_colormap[1, 2]*255)})',
    "Britto_2017": f'rgb({int(viridis_colormap[2, 0]*255)}, {int(viridis_colormap[2, 1]*255)}, {int(viridis_colormap[2, 2]*255)})',
    "Dashti": f'rgb({int(viridis_colormap[3, 0]*255)}, {int(viridis_colormap[3, 1]*255)}, {int(viridis_colormap[3, 2]*255)})',
    "Mendes": f'rgb({int(viridis_colormap[4, 0]*255)}, {int(viridis_colormap[4, 1]*255)}, {int(viridis_colormap[4, 2]*255)})',
    "Usman": f'rgb({int(viridis_colormap[5, 0]*255)}, {int(viridis_colormap[5, 1]*255)}, {int(viridis_colormap[5, 2]*255)})'
}

# Create an empty DataFrame to store the new table
colored_words_table = pd.DataFrame(index=sets.keys(), columns=["Words"])

# Iterate through the filtered similarity DataFrame to populate the colored table
for index, row in similarity_df_filtered.iterrows():
    set1 = row['Set 1']
    word1 = row['Word 1']
    set2 = row['Set 2']
    word2 = row['Word 2']

    # Apply colors to words
    colored_word1 = f'<span style="color: {color_mapping[set1]}">{word1}</span>'
    colored_word2 = f'<span style="color: {color_mapping[set2]}">{word2}</span>'

    # For each set, append the related words in colored format
    for set_name in sets.keys():
        if set_name == set1:
            current_words = colored_words_table.at[set_name, "Words"]
            new_entry = f"{colored_word1} --> {colored_word2}"
            if pd.isna(current_words):
                colored_words_table.at[set_name, "Words"] = new_entry
            else:
                # Add the new entry and sort all entries alphabetically
                current_entries = current_words.split("<br>")
                current_entries.append(new_entry)
                sorted_entries = sorted(current_entries, key=lambda x: x.lower())  # Sort alphabetically (case insensitive)
                colored_words_table.at[set_name, "Words"] = "<br>".join(sorted_entries)

# Generate the legend HTML
legend_html = "<strong>Legend:</strong><br>"
for set_name, color in color_mapping.items():
    legend_html += f'<span style="background-color: {color}; padding: 5px; color: white; margin-right: 10px;">{set_name}</span>'
legend_html += "<br>"

# Convert the DataFrame to an HTML string, preserving the inline style
html_output = legend_html + colored_words_table.to_html(escape=False)

# Specify the filename where you want to save the table
file_path = 'colored_table_with_breaks_and_sorted.html'

# Write the HTML content to the file
with open(file_path, 'w') as file:
    file.write(html_output)

print(f"Styled and sorted table saved to {file_path}")
```



```{python, eval=FALSE}
import pandas as pd

# Define the color for each set
set_colors = {
    "Bajta": "yellow",
    "Britto_2016": "blue",
    "Britto_2017": "green",
    "Dashti": "red",
    "Mendes": "purple",
    "Usman": "orange"
}

# Define the word pairs between sets (example, based on your actual word pairs)
# The format will be like (Set name, Word, Related Set, Related Word)
word_pairs = [
    ("Bajta", "word1", "Britto_2016", "word2"),
    ("Bajta", "word3", "Britto_2017", "word4"),
    ("Britto_2016", "word5", "Dashti", "word6"),
    ("Mendes", "word7", "Usman", "word8"),
    # Add more word pairs here as needed
]

# Create a dictionary to hold the word pairs for each set
word_dict = {set_name: [] for set_name in set_colors.keys()}

# Fill the word dictionary with word pairs
for set1, word1, set2, word2 in word_pairs:
    word_dict[set1].append(f"<span style='color:{set_colors[set1]};'>{word1}</span>")
    word_dict[set2].append(f"<span style='color:{set_colors[set2]};'>{word2}</span>")

# Create a DataFrame to store the table data
# Each row will represent a set and the column will contain the words in that set
table_data = []

# For each set, add the words it contains and their related words from other sets
for set_name, words in word_dict.items():
    related_words = ', '.join(words)
    table_data.append([set_name, related_words])

# Convert the data to a DataFrame
df = pd.DataFrame(table_data, columns=["Set", "Related Words"])

# Function to style the table (custom colorize applied already in the word pairs)
def colorize_table(val):
    return val  # No extra styling is needed as the words are already colored

# Apply the style
styled_table = df.style.applymap(colorize_table)

# Add the legend at the top (HTML)
legend_html = "<div style='font-weight: bold; margin-bottom: 10px;'>Legend:</div>"
for set_name, color in set_colors.items():
    legend_html += f"<div><span style='color:{color};'>●</span> {set_name}</div>"

# Convert the styled table to HTML (use to_html instead of render)
html_output = styled_table.to_html()

# Combine the legend and the table
full_html_output = legend_html + "<br>" + html_output

# Specify the filename where you want to save the table
file_path = 'colored_table_with_legend.html'

# Write the combined HTML (legend + table) to the file
with open(file_path, 'w') as file:
    file.write(full_html_output)

print(f"Styled table with legend saved to {file_path}")
```


# t-SNE

``` {python}
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from adjustText import adjust_text
import numpy as np
import seaborn as sns

# Word sets (one per reviewed taxonomy) used for the embedding plots below.
# Every value is a Python set, so repeated entries inside a literal collapse to one.
# NOTE(review): the key 'Dasthi' is spelled 'Dashti' in the colour maps of other
# chunks - confirm which spelling the rest of the analysis expects before renaming.
sets = {
    'Bajta': {"Agile", "Analysis", "Availability", "Baseline comparison", "Bidding", "CBR", "CMMI", "COCOMO", "Commissioning", "Conceptualization", "Delphi", "Detail planning", "Design", "Distant onshore", "Expert judgment", "Estimated value", "Execution", "Effort hours", "Feasibility study", "Finance", "Fuzzy similarity", "GA", "Group-based estimation", "Healthcare", "Hardware", "Implementation", "Individual", "Machine learning", "Maintainability", "Maintenance", "Near offshore", "Non-machine learning", "Not considered", "Number of team members", "Performance", "Portfolio", "Preliminary planning", "Reliability", "Research & development", "Risk", "Security", "Sensitivity analysis", "Size report", "Socio-cultural distance", "Statistical analysis", "Staff/cost", "System investigation", "Temporal distance", "Testing", "Value", "Variation reduction"},
    'Britto_2017': {"Accessibility level", "Adaptation complexity", "Anchor count", "Architecture", "Association center slot count", "Association slot size", "Attribute count", "Authoring tool type", "Availability level", "Class complexity", "Class coupling", "Client script count", "Cluster count", "Cluster node size", "Cluster slot count", "Cohesion", "Cohesion complexity", "Collection center slot count", "Collection slot size", "Comment count", "Communication level", "Compactness", "Component complexity", "Component count", "Component granularity level", "Component slot count", "Concern coupling", "Concern module count", "Concern operation count", "Concurrency level", "Connectivity density", "Control flow complexity", "Cyclomatic complexity", "Data Web points", "Data flow complexity", "Data usage complexity", "Database size", "Deployment platform experience level", "Design volatility", "Development restriction", "Difficulty level", "Diffusion cut count", "Documentation level", "Domain experience level", "Entity count", "Experience level", "Feature count", "Flexibility level", "Focus factor", "High feature count", "IT literacy", "In-house experience", "Indifferent concern count", "Information slot count", "Infrastructure", "Inner/sub concern count", "Innovation level", "Input complexity", "Installability level", "Integration with legacy systems", "Interface complexity", "International Function Point Users Group", "Layout complexity", "Lessons learned repository", "Lines of code", "Link count", "Low feature count", "Maintainability level", "Mapped workflows", "Media allocation", "Media count", "Media duration", "Memory efficiency level", "Metrics program", "Model association complexity", "Model collection complexity", "Model link complexity", "Model node size", "Model slot size", "Modularity level", "Module attribute count", "Module count", "Module point cut count", "Motivation level", "New Web page count", "New complexity", "New media count", "Node count", "Node slot size", "Novelty level", "Number of programming languages", "Number of projects in parallel", "OO experience level", "Object-Oriented Function Points", "Operation count", "Operational mode", "Output complexity", "Page complexity", "Personality", "Platform support level", "Platform volatility level", "Portability level", "Process efficiency level", "Processing requirements", "Productivity level", "Program count", "Programming language experience level", "Project management level", "Publishing model unit count", "Publishing unit count", "Quality level", "Rapid app development", "Readability level", "Reliability level", "Requirements clarity level", "Requirements novelty level", "Requirements volatility level", "Resource level", "Reusability level", "Reused comment count", "Reused component count", "Reused high feature count", "Reused lines of code", "Reused low feature count", "Reused media allocation", "Reused media count", "Reused program count", "Risk level", "Robustness level", "SPI program", "Scalability level", "Section count", "Security level", "Segment count", "Semantic association count", "Server script count", "Slot count", "Slot granularity level", "Software development experience", "Software reuse", "Stability level", "Statement count", "Storage constraint", "Structure", "Team capability", "Team size", "Technical factors", "Testability level", "Time efficiency level", "Time restriction", "Tool experience level", "Total complexity", "Trainability level", "Type", "Usability level", "Use case count", "Web objects", "Web page allocation", "Web page count", "Work Team level"},
    'Britto_2016': {"Centralized", "distributed", "Early", "Estimator", "Early & Late", "Estimator & Provider", "geographic distance", "geographic distance", "late", "legal entity", "location", "provider", "semi-distributed", "temporal distance", "temporal distance"},
    'Dasthi': {"ANN", "Analogy Base", "COCOMO", "Evolutionary", "Expert Judgment", "FUZZY", "SEER-SEM", "SLIM", "Swarm"},
    'Mendes': {"Absolute", "both", "complexity", "functionality", "Directly", "Early size metric", "Empirically", "indirectly", "interval", "Length", "late size metric", "media", "none", "Nominal", "nonspecific", "ordinal", "other", "Problem oriented metric", "program/script", "ratio", "solution oriented metric", "Specific", "theoretically", "Web application", "Web hypermedia application", "Web software application"},
    'Usman': {"Analysis", "all", "analogy", "availability", "bidding", "Close Onshore", "Co-located", "Communications industry", "Considered", "crystal", "customized XP", "customized scrum", "daily", "design", "distribution", "education", "expert judgement", "DSDM", "Distant Onshore", "Estimate value(s)", "FDD", "Far Offshore", "financial", "function points", "Hours/days", "health", "ideal hours", "implementation", "kanban", "maintainability", "maintenance", "manufacturing", "MMRE", "MdMRE", "Near Offshore", "No. of team members", "not applicable", "not considered", "not used", "Other", "Performance", "Planning poker", "Point", "pair days", "Release", "reliability", "retail/wholesale", "Single", "scrum", "security", "sprint", "Story points", "testing", "three point", "task", "transportation", "UC points", "User story", "Value", "XP"}
}

# Run every word set through normalize_words (a helper defined outside this chunk)
normalized_sets = {name: normalize_words(members) for name, members in sets.items()}


# Per-set embeddings, the flat list of all words, and a word -> set lookup
embeddings, all_words, word_to_set = {}, [], {}
for set_name, word_set in normalized_sets.items():
    embeddings[set_name] = get_embeddings(word_set)
    all_words += list(word_set)
    # A word that occurs in several sets stays mapped to the last set processed
    word_to_set.update(dict.fromkeys(word_set, set_name))

# Stack the per-set embeddings in the order in which the sets were processed
all_embeddings = torch.cat(list(embeddings.values()), dim=0)


# One "Set2" palette colour per set, then the colour of every word
palette = sns.color_palette("Set2")
set_colors = {set_name: palette[i] for i, set_name in enumerate(sets)}
word_colors = [set_colors[word_to_set[word]] for word in all_words]

# Project the embeddings onto two dimensions with t-SNE (fixed seed)
tsne = TSNE(n_components=2, random_state=5)
reduced_embeddings = tsne.fit_transform(all_embeddings)

# Canvas for the 2D scatter plot
plt.figure(figsize=(16, 12))

# word -> Text artist of its label; also tells which words are already labelled
labeled_words = {}

# One dot per entry of all_words, coloured by set; each distinct word is labelled once
for i, word in enumerate(all_words):
    x_pos, y_pos = reduced_embeddings[i, 0], reduced_embeddings[i, 1]
    plt.scatter(x_pos, y_pos, c=[set_colors[word_to_set[word]]], s=50, alpha=0.6)

    # Later occurrences of a word get a dot but no second label
    if word in labeled_words:
        continue

    # Words occurring more than once in all_words are labelled in red
    color = 'red' if all_words.count(word) > 1 else 'black'
    text = plt.text(x_pos, y_pos, word.upper(), fontsize=5, color=color)
    labeled_words[word] = text

# Let adjustText move the labels apart to reduce overlap
adjust_text(
    list(labeled_words.values()),
    only_move={'points': 'xy'},
    force_text=0.75,
    expand_text=(1.5, 1.5),
)

# Legend 1: one round marker per set
handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=set_color, markersize=10)
    for set_color in set_colors.values()
]
labels = list(sets.keys())
legend1 = plt.legend(
    handles=handles, labels=labels, title="Literature",
    loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=6,
)

# Legend 2: explains the red labels
duplicate_legend = plt.Line2D([0], [1], color='red', lw=2)
legend2 = plt.legend(
    [duplicate_legend], ["In red: duplicate words"],
    loc='upper center', bbox_to_anchor=(0.5, -0.12), frameon=False,
)

# Creating legend2 replaced legend1 on the axes, so add legend1 back
plt.gca().add_artist(legend1)

plt.title("t-SNE Visualization of Word Embeddings")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")

# Write the figure to disk, then show it
plt.savefig('tsne_word_embeddings.png', dpi=600, bbox_inches='tight')
plt.show()
```

``` {python}
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from adjustText import adjust_text
import numpy as np
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  # Import 3D plotting tools

# Per-set embeddings, the flat list of all words, and a word -> set lookup
embeddings = {}
all_words = []
word_to_set = {}
for set_name, word_set in normalized_sets.items():
    embeddings[set_name] = get_embeddings(word_set)
    for word in word_set:
        all_words.append(word)
        # A word that occurs in several sets stays mapped to the last set processed
        word_to_set[word] = set_name

# Stack the per-set embeddings in the order in which the sets were processed
all_embeddings = torch.cat(list(embeddings.values()), dim=0)

# One "Set2" palette colour per set, then the colour of every word
palette = sns.color_palette("Set2")
set_colors = {set_name: palette[i] for i, set_name in enumerate(sets)}
word_colors = [set_colors[word_to_set[word]] for word in all_words]

# Project the embeddings onto three dimensions with t-SNE (fixed seed)
tsne = TSNE(n_components=3, random_state=5)
reduced_embeddings = tsne.fit_transform(all_embeddings)

# Canvas with a single 3D axes
fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(111, projection='3d')

# Words that already received a label
labeled_words = {}

# One dot per entry of all_words, coloured by set; each distinct word is labelled once
for i, word in enumerate(all_words):
    x_pos = reduced_embeddings[i, 0]
    y_pos = reduced_embeddings[i, 1]
    z_pos = reduced_embeddings[i, 2]
    ax.scatter(x_pos, y_pos, z_pos, c=[set_colors[word_to_set[word]]], s=50, alpha=0.6)

    # Later occurrences of a word get a dot but no second label
    if word in labeled_words:
        continue

    # Words occurring more than once in all_words are labelled in red
    color = 'red' if all_words.count(word) > 1 else 'black'
    ax.text(x_pos, y_pos, z_pos, word.upper(), fontsize=5, color=color)
    labeled_words[word] = True

# No overlap adjustment is done here: the labels stay where ax.text placed them.

# Legend 1: one round marker per set
handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=set_color, markersize=10)
    for set_color in set_colors.values()
]
labels = list(sets.keys())
legend1 = plt.legend(
    handles=handles, labels=labels, title="Literature",
    loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=6,
)

# Legend 2: explains the red labels
duplicate_legend = plt.Line2D([0], [1], color='red', lw=2)
legend2 = plt.legend(
    [duplicate_legend], ["In red: duplicate words"],
    loc='upper center', bbox_to_anchor=(0.5, -0.12), frameon=False,
)

# Creating legend2 replaced legend1 on the axes, so add legend1 back
plt.gca().add_artist(legend1)

ax.set_title("t-SNE Visualization of Word Embeddings in 3D")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")
ax.set_zlabel("t-SNE Component 3")

# Write the figure to disk, then show it
plt.savefig('tsne_word_embeddings_3d.png', dpi=600, bbox_inches='tight')
plt.show()
```

# Same as before but UMAP

``` {python}
import matplotlib.pyplot as plt
from adjustText import adjust_text
import numpy as np
import umap.umap_ as umap

# Project the embeddings onto two dimensions with UMAP (fixed seed)
umap_model = umap.UMAP(n_components=2, random_state=5)
reduced_embeddings = umap_model.fit_transform(all_embeddings)

# Canvas for the 2D scatter plot
plt.figure(figsize=(16, 12))

# word -> Text artist of its label; also tells which words are already labelled
labeled_words = {}

# One dot per entry of all_words, coloured by set; each distinct word is labelled once
for i, word in enumerate(all_words):
    x_pos, y_pos = reduced_embeddings[i, 0], reduced_embeddings[i, 1]
    plt.scatter(x_pos, y_pos, c=[set_colors[word_to_set[word]]], s=50, alpha=0.6)

    # Later occurrences of a word get a dot but no second label
    if word in labeled_words:
        continue

    # Words occurring more than once in all_words are labelled in red
    color = 'red' if all_words.count(word) > 1 else 'black'
    text = plt.text(x_pos, y_pos, word.upper(), fontsize=5, color=color)
    labeled_words[word] = text

# Let adjustText move the labels apart to reduce overlap
adjust_text(
    list(labeled_words.values()),
    only_move={'points': 'xy'},
    force_text=0.75,
    expand_text=(1.5, 1.5),
)

# Legend 1: one round marker per set
handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=set_color, markersize=10)
    for set_color in set_colors.values()
]
labels = list(sets.keys())
legend1 = plt.legend(
    handles=handles, labels=labels, title="Literature",
    loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=6,
)

# Legend 2: explains the red labels
duplicate_legend = plt.Line2D([0], [1], color='red', lw=2)
legend2 = plt.legend(
    [duplicate_legend], ["In red: duplicate words"],
    loc='upper center', bbox_to_anchor=(0.5, -0.12), frameon=False,
)

# Creating legend2 replaced legend1 on the axes, so add legend1 back
plt.gca().add_artist(legend1)

plt.title("UMAP Visualization of Word Embeddings")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Write the figure to disk, then show it
plt.savefig('umap_word_embeddings.png', dpi=600, bbox_inches='tight')
plt.show()
```

# Similarity Counts Heatmap

``` {python}
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Start from a clean figure and apply the plot style used for the report figures
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.family'] = 'serif'

df = similarity_df_filtered

# Number of rows (similar word pairs) for every (Set 1, Set 2) combination
pair_sizes = df.groupby(["Set 1", "Set 2"]).size()
count_table = pair_sizes.reset_index(name="Shared Word Count")

# Matrix form: rows are "Set 1", columns are "Set 2", missing combinations count as 0
pivot_count = count_table.pivot(index="Set 1", columns="Set 2", values="Shared Word Count")
pivot_count = pivot_count.fillna(0)

# Use the column labels on both axes so that the matrix is square
set_labels = pivot_count.columns
pivot_count = pivot_count.reindex(index=set_labels, columns=set_labels, fill_value=0)

# Hide the cells strictly above the diagonal (k=1 keeps the diagonal visible)
mask = np.triu(np.ones(pivot_count.shape, dtype=bool), k=1)

# Draw the annotated heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(
    pivot_count,
    annot=True,
    mask=mask,
    cmap="Blues",
    cbar_kws={'label': 'Number of Similar Words'},
)

plt.title("Heatmap of Similar Words between Literature")
plt.xlabel("Literature")
plt.ylabel("Literature")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('word_counts.png', dpi=300, bbox_inches='tight')
plt.show()
```

# Barplot

``` {python}
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Clear any existing plots and set plot style
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.family'] = 'serif'

# Data preparation
df = similarity_df_filtered

# Group by Set 1 and Set 2 and count the number of shared words
count_table = df.groupby(["Set 1", "Set 2"]).size().reset_index(name="Shared Word Count")

# Remove redundant comparisons (keeping only Set1 < Set2)
count_table = count_table[count_table["Set 1"] < count_table["Set 2"]]

# Create all possible combinations of Set 1 and Set 2
all_sets = pd.MultiIndex.from_product([count_table["Set 1"].unique(), count_table["Set 2"].unique()], names=["Set 1", "Set 2"])

# Create a DataFrame with all combinations and zero counts
count_table_full = pd.DataFrame(index=all_sets).reset_index()

# Merge count_table_full with the original count_table to get the actual shared word counts
count_table_full = pd.merge(count_table_full, count_table, on=["Set 1", "Set 2"], how="left").fillna(0)

# Create the bar plot (dodge=True): "Set 1" on the x axis, counts on the y axis, one hue per "Set 2"
plt.figure(figsize=(12, 8))
ax = sns.barplot(x="Set 1", y="Shared Word Count", hue="Set 2", data=count_table_full, palette="Set2", width=0.8, dodge=True)

# Title and labels: the x axis carries the sets and the y axis the counts
plt.title("Barplot of Similar Words Between Literature (Upper 70% similarity threshold)")
plt.xlabel("Literature")
plt.ylabel("Number of Similar Words")

# Add value labels above the bars, in the colour of the bars.
# bar_label annotates a whole container at once, so it is called once per container;
# calling it for every bar would draw the same labels on top of each other.
for container in ax.containers:
    if not container.patches:
        continue
    bar_color = container.patches[0].get_facecolor()
    ax.bar_label(container, label_type="edge", padding=3, fontsize=12, color=bar_color, fontweight='bold')

# Change the legend title and position it at the bottom horizontally
# (at least one column, even when no pair is left after filtering)
plt.legend(title="Literature", loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=max(1, len(count_table['Set 2'].unique())))

# Adjust layout to prevent clipping and save the plot
plt.tight_layout()
plt.savefig('shared_words_barplot_with_labels.png', dpi=300, bbox_inches='tight')
plt.show()

```

# Table showing what words are related to words in other papers

``` {python}
# Number of rows (similar word pairs) for every (Set 1, Set 2) combination
count_table = df.groupby(["Set 1", "Set 2"]).size().reset_index(name="Shared Word Count")

# Comma-separated, sorted, de-duplicated words of each pair of sets, one entry per
# row of count_table: words_set1 comes from "Word 1", words_set2 from "Word 2"
words_set1 = []
words_set2 = []
for set1, set2 in zip(count_table['Set 1'], count_table['Set 2']):
    # Rows of the similarity table that belong to exactly this pair of sets
    pair_mask = (df['Set 1'] == set1) & (df['Set 2'] == set2)
    shared_rows = df[pair_mask]

    set1_words = sorted(shared_rows['Word 1'].unique())
    set2_words = sorted(shared_rows['Word 2'].unique())

    words_set1.append(", ".join(set1_words))
    words_set2.append(", ".join(set2_words))

# Attach the word lists as new columns
count_table['Words From Set 1'] = words_set1
count_table['Words From Set 2'] = words_set2

# Show the full table
print(count_table)


def _unordered_pair(row):
    """Return the two set names of a row as an alphabetically sorted tuple."""
    return tuple(sorted([row['Set 1'], row['Set 2']]))


# Order-independent key, so that (A, B) and (B, A) count as the same pair
count_table['Set Pair'] = count_table.apply(_unordered_pair, axis=1)

# Keep the first row of every unordered pair, then drop the helper column again
count_table_filtered = (
    count_table
    .drop_duplicates(subset=['Set Pair'])
    .drop(columns=['Set Pair'])
)

# Show the de-duplicated table
print(count_table_filtered)
```

``` {python}

##############################COLORIZE TABLE############################

import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import PatternFill, Font

# Row backgrounds used alternately by assign_colors (hex RGB digits, no leading '#')
color_palette = ['ADD8E6', 'FFFFFF']  # light blue, white

# Module-level state that assign_colors reads and updates from row to row
word1_to_color = {}
color_index = 0
previous_word = previous_color = None

# "Marker" column: "*" where "Word 1" and "Word 2" are the same word, "" otherwise
similarity_df_filtered['Marker'] = (
    similarity_df_filtered['Word 1'] == similarity_df_filtered['Word 2']
).map({True: '*', False: ''})

# Row styler: rows that share the previous row's "Word 1" keep its background colour
def assign_colors(row):
    """Return one CSS declaration per cell of ``row`` (for ``Styler.apply(axis=1)``).

    Whenever "Word 1" differs from the previously styled row, the next entry of
    ``color_palette`` is taken (cycling through the palette); otherwise the
    previous colour is reused. The state lives in the module-level names
    ``previous_word``, ``previous_color`` and ``color_index``, which this
    function updates, so the result depends on the order in which rows are
    styled.

    Args:
        row: Mapping or Series with a "Word 1" entry; its last cell is expected
            to be the "Marker" column.

    Returns:
        A list with ``len(row)`` CSS strings: a background colour for every
        cell except the last one, which gets bold black text.
    """
    global previous_word, previous_color, color_index

    word1 = row['Word 1']
    if word1 != previous_word:
        # New word: advance to the next palette colour and remember it
        previous_color = color_palette[color_index % len(color_palette)]
        previous_word = word1
        color_index += 1

    if previous_color is None:
        # No colour has been assigned yet; emit no background rather than invalid CSS
        background = ''
    else:
        # The palette stores bare hex digits, but CSS hex colours need a leading '#'
        background = f"background-color: #{previous_color.lstrip('#')}"

    return [background] * (len(row) - 1) + ['font-weight: bold; color: black;']

# Apply the function to style the DataFrame
# NOTE(review): Styler.apply appears to only register assign_colors here; pandas runs it
# row by row when the Styler is rendered, so every render advances the module-level
# colour trackers (previous_word / previous_color / color_index) — confirm the render order.
styled_similarity_df = similarity_df_filtered.style.apply(assign_colors, axis=1)

# Display the styled DataFrame
# (bare expression: only shown if the chunk engine echoes expression values)
styled_similarity_df
# Write the rendered table to an HTML file in the working directory
styled_similarity_df.to_html('similarity_table.html')

######################## EXCEL ###################
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import PatternFill


# Fresh workbook; its active sheet will hold the similarity table
wb = Workbook()
ws = wb.active
ws.title = "Similarity Results"

# Header row: the DataFrame's column labels, in their current order
headers = [*similarity_df_filtered.columns]
ws.append(headers)

# Row backgrounds for the Excel export, used alternately (hex RGB digits, no '#')
excel_palette = [
    'FFFFFF',  # white
    'cfebff',  # pale blue
]

# Module-level state read and updated by get_color()
word1_to_color = {}
color_index = 0
previous_word = previous_color = None


def get_color(word1):
    """Return the fill colour for a row whose "Word 1" is ``word1``.

    A call with the same word as the previous call returns the previous colour
    again; a different word takes the next ``excel_palette`` entry (cycling)
    and becomes the new "previous" word. Updates the module-level trackers
    ``previous_word``, ``previous_color`` and ``color_index``.
    """
    global previous_word, previous_color, color_index
    if word1 != previous_word:
        previous_color = excel_palette[color_index % len(excel_palette)]
        previous_word = word1
        color_index += 1
    return previous_color

# Populate the Excel sheet with data and apply conditional coloring.
# "Marker" is normally already a DataFrame column (and therefore already in the header
# row), so it must not be appended a second time; only when it is missing do we add
# both the header cell and the per-row value, keeping header and data aligned.
columns = list(similarity_df_filtered.columns)
if 'Marker' not in columns:
    columns.append('Marker')
    ws.cell(row=1, column=len(columns), value='Marker')
marker_col = columns.index('Marker') + 1  # openpyxl columns are 1-based

for _, row in similarity_df_filtered.iterrows():
    # Rows sharing the previous row's "Word 1" keep the same background colour
    color_hex = get_color(row['Word 1'])
    fill = PatternFill(start_color=color_hex, end_color=color_hex, fill_type="solid")

    # "*" flags rows where both words are identical
    marker = "*" if row['Word 1'] == row['Word 2'] else ""
    row_values = [marker if name == 'Marker' else row[name] for name in columns]

    # Write the row, then colour every cell in it
    ws.append(row_values)
    excel_row = ws.max_row
    for col_idx in range(1, len(row_values) + 1):
        ws.cell(row=excel_row, column=col_idx).fill = fill

    # Emphasise the marker cell when the row is flagged
    if marker:
        ws.cell(row=excel_row, column=marker_col).font = Font(bold=True)

# Save the workbook
wb.save("similarity_results_colored.xlsx")
```

# Trying to collapse the Excel table

``` {python}
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import PatternFill, Font
from openpyxl.utils import get_column_letter

# Fresh workbook whose active sheet holds the collapsed table
wb = Workbook()
ws = wb.active
ws.title = "Collapsed Similarity Results"

# Header row: set names and words are each merged into a single column
headers = ["Set Pair", "Word Pair", "Cosine Similarity", "Marker"]
ws.append(headers)

# Cell backgrounds used alternately (hex RGB digits, no '#')
excel_palette = [
    'ADD8E6',  # light blue
    'FFFFFF',  # white
]

# Module-level state read and updated by get_color()
word1_to_color = {}
color_index = 0
previous_word = previous_color = None


def get_color(word1):
    """Return the fill colour for ``word1``, switching colour only when the word changes.

    Calling with the same word as the previous call returns the previous colour
    again; a different word takes the next ``excel_palette`` entry (cycling)
    and becomes the new "previous" word. Updates the module-level trackers
    ``previous_word``, ``previous_color`` and ``color_index``.
    """
    global previous_word, previous_color, color_index
    if word1 != previous_word:
        previous_color = excel_palette[color_index % len(excel_palette)]
        previous_word = word1
        color_index += 1
    return previous_color

# Populate the sheet with merged "Set Pair" and "Word Pair" columns
for _, row in similarity_df_filtered.iterrows():
    set_pair = f"{row['Set 1']} - {row['Set 2']}"
    word_pair = f"{row['Word 1']} - {row['Word 2']}"
    cosine_similarity = row['Cosine Similarity']
    # "*" flags rows where both words are identical
    marker = "*" if row['Word 1'] == row['Word 2'] else ""

    # Append collapsed data to Excel sheet and remember which row it landed on
    ws.append([set_pair, word_pair, cosine_similarity, marker])
    current_row = ws.max_row

    # get_color is stateful (it compares against the word of the previous call), so it
    # is called exactly once per row with "Word 1". A second call with "Word 2" would
    # overwrite that state and break the grouping of consecutive rows by "Word 1".
    word1_color = get_color(row['Word 1'])

    # A cell has a single fill, so the whole "Word Pair" cell takes Word 1's colour
    ws.cell(row=current_row, column=2).fill = PatternFill(
        start_color=word1_color, end_color=word1_color, fill_type="solid"
    )

    # Apply bold font to the "Marker" column if it has "*"
    if marker:
        ws.cell(row=current_row, column=4).font = Font(bold=True)

# Adjust column width for readability
for col in range(1, ws.max_column + 1):
    ws.column_dimensions[get_column_letter(col)].width = 20

# Save the workbook
wb.save("collapsed_similarity_results_colored.xlsx")
```

# Trying to collapse the Excel table even more

``` {python}
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.utils import get_column_letter

# Fresh workbook whose active sheet gets one row per set pair
wb = Workbook()
ws = wb.active
ws.title = "Collapsed Similarity Results"

# Header row of the two-column table
headers = ["Set Pair", "Word Pair"]
ws.append(headers)

# Font colours handed out to words, in this order (hex RGB digits, no '#')
excel_palette = [
    'FF6384',  # pink-red
    '36A2EB',  # blue
    'FFCE56',  # yellow
    '4BC0C0',  # teal
    '9966FF',  # purple
    'FF9F40'   # orange
]

# Next palette position to hand out, and the colour already given to each word
color_index = 0
word_colors = {}


def get_word_color(word):
    """Return the colour assigned to ``word``, assigning one on first sight.

    A word keeps its colour for all later calls. New words receive palette
    entries in order, wrapping around once the palette is exhausted, so
    different words can share a colour. Updates the module-level
    ``word_colors`` and ``color_index``.
    """
    global color_index
    if word in word_colors:
        return word_colors[word]
    assigned = excel_palette[color_index % len(excel_palette)]
    color_index += 1
    word_colors[word] = assigned
    return assigned

# Group the word pairs by set pair:
# "<Set 1> - <Set 2>" -> list of (Word 1, Word 2, Cosine Similarity) tuples,
# in the row order of similarity_df_filtered
set_pairs_dict = {}

for _, row in similarity_df_filtered.iterrows():
    set_pair = f"{row['Set 1']} - {row['Set 2']}"
    # setdefault creates the list the first time a set pair is seen
    set_pairs_dict.setdefault(set_pair, []).append(
        (row['Word 1'], row['Word 2'], row['Cosine Similarity'])
    )

# Annotate each word of a " - "-separated string with its colour code
def apply_word_colors(word_pair_str, word_colors):
    """Return ``word_pair_str`` with every word followed by its colour in parentheses.

    Despite the name, no formatting is applied: the function only builds a
    plain string such as ``"cat (FF6384) - dog (000000)"``.

    Args:
        word_pair_str: Words joined by ``" - "``.
        word_colors: Mapping from word to a hex colour string.

    Returns:
        The words re-joined by ``" - "``, each as ``"<word> (<color>)"``; words
        missing from ``word_colors`` get ``'000000'`` (black).
    """
    return " - ".join(
        f"{word} ({word_colors.get(word, '000000')})"
        for word in word_pair_str.split(" - ")
    )

# Write one row per set pair: all of its words in a single cell, each word in its own
# colour and bold when it was matched with an identical word.
#
# A cell has only one Font, so per-word formatting needs rich text (openpyxl >= 3.1).
# When rich text is unavailable we fall back to one whole-cell font.
try:
    from openpyxl.cell.rich_text import CellRichText, TextBlock
    from openpyxl.cell.text import InlineFont
except ImportError:
    CellRichText = TextBlock = InlineFont = None

for set_pair, word_pairs in set_pairs_dict.items():
    # Register a colour for every word, in order of first appearance
    for word1, word2, _ in word_pairs:
        get_word_color(word1)
        get_word_color(word2)

    # Unique words in first-seen order (dict keys preserve insertion order)
    combined_words = list(dict.fromkeys(
        word for word1, word2, _ in word_pairs for word in (word1, word2)
    ))
    word_pair_str = " - ".join(combined_words)

    # Words that appear in a pair whose two members are identical
    identical_words = {word1 for word1, word2, _ in word_pairs if word1 == word2}

    # Add the set pair and word list to the sheet, then format the word cell
    ws.append([set_pair, word_pair_str])
    current_cell = ws.cell(row=ws.max_row, column=2)

    if CellRichText is not None:
        # Rebuild the same text as coloured runs separated by plain " - "
        rich_text = CellRichText()
        for position, word in enumerate(combined_words):
            if position:
                rich_text.append(" - ")
            word_font = InlineFont(
                color=word_colors.get(word, '000000'),  # black if no colour is known
                b=word in identical_words,
            )
            rich_text.append(TextBlock(word_font, word))
        current_cell.value = rich_text
    else:
        # Fallback: one font for the whole cell (last word's colour, bold if the
        # set pair contains any identical pair)
        current_cell.font = Font(
            color=word_colors.get(combined_words[-1], '000000'),
            bold=bool(identical_words),
        )

# Save the workbook to a file
wb.save("colored_word_pairs.xlsx")

print("Excel file with colored words and bold identical words has been created!")


```

``` {python}
# Reset the module-level state that assign_colors carries from one row to the next
word1_to_color = {}
color_index = 0
previous_word = previous_color = None

# "Marker" column: "*" where "Word 1" and "Word 2" are the same word, "" otherwise
similarity_df_filtered['Marker'] = (
    similarity_df_filtered['Word 1'] == similarity_df_filtered['Word 2']
).map({True: '*', False: ''})

# Row styler: rows that share the previous row's "Word 1" keep its background colour
def assign_colors(row):
    """Return one CSS declaration per cell of ``row`` (for ``Styler.apply(axis=1)``).

    Whenever "Word 1" differs from the previously styled row, the next entry of
    ``color_palette`` is taken (cycling through the palette); otherwise the
    previous colour is reused. The state lives in the module-level names
    ``previous_word``, ``previous_color`` and ``color_index``, which this
    function updates, so the result depends on the order in which rows are
    styled.

    Args:
        row: Mapping or Series with a "Word 1" entry; its last cell is expected
            to be the "Marker" column.

    Returns:
        A list with ``len(row)`` CSS strings: a background colour for every
        cell except the last one, which gets bold black text.
    """
    global previous_word, previous_color, color_index

    word1 = row['Word 1']
    if word1 != previous_word:
        # New word: advance to the next palette colour and remember it
        previous_color = color_palette[color_index % len(color_palette)]
        previous_word = word1
        color_index += 1

    if previous_color is None:
        # No colour has been assigned yet; emit no background rather than invalid CSS
        background = ''
    else:
        # The palette stores bare hex digits, but CSS hex colours need a leading '#'
        background = f"background-color: #{previous_color.lstrip('#')}"

    return [background] * (len(row) - 1) + ['font-weight: bold; color: black;']

# Apply the function to style the DataFrame
# NOTE(review): Styler.apply appears to only register assign_colors here; pandas runs it
# row by row when the Styler is rendered, so every render advances the module-level
# colour trackers (previous_word / previous_color / color_index) — confirm the render order.
styled_similarity_df = similarity_df_filtered.style.apply(assign_colors, axis=1)

# Display the styled DataFrame
# (bare expression: only shown if the chunk engine echoes expression values)
styled_similarity_df
# Write the rendered table to an HTML file in the working directory
styled_similarity_df.to_html('similarity_table.html')
```