In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from itertools import product

## Shortlisted 100 concepts

In [5]:
# Load the concept table (CUI, English term, component scores and the stored Goodness Score)
data = pd.read_csv('survey.csv')

In [6]:
# Preview the first two rows to check which columns are available
data.head(2)

Unnamed: 0,CUI,English Term,Semantic Type,German Equivalent,Translation Quality Score,Frequency of Occurrence,Word_Count,Normalized Word Count,Valid Medical Concept,POS Tags,Combination Score,Frequency of Occurrence Score,Goodness Score
0,C2700217,functional,"Conceptual Entity,Functional Concept",funktional,1.0,16056760,1,0.98,True,['adjective'],0.3,0.604134,0.687034
1,C0680730,determination,Governmental or Regulatory Activity,Bestimmung,1.0,12611783,1,0.98,True,['noun'],0.4,0.474517,0.684629


## Dentist's Baseline

In [7]:
# Load the dentist's rating for each surveyed concept
survey_results = pd.read_csv('survey_results.csv')

In [8]:
# Preview the first two rows of the survey ratings
survey_results.head(2)

Unnamed: 0,concept,rating
0,tobacco smoke pollution,Bad
1,obstetric goals: ensuring proper intrauterine ...,Good


## Trying out all the combinations (Grid Search)

In [9]:
# Translate the dentist's categorical ratings into numbers
rating_map = dict(Good=1, Moderate=0.5, Bad=0)
survey_results['Rating'] = survey_results['rating'].map(rating_map)

# Attach each rating to its concept row by matching the English term to the surveyed concept name
merged_data = data.merge(survey_results, left_on='English Term', right_on='concept')

In [13]:
# Display the merged frame: concept features joined with the dentist's rating and its numeric mapping
merged_data

Unnamed: 0,CUI,English Term,Semantic Type,German Equivalent,Translation Quality Score,Frequency of Occurrence,Word_Count,Normalized Word Count,Valid Medical Concept,POS Tags,Combination Score,Frequency of Occurrence Score,Goodness Score,concept,rating,Rating
0,C5652313,vertebrata cymatophila,Vertebrate,Wirbelkomatophila,1.0,26563041,2,0.96,False,[],0.0,0.999432,0.691858,vertebrata cymatophila,Moderate,0.5
1,C0042567,subphylum vertebrata,Vertebrate,Subphylumwirbel,1.0,26562951,2,0.96,False,[],0.0,0.999428,0.691857,subphylum vertebrata,Moderate,0.5
2,C0086418,"human, general",Human,"Mensch, allgemein",1.0,22279537,2,0.96,False,[],0.0,0.838265,0.651566,"human, general",Good,1.0
3,C0086418,man and woman,Human,Mann und Frau,1.0,22235747,3,0.94,False,[],0.0,0.836618,0.647154,man and woman,Good,1.0
4,C0086418,human - origin (qualifier value),Human,Mensch - Ursprung (Qualifikationswert),1.0,22234334,5,0.9,False,[],0.0,0.836565,0.639141,human - origin (qualifier value),Good,1.0
5,C0040334,tobacco smoke pollution,Environmental Effect of Humans,Tabakrauchverschmutzung,1.0,15926,3,0.94,True,"['noun', 'verb', 'noun']",0.75,0.000599,0.66315,tobacco smoke pollution,Bad,0.0
6,C0220781,synthesis,Biologic Function,Synthese,1.0,8065385,1,0.98,True,['noun'],0.4,0.303459,0.641865,synthesis,Good,1.0
7,C0700276,anatomy,Anatomical Structure,Anatomie,1.0,5801874,1,0.98,True,['noun'],0.4,0.218295,0.620574,anatomy,Good,1.0
8,C1705428,group,Conceptual Entity,Gruppe,1.0,5672619,1,0.98,True,['noun'],0.4,0.213432,0.619358,group,Good,1.0
9,C0564500,phobic avoidance - mental defence mechanism,Mental Process,Phobische Vermeidung - Mechanismus zur mentale...,0.0,5,6,0.88,False,[],0.0,0.0,0.176,phobic avoidance - mental defence mechanism,Moderate,0.5


## Step 2: Define the Objective

Define a function that computes goodness scores as a weighted sum of the four component scores, and a loss function (mean squared error) that measures how closely those scores match the dentist's ratings:

In [11]:
def calculate_goodness_scores(df, weights):
    """Combine the four component scores of ``df`` into one weighted sum.

    Parameters
    ----------
    df : DataFrame-like
        Must provide the columns 'Combination Score', 'Normalized Word Count',
        'Translation Quality Score' and 'Frequency of Occurrence Score'.
    weights : mapping
        Weight for each of those columns, keyed by the column name.

    Returns
    -------
    The element-wise weighted sum of the four columns.
    """
    combination_part = df['Combination Score'] * weights['Combination Score']
    word_count_part = df['Normalized Word Count'] * weights['Normalized Word Count']
    translation_part = df['Translation Quality Score'] * weights['Translation Quality Score']
    frequency_part = df['Frequency of Occurrence Score'] * weights['Frequency of Occurrence Score']
    return combination_part + word_count_part + translation_part + frequency_part

def loss_function(predicted_scores, actual_ratings):
    """Return the mean squared error between predicted scores and ratings.

    Parameters
    ----------
    predicted_scores : array-like
        Goodness scores produced by a candidate set of weights.
    actual_ratings : array-like
        Numeric ratings the scores are compared against.

    Returns
    -------
    The value returned by ``sklearn.metrics.mean_squared_error``.
    """
    # scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
    # ratings go first. MSE is symmetric, hence the value is unchanged, but the
    # order matters if this metric is ever swapped for an asymmetric one.
    return mean_squared_error(actual_ratings, predicted_scores)

## Perform Grid Search

Define the candidate values for each weight (0 to 1 in steps of 0.1), keep only the combinations that sum to 1, and run a grid search to find the combination with the lowest loss:

In [12]:
# Candidate values for every weight: 11 evenly spaced points from 0 to 1
weight_range = np.linspace(0, 1, 11)
weight_combinations = list(product(weight_range, repeat=4))

# Only weight vectors that sum to 1 (within floating-point tolerance) are searched
valid_weight_combinations = [
    comb
    for comb in weight_combinations
    if np.isclose(sum(comb), 1)
]

# Position i of a combination is the weight for grid_feature_names[i]
grid_feature_names = (
    'Combination Score',
    'Normalized Word Count',
    'Translation Quality Score',
    'Frequency of Occurrence Score',
)

# Exhaustive search; the strict comparison keeps the first combination that reaches the minimum
best_loss = float('inf')
best_weights = None

for weights in valid_weight_combinations:
    weight_dict = dict(zip(grid_feature_names, weights))
    predicted_scores = calculate_goodness_scores(merged_data, weight_dict)
    loss = loss_function(predicted_scores, merged_data['Rating'])
    if loss < best_loss:
        best_loss, best_weights = loss, weight_dict

print(f"Best Weights: {best_weights}")
print(f"Best Loss: {best_loss}")

Best Weights: {'Combination Score': 0.1, 'Normalized Word Count': 0.4, 'Translation Quality Score': 0.2, 'Frequency of Occurrence Score': 0.30000000000000004}
Best Loss: 0.1147805475393041


## Number of valid combinations

In [15]:

# Candidate values for every weight: 11 evenly spaced points from 0 to 1
weight_range = np.linspace(0, 1, 11)

# Cartesian product of the candidate values over the four weights
weight_combinations = [*product(weight_range, repeat=4)]

# Retain only the tuples whose entries sum to 1 (within floating-point tolerance)
valid_weight_combinations = []
for comb in weight_combinations:
    if np.isclose(sum(comb), 1):
        valid_weight_combinations.append(comb)

print(f"Total combinations: {len(weight_combinations)}")
print(f"Valid combinations: {len(valid_weight_combinations)}")

Total combinations: 14641
Valid combinations: 286


## Bayesian Optimization

In [17]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyaml>=16.9
  Downloading pyaml-24.7.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.7.0 scikit-optimize-0.10.2
Note: you may need to restart the kernel to use updated packages.


## Dr Koppikar

In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from skopt import gp_minimize
from skopt.space import Real

# Read the dentist's survey ratings and the concept table
survey_results = pd.read_csv('survey_results.csv')
data = pd.read_csv('survey.csv')

# Translate the categorical ratings in the 'rating' column into numbers
rating_map = dict(Good=1, Moderate=0.5, Bad=0)
survey_results['Rating'] = survey_results['rating'].map(rating_map)

# Attach each rating to its concept row by matching the English term to the surveyed concept name
merged_data = data.merge(survey_results, left_on='English Term', right_on='concept')

# Function to calculate goodness scores
def calculate_goodness_scores(weights):
    """Weighted sum of the four component-score columns of ``merged_data``.

    ``weights`` is indexed positionally: 0 = 'Combination Score',
    1 = 'Normalized Word Count', 2 = 'Translation Quality Score',
    3 = 'Frequency of Occurrence Score'.
    """
    combination_part = merged_data['Combination Score'] * weights[0]
    word_count_part = merged_data['Normalized Word Count'] * weights[1]
    translation_part = merged_data['Translation Quality Score'] * weights[2]
    frequency_part = merged_data['Frequency of Occurrence Score'] * weights[3]
    return combination_part + word_count_part + translation_part + frequency_part

# Objective function for Bayesian optimization
def objective(weights):
    """Return the MSE between weighted goodness scores and the numeric ratings.

    Parameters
    ----------
    weights : sequence of 4 floats
        Raw (unnormalised) weights proposed by the optimiser, in the positional
        order expected by ``calculate_goodness_scores``.

    Returns
    -------
    The mean squared error of the predicted scores against
    ``merged_data['Rating']``.
    """
    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    # The search space allows every weight to be 0. Normalising that point is
    # 0/0, which yields NaN scores and makes mean_squared_error raise. Such a
    # vector is left unnormalised, so it simply predicts a score of 0.
    if total != 0:
        weights = weights / total  # Ensure weights sum to 1
    predicted_scores = calculate_goodness_scores(weights)
    # scikit-learn's signature is mean_squared_error(y_true, y_pred)
    loss = mean_squared_error(merged_data['Rating'], predicted_scores)
    return loss

# Search space: each of the four raw weights ranges over [0, 1]
space = [Real(0, 1) for _ in range(4)]

# Run the Bayesian optimisation of the objective (seeded so the run is repeatable)
result = gp_minimize(objective, space, n_calls=100, random_state=0)

# Lowest loss found, and the corresponding raw weights rescaled to sum to 1
best_loss = result.fun
best_weights = result.x / np.sum(result.x)

# Pair each normalised weight with the feature it multiplies
weight_names = [
    'Combination Score',
    'Normalized Word Count',
    'Translation Quality Score',
    'Frequency of Occurrence Score',
]
best_weights_named = {name: weight for name, weight in zip(weight_names, best_weights)}

print(f"Best Weights: {best_weights_named}")
print(f"Best Loss: {best_loss}")

Best Weights: {'Combination Score': 0.08637652379821915, 'Normalized Word Count': 0.43002990156415366, 'Translation Quality Score': 0.21433350360059686, 'Frequency of Occurrence Score': 0.26926007103703026}
Best Loss: 0.11420094950924083


## Dr Kamlesh

In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from skopt import gp_minimize
from skopt.space import Real

# Read the survey ratings and the concept table
survey_results = pd.read_csv('survey_results.csv')
data = pd.read_csv('survey.csv')

# Translate the categorical ratings in the 'rating pcp' column into numbers
# NOTE(review): the survey_results preview earlier in this notebook does not show a
# 'rating pcp' column; confirm survey_results.csv contains it
rating_map = dict(Good=1, Moderate=0.5, Bad=0)
survey_results['Rating'] = survey_results['rating pcp'].map(rating_map)

# Attach each rating to its concept row by matching the English term to the surveyed concept name
merged_data = data.merge(survey_results, left_on='English Term', right_on='concept')

# Function to calculate goodness scores
def calculate_goodness_scores(weights):
    """Weighted sum of the four component-score columns of ``merged_data``.

    ``weights`` is indexed positionally: 0 = 'Combination Score',
    1 = 'Normalized Word Count', 2 = 'Translation Quality Score',
    3 = 'Frequency of Occurrence Score'.
    """
    combination_part = merged_data['Combination Score'] * weights[0]
    word_count_part = merged_data['Normalized Word Count'] * weights[1]
    translation_part = merged_data['Translation Quality Score'] * weights[2]
    frequency_part = merged_data['Frequency of Occurrence Score'] * weights[3]
    return combination_part + word_count_part + translation_part + frequency_part

# Objective function for Bayesian optimization
def objective(weights):
    """Return the MSE between weighted goodness scores and the numeric ratings.

    Parameters
    ----------
    weights : sequence of 4 floats
        Raw (unnormalised) weights proposed by the optimiser, in the positional
        order expected by ``calculate_goodness_scores``.

    Returns
    -------
    The mean squared error of the predicted scores against
    ``merged_data['Rating']``.
    """
    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    # The search space allows every weight to be 0. Normalising that point is
    # 0/0, which yields NaN scores and makes mean_squared_error raise. Such a
    # vector is left unnormalised, so it simply predicts a score of 0.
    if total != 0:
        weights = weights / total  # Ensure weights sum to 1
    predicted_scores = calculate_goodness_scores(weights)
    # scikit-learn's signature is mean_squared_error(y_true, y_pred)
    loss = mean_squared_error(merged_data['Rating'], predicted_scores)
    return loss

# Search space: each of the four raw weights ranges over [0, 1]
space = [Real(0, 1) for _ in range(4)]

# Run the Bayesian optimisation of the objective (seeded so the run is repeatable)
result = gp_minimize(objective, space, n_calls=100, random_state=0)

# Lowest loss found, and the corresponding raw weights rescaled to sum to 1
best_loss = result.fun
best_weights = result.x / np.sum(result.x)

# Pair each normalised weight with the feature it multiplies
weight_names = [
    'Combination Score',
    'Normalized Word Count',
    'Translation Quality Score',
    'Frequency of Occurrence Score',
]
best_weights_named = {name: weight for name, weight in zip(weight_names, best_weights)}

print(f"Best Weights: {best_weights_named}")
print(f"Best Loss: {best_loss}")

Best Weights: {'Combination Score': 0.20169325928914209, 'Normalized Word Count': 0.4979151731231969, 'Translation Quality Score': 0.10861818897801598, 'Frequency of Occurrence Score': 0.19177337860964502}
Best Loss: 0.13959970708119282
