In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,recall_score,precision_score,confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
unlabeled_df = pd.read_csv('unlabeled_pool.csv')


In [3]:
#Remove multiword phrases  e.g ("good afternoon" )
unlabeled_df = unlabeled_df[~unlabeled_df['headword'].str.contains(r'\s')]

In [4]:
# Extract the first form from slash-separated words (e.g., "color/colour" → "color")
unlabeled_df['headword'] = unlabeled_df['headword'].str.split('/').str[0]

In [5]:
unlabeled_df = unlabeled_df.drop_duplicates(subset='headword')

In [6]:
from nltk.corpus import wordnet as wn
def max_depth(word):
    synsets = wn.synsets(word)
    return max([len(hyp_path) for s in synsets for hyp_path in s.hypernym_paths()] or [0])


In [7]:
unlabeled_df['wordnet_depth'] = unlabeled_df['headword'].apply(max_depth)

In [8]:
import random

# Map CEFR to numeric complexity
cefr_map = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}
unlabeled_pool = unlabeled_df.copy()
unlabeled_pool = unlabeled_pool[unlabeled_pool['CEFR'].isin(cefr_map)]  # drop NaNs or unknowns
unlabeled_pool['cefr_score'] = unlabeled_pool['CEFR'].map(cefr_map)

In [9]:


# Dictionary to store user labels
train_word_labels = {}

# Loop: exactly 30 user-labeled words
while len(train_word_labels) < 50 and not unlabeled_pool.empty:
    num_labeled = len(train_word_labels)
    num_simple = sum(1 for lbl in train_word_labels.values() if lbl == 0)
    simple_ratio = num_simple / (num_labeled + 1e-6)

    # Adjust sampling strategy based on how many "0" (simple) were labeled
    if simple_ratio < 0.5:
        # Balanced or complex leaning — random sample
        sample_pool = unlabeled_pool.sample(n=10, random_state=random.randint(0, 10000))
    else:
        # User thinks many words are simple — bias toward complex
        top_half = unlabeled_pool.sort_values(by='cefr_score', ascending=False)
        sample_pool = top_half.head(30).sample(n=10, random_state=random.randint(0, 10000))

    # Choose one word from the sample
    row = sample_pool.sample(1, random_state=random.randint(0, 10000)).iloc[0]
    word = row['headword']

    # Ask user to annotate
    while True:
        label = input(f"Label word {num_labeled+1}/30 — Enter 0 (simple) or 1 (complex) for '{word}': ")
        if label in ['0', '1']:
            train_word_labels[word] = int(label)
            # Remove word from pool
            unlabeled_pool = unlabeled_pool[unlabeled_pool['headword'] != word]
            break
        else:
            print("❌ Invalid input. Please enter 0 or 1.")

print("\n✅ Done! You labeled 30 words.")
print(train_word_labels)



✅ Done! You labeled 30 words.
{'facial': 0, 'solicit': 1, 'perceive': 0, 'proprietary': 1, 'naughty': 0, 'imperceptible': 1, 'glorious': 0, 'connotation': 1, 'temptation': 0, 'archetype': 1, 'exclusion': 0, 'exaltedly': 1, 'inevitable': 0, 'venomous': 0, 'exalted': 1, 'hitherto': 1, 'page': 0, 'catastrophically': 0, 'indolence': 1, 'anthology': 1, 'cancer': 0, 'philanthropy': 0, 'chronologically': 0, 'drabness': 1, 'melancholy': 0, 'precocious': 1, 'innuendo': 1, 'conduit': 1, 'ginger': 0, 'imperceptibly': 1, 'chase': 0, 'philistine': 1, 'burglary': 0, 'philanthropist': 0, 'chronology': 0, 'contraption': 1, 'induce': 1, 'painstaking': 1, 'strawberry': 0, 'melodious': 0, 'brink': 1, 'angst': 1, 'advantage': 0, 'deviantly': 1, 'muscle-bound': 1, 'trophy': 0, 'supporter': 0, 'lyrical': 0, 'enabler': 1, 'fluctuation': 0}


In [10]:
unlabeled_features = unlabeled_df.copy()
if 'is_complex' in unlabeled_features.columns:
    unlabeled_features = unlabeled_features.drop(columns=['is_complex'])

# Step 3 — filter for the labeled words
labeled_words_df = unlabeled_features[
    unlabeled_features['headword'].isin(train_word_labels.keys())
].copy()

# Step 4 — map the labels into a new column
labeled_words_df['is_complex'] = labeled_words_df['headword'].map(train_word_labels)

# This is now your training set
train_df = labeled_words_df.reset_index(drop=True)

# Optional: Show the result
print(train_df[['headword', 'wordnet_depth', 'freq', 'is_complex','len']].head(40))

print(len(train_df))


            headword  wordnet_depth  freq  is_complex  len
0          advantage              7  4.78           0    9
1           burglary             11  3.44           0    8
2             cancer             13  4.93           0    6
3              chase             13  4.44           0    5
4          exclusion             10  3.68           0    9
5             ginger             10  3.93           0    6
6           glorious              1  4.08           0    8
7         inevitable              7  4.08           0   10
8            naughty              1  3.83           0    7
9               page             12  5.12           0    4
10          perceive              2  3.72           0    8
11        strawberry             12  3.72           0   10
12         supporter             10  4.07           0    9
13        temptation              8  3.72           0   10
14            trophy              7  4.17           0    6
15            facial             12  4.10           0   

<h1> Load the test and train words for the user <h1>

In [11]:
import random

# Map CEFR to numeric complexity
cefr_map = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}


# Dictionary to store user labels
test_word_labels = {}

# Loop: exactly 30 user-labeled words
while len(test_word_labels) < 30 and not unlabeled_pool.empty:
    num_labeled = len(test_word_labels)
    num_simple = sum(1 for lbl in test_word_labels.values() if lbl == 0)
    simple_ratio = num_simple / (num_labeled + 1e-6)

    # Adjust sampling strategy based on how many "0" (simple) were labeled
    if simple_ratio < 0.5:
        # Balanced or complex leaning — random sample
        sample_pool = unlabeled_pool.sample(n=10, random_state=random.randint(0, 10000))
    else:
        # User thinks many words are simple — bias toward complex
        top_half = unlabeled_pool.sort_values(by='cefr_score', ascending=False)
        sample_pool = top_half.head(30).sample(n=10, random_state=random.randint(0, 10000))

    # Choose one word from the sample
    row = sample_pool.sample(1, random_state=random.randint(0, 10000)).iloc[0]
    word = row['headword']

    # Ask user to annotate
    while True:
        label = input(f"Label word {num_labeled+1}/30 — Enter 0 (simple) or 1 (complex) for '{word}': ")
        if label in ['0', '1']:
            test_word_labels[word] = int(label)
            # Remove word from pool
            unlabeled_pool = unlabeled_pool[unlabeled_pool['headword'] != word]
            break
        else:
            print("❌ Invalid input. Please enter 0 or 1.")

print("\n✅ Done! You labeled 30 words.")
print(test_word_labels)



✅ Done! You labeled 30 words.
{'disparaging': 1, 'lavatory': 1, 'craft': 0, 'conquer': 0, 'announcement': 0, 'wordsmith': 1, 'some': 0, 'moribund': 1, 'recipe': 0, 'remittance': 1, 'merchandise': 1, 'simplicity': 0, 'receptionist': 0, 'intrinsic': 0, 'materialize': 0, 'promotable': 1, 'munificence': 1, 'meteorological': 1, 'factor': 0, 'dissemination': 1, 'father': 0, 'meteorology': 1, 'corporation': 0, 'splinter': 0, 'reproach': 1, 'deviant': 1, 'operator': 0, 'choreography': 1, 'purposeful': 0, 'cosmic': 0}


In [12]:
train_df = train_df.drop_duplicates(subset='headword', keep='first').reset_index(drop=True)

In [30]:
unlabeled_features = unlabeled_pool.copy()
if 'is_complex' in unlabeled_features.columns:
    unlabeled_features = unlabeled_features.drop(columns=['is_complex'])

# Step 3 — filter for the labeled words
labeled_words_df = unlabeled_features[
    unlabeled_features['headword'].isin(test_word_labels.keys())
].copy()

# Step 4 — map the labels into a new column
labeled_words_df['is_complex'] = labeled_words_df['headword'].map(test_word_labels)

# This is now your training set
test_df = labeled_words_df.reset_index(drop=True)

# Optional: Show the result
print(test_df[['headword', 'wordnet_depth', 'freq', 'is_complex','len']].head(40))

print(len(test_df))

Empty DataFrame
Columns: [headword, wordnet_depth, freq, is_complex, len]
Index: []
0


In [31]:
test_df = test_df.drop_duplicates(subset='headword', keep='first').reset_index(drop=True)


<h1> Choose features <h1>

In [15]:
features =['freq','len','wordnet_depth']


$$
H(\mathbf{p}) = -\sum_{i=1}^{C} p_i \log(p_i + \varepsilon)
$$

<h1> Active Learning process <h1>

In [16]:
def initiate_active_learning(n, model, scaler, unlabeled_df, train_df, features):
    for _ in range(n):
        # Scale unlabeled data
        unlabeled_train_scaled = scaler.transform(unlabeled_df[features])
        class_ratio = train_df['is_complex'].mean()
        

        # Make predictions
        probas = model.predict_proba(unlabeled_train_scaled)
        
        # Select sample (most uncertain)
        uncertainties = -np.sum(probas * np.log(probas + 1e-10), axis=1)

        if class_ratio < 0.5:
            selected_idx =  np.argmax(probas[:, 1]) 
        else:    
         selected_idx = np.argsort(uncertainties)[-1]
         
        selected_index = unlabeled_df.index[selected_idx]
        selected_sample = unlabeled_df.loc[[selected_index]].copy()
        
        # Annotation
        while True:
            try:
                word = selected_sample['headword'].iloc[0]
                label = int(input(f"Enter 0 if simple, 1 if complex for '{word}': "))
                if label in [0, 1]:
                    break
                print("Please enter 0 or 1.")
            except ValueError:
                print("Invalid input. Enter 0 or 1.")
        
        # Update data
        selected_sample['is_complex'] = label
        unlabeled_df.drop(index=selected_index, inplace=True)
        train_df = pd.concat([train_df, selected_sample], ignore_index=True)
        
        # Retrain model
        X_train_scaled = scaler.transform(train_df[features])
        model = LogisticRegression().fit(X_train_scaled, train_df['is_complex'])
    
    return model, scaler, unlabeled_df, train_df

In [17]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_df[features])
model = LogisticRegression().fit(X_train_scaled, train_df['is_complex'])





model, scaler, unlabeled_df, train_df = initiate_active_learning(
    n=30,
    model=model,
    scaler=scaler,
    unlabeled_df=unlabeled_pool,
    train_df=train_df,
    features=features
)

<h1> Analyse the performance metrics <h1>

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.metrics import cohen_kappa_score

# Make predictions


# Calculate accuracy



In [21]:
def frequency_predictor(test_df, freq_threshold=4):
    """
    Predicts word complexity based on frequency:
    - If frequency > threshold → Simple (0)
    - Else → Complex (1)
    
    Args:
        test_df (pd.DataFrame): DataFrame containing 'freq' column.
        freq_threshold (int): Frequency cutoff (default=4).
    
    Returns:
        y_pred (list): Predicted labels (0 for simple, 1 for complex).
    """
    y_pred = (test_df['freq'] <= freq_threshold).astype(int).tolist()
    return y_pred


In [27]:

print(train_df[features])

    freq  len  wordnet_depth
0   4.78    9              7
1   3.44    8             11
2   4.93    6             13
3   4.44    5             13
4   3.68    9             10
..   ...  ...            ...
75  3.08    9              1
76  3.30    7              4
77  0.00    9              1
78  3.42    6              8
79  3.85    3              9

[80 rows x 3 columns]


In [33]:
print(test_df[features])

Empty DataFrame
Columns: [freq, len, wordnet_depth]
Index: []


In [32]:
y_f = frequency_predictor(test_df, freq_threshold=test_df['is_complex'].mean())
y_pred = model.predict(scaler.transform(test_df[features]))
y_true = test_df['is_complex']

personalized_accuracy = accuracy_score(y_true, y_pred)
personalized_confusion_matrix = confusion_matrix(y_true, y_pred)
personalized_f1 = f1_score(y_true, y_pred)
personalized_recall = recall_score(y_true, y_pred)
personalized_precision = precision_score(y_true, y_pred)
personalized_kappa = cohen_kappa_score(y_true, y_pred)


print(f"Model Evaluation Metrics:")
print(f"{'Accuracy:':<20} {personalized_accuracy:.2f}")
print(f"{'Confusion Matrix:':<20}\n{personalized_confusion_matrix}")
print(f"{'F1 Score:':<20} {personalized_f1:.2f}")
print(f"{'Recall Score:':<20} {personalized_recall:.2f}")
print(f"{'Precision Score:':<20} {personalized_precision:.2f}")
print(f"{'Kohen:':<20} {personalized_kappa:.2f}")

# Calculate frequency-based metrics
freq_accuracy = accuracy_score(y_true, y_f)
freq_confusion_matrix = confusion_matrix(y_true, y_f)
freq_f1 = f1_score(y_true, y_f)
freq_recall = recall_score(y_true, y_f)
freq_precision = precision_score(y_true, y_f)
freq_kappa = cohen_kappa_score(y_true, y_f)




# Print the metrics (same as before)
print(f"Model Evaluation Metrics for frequency:")
print(f"{'Accuracy:':<20} {freq_accuracy:.2f}")
print(f"{'Confusion Matrix:':<20}\n{freq_confusion_matrix}")
print(f"{'F1 Score:':<20} {freq_f1:.2f}")
print(f"{'Recall Score:':<20} {freq_recall:.2f}")
print(f"{'Precision Score:':<20} {freq_precision:.2f}")
print(f"{'Kohens Kappa:':<20} {freq_kappa:.2f}")





import pandas as pd
from datetime import datetime

# Sample data or replace with your actual data



# Define the columns for our metrics DataFrame
columns = [
    'timestamp', 
    'accuracy', 
    'f1_score', 
    'recall', 
    'precision', 
    'kappa', 
    'confusion_matrix', 
    'threshold_used'
]

# Try to load existing DataFrame or create new one
try:
    df = pd.read_csv('metrics_log.csv')  # Or read from your existing storage
except FileNotFoundError:
    df = pd.DataFrame(columns=columns)

# Create metrics dictionary
metrics = {
    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    'accuracy': (accuracy_score(y_true, y_pred), accuracy_score(y_true, y_f)),
    'f1_score': (f1_score(y_true, y_pred), f1_score(y_true, y_f)),
    'recall': (recall_score(y_true, y_pred), recall_score(y_true, y_f)),
    'precision': (precision_score(y_true, y_pred), precision_score(y_true, y_f)),
    'kappa': (cohen_kappa_score(y_true, y_pred), cohen_kappa_score(y_true, y_f)),
    'confusion_matrix': (
        str(confusion_matrix(y_true, y_pred)), 
        str(confusion_matrix(y_true, y_f))
    ),
    'threshold_used': test_df['is_complex'].mean()
}

# Append metrics (using modern pandas concat instead of deprecated append)
df = pd.concat([df, pd.DataFrame([metrics])], ignore_index=True)

# Save to CSV (or your preferred storage)
df.to_csv('metrics_log.csv', index=False)

print("Metrics successfully logged:")
print(df.tail())







ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required by StandardScaler.

In [None]:
print(y_f)
print(y_pred)

[0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1]
[0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 1]
