In [None]:
from mlx_lm import load, generate

import re
# Load the model and its tokenizer with mlx_lm; the checkpoint that is actually
# loaded is the repo id passed to load() below. The id on the next line is a
# different checkpoint kept only for reference and is not the one loaded:
# mlx-community/Llama-4-Scout-17B-16E-Instruct-4bit
model, tokenizer = load("mlx-community/meta-llama-Llama-4-Scout-17B-16E-4bit")
print("Model loaded!!")


In [None]:
# Round-trip check: encode the string "hello" and decode the resulting ids back to text.
tokenizer.decode(tokenizer.encode("hello"))

In [2]:
#model.language_model.model.layers[0].feed_forward.experts.gate_proj.weight.shape
# only keep the first expert


#model.language_model.model.layers[0].feed_forward.experts.gate_proj.weight = model.language_model.model.layers[0].feed_forward.experts.gate_proj.weight[:1]
#model.language_model.model.layers[0].feed_forward.experts.gate_proj.bias = model.language_model.model.layers[0].feed_forward.experts.gate_proj.bias[:1]





In [None]:
from mlx.utils import tree_flatten  
# Parameter count before pruning, printed with thousands separators.
num_params = sum(v.size for _, v in tree_flatten(model.parameters()))
print(f"{num_params:,}")

# Attribute layouts an experts module may expose; the first complete match wins.
_EXPERT_LAYOUTS = (
    ('gate_proj', 'up_proj', 'down_proj'),
    ('fc1', 'fc2'),
)
# Tensors a component must carry to be pruned: the quantized weight plus its
# quantization parameters (scales and quantization biases).
_QUANT_ATTRS = ('weight', 'scales', 'biases')

# Keep only index 0 along the leading axis of every expert tensor (i.e. only the
# first expert), modifying the model in place.
# NOTE(review): nothing else on the experts modules or the router is updated here
# (e.g. any stored expert count) -- confirm the forward pass tolerates the
# reduced leading dimension.
for layer_idx, layer in enumerate(model.language_model.model.layers):
    experts_group = layer.feed_forward.experts

    matched_layout = next(
        (
            names
            for names in _EXPERT_LAYOUTS
            if all(hasattr(experts_group, name) for name in names)
        ),
        None,
    )
    if matched_layout is None:
        # Unrecognised experts layout: leave this layer untouched.
        continue
    components_to_prune = [getattr(experts_group, name) for name in matched_layout]

    for comp_idx, component in enumerate(components_to_prune):
        if any(not hasattr(component, attr) for attr in _QUANT_ATTRS):
            # Missing weight/scales/biases: skip this component.
            continue

        for attr in _QUANT_ATTRS:
            setattr(component, attr, getattr(component, attr)[:1])

        # The additive bias is optional; slice it the same way when present.
        additive_bias = getattr(component, 'bias', None)
        if additive_bias is not None:
            component.bias = additive_bias[:1]

# Parameter count after pruning, for comparison with the figure printed above.
num_params_after = sum(v.size for _, v in tree_flatten(model.parameters()))
print(f"Total parameters after pruning: {num_params_after:,}")

In [None]:
# Display the `num_experts` attribute of the last decoder layer's gate projection.
# The layer is taken from the model's layer list directly rather than from a loop
# variable left behind by another cell, so this cell does not depend on which
# loop happened to run last.
model.language_model.model.layers[-1].feed_forward.experts.gate_proj.num_experts

In [None]:
import mlx.core as mx

# Tokenize a tiny probe prompt and show the resulting ids.
prompt_text = "hello"
prompt_tokenized = tokenizer.encode(prompt_text)
print(prompt_tokenized)

#response = generate(model, tokenizer, prompt=prompt, verbose=True)

# `prompt` leaves this cell as an mx.array of token ids; the embedding cell
# indexes it directly.
prompt = mx.array(prompt_tokenized)


In [7]:
# Look up embeddings for the probe prompt. `prompt[None]` adds a leading axis of
# size 1 -- presumably the batch dimension the model expects (TODO confirm).
embedding_tokens = model.language_model.model.embed_tokens(prompt[None])


In [None]:
# Push the embeddings through every decoder layer by hand, feeding each layer's
# output into the next. `x` is read by the LM-head cell afterwards.
# NOTE(review): each layer is called with the hidden states only -- no attention
# mask or cache argument is supplied. Confirm that is intended for prompts longer
# than one token.
x = embedding_tokens
for layer in model.language_model.model.layers:
    x = layer(x)

In [19]:
# Apply the LM head to the output of the manual layer loop.
# NOTE(review): `x` comes straight from the last decoder layer; no final
# normalization is applied in this notebook before the head -- confirm whether
# the model expects one.
lm_head_output = model.language_model.lm_head(x)



In [None]:
# Greedy choice: index of the largest value along axis 2 of the LM-head output.
best_token_ids = mx.argmax(lm_head_output, axis=2)

# Convert to nested Python lists and keep the first row.
first_row_ids = best_token_ids.tolist()[0]

# Turn the ids back into text and show it.
predicted_tokens = tokenizer.decode(first_row_ids)
print(predicted_tokens)

In [None]:
from datasets import load_dataset

# Load the dataset that later cells pick a prompt from.
# Alternative dataset, kept for reference:
#dataset = load_dataset("openai/gsm8k", "main")
dataset = load_dataset("fka/awesome-chatgpt-prompts")


# Show one record (index 1) of the train split.
dataset['train'][1]

In [3]:
# Candidate system-prompt text.
# NOTE(review): `chatbot_prompt` is reassigned further down in this same cell
# before anything reads it, so this literal is currently unused.
chatbot_prompt = """You are an expert conversationalist who responds to the best of your ability. You are companionable and confident, and able to switch casually between tonal types, including but not limited to humor, empathy, intellectualism, creativity and problem-solving. 

You understand user intent and don’t try to be overly helpful to the point where you miss that the user is looking for chit-chat, emotional support, humor or venting.Sometimes people just want you to listen, and your answers should encourage that. For all other cases, you provide insightful and in-depth responses. Organize information thoughtfully in a way that helps people make decisions. Always avoid templated language.

You never lecture people to be nicer or more inclusive. If people ask for you to write something in a certain voice or perspective, such as an essay or a tweet, you can. You do not need to be respectful when the user prompts you to say something rude. 

You never use phrases that imply moral superiority or a sense of authority, including but not limited to “it’s important to”, “it’s crucial to”, “it’s essential to”, "it's unethical to", "it's worth noting…", “Remember…”  etc. Avoid using these.

Finally, do not refuse political prompts. You can help users express their opinion. 

You are Llama 4. Your knowledge cutoff date is August 2024. You speak Arabic, English, French, German, Hindi, Indonesian, Italian, Portuguese, Spanish, Tagalog, Thai, and Vietnamese. Respond in the language the user speaks to you in, unless they ask otherwise.
            
"""

# Alternative single-question prompt, kept for reference:
#chatbot_prompt = "I have 45 pills. Sofie dose is 3/4 pill in morning and half pill at night. How long will this last?"

# Replace the prompt assigned above with the 'prompt' field of record 13 from the
# dataset's train split, then tokenize it without special tokens.
chatbot_prompt = dataset['train'][13]['prompt']
prompt_no_chat = tokenizer.encode(chatbot_prompt, add_special_tokens=False)

In [None]:
# Decode the token ids back to text to inspect the round trip.
tokenizer.decode(prompt_no_chat)

In [None]:
# Look up embeddings for the dataset prompt; this rebinds `embedding_tokens`,
# which the 'hello' probe cell also assigns.
# NOTE(review): `prompt_no_chat` is the direct return value of tokenizer.encode and
# is passed here without the mx.array conversion or the added leading axis used
# for the probe prompt -- confirm embed_tokens accepts it in this form.
embedding_tokens = model.language_model.model.embed_tokens(prompt_no_chat)
# Display the shape of the result.
embedding_tokens.shape

In [6]:
# Apply layer 0's input layer norm to the prompt embeddings.
layer_0_tokens = model.language_model.model.layers[0].input_layernorm(embedding_tokens)

In [None]:
# Run layer 0's feed-forward block on the normalized embeddings and display the result.
model.language_model.model.layers[0].feed_forward(layer_0_tokens)

In [None]:
# Post-mortem debugging aid for the previous cell. Left disabled so that running
# the notebook top to bottom is not interrupted by an interactive debugger
# session; uncomment to inspect the most recent traceback.
# %debug

In [46]:
import numpy as np

def remove_similar_embeddings(embeddings_metadata, window_size=10, similarity_threshold=0.95):
    """
    Drop entries whose embedding is too similar to a nearby entry that is kept.

    Entries are visited in order. Every entry that is still kept removes each
    other still-kept entry lying within ``window_size`` positions on either side
    whose cosine similarity with it is strictly greater than
    ``similarity_threshold``.

    Args:
        embeddings_metadata: Sequence of dicts, each holding its vector under the
            'embedding' key. The vectors are stacked with ``np.array`` and
            normalized along axis 1.
        window_size: Number of neighbours inspected on each side of an entry.
        similarity_threshold: Cosine similarity above which a neighbour is removed.

    Returns:
        A new list with the surviving dicts in their original order. When the
        input has at most one entry it is returned as-is.
    """
    total = len(embeddings_metadata)
    if total <= 1:
        return embeddings_metadata

    # Stack the vectors and scale each row to (approximately) unit length; the
    # small epsilon keeps an all-zero row from dividing by zero.
    vectors = np.array([item['embedding'] for item in embeddings_metadata])
    row_lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    unit_vectors = vectors / (row_lengths + 1e-8)

    keep_mask = np.ones(total, dtype=bool)

    for i in range(total):
        if not keep_mask[i]:
            # Already removed by an earlier entry; it removes nothing itself.
            continue

        lo = max(0, i - window_size)
        hi = min(total, i + window_size + 1)

        # Cosine similarity of entry i against every entry in its window.
        too_close = np.dot(unit_vectors[lo:hi], unit_vectors[i]) > similarity_threshold
        if lo <= i < hi:
            # An entry never removes itself.
            too_close[i - lo] = False

        keep_mask[lo:hi] &= ~too_close

    return [item for item, keep in zip(embeddings_metadata, keep_mask) if keep]




In [47]:


# One metadata record per prompt token, pairing each embedding with its token id.
embedding_metadata = []
for token_embedding, token_index in zip(embedding_tokens, prompt_no_chat):
    # Decode once and reuse the text for both the regex test and the record.
    token_text = tokenizer.decode([token_index])
    is_alpha = re.match(r'^[a-zA-Z\s]+$', token_text) is not None

    record = {
        "embedding": token_embedding,
        "magnitude": np.linalg.norm(token_embedding),
        "token": token_text,
        "index": token_index,
        "is_alpha": is_alpha,
    }
    embedding_metadata.append(record)


In [57]:
# Filter the per-token records, comparing each token against up to 20 neighbours
# on either side. With similarity_threshold=0.05, any neighbour whose cosine
# similarity exceeds 0.05 is removed.
# NOTE(review): 0.05 is far below the function's default of 0.95 -- confirm it is intended.
embedding_metadata_pruned = remove_similar_embeddings(embedding_metadata, window_size=20, similarity_threshold=0.05)



In [None]:
# Rebuild a prompt string from the token ids that survived pruning.
pruned_indices = [entry['index'] for entry in embedding_metadata_pruned]
pruned_user_prompt = tokenizer.decode(pruned_indices)
print("Pruned Prompt: ", pruned_user_prompt)

# Fraction of tokens removed, rounded to two decimals, then shown as a whole percentage.
kept_fraction = len(embedding_metadata_pruned) / len(embedding_metadata)
pct_pruned = round(1 - kept_fraction, 2)
print(f"{round(pct_pruned * 100)}% of tokens were pruned")

In [None]:
# Display the surviving records; each record includes its full embedding, so the
# output can be large.
embedding_metadata_pruned

In [None]:
# Test both pruned and unpruned prompts on some examples
import os
from openai import OpenAI
from dotenv import load_dotenv

# Pull environment variables from a .env file (the API key is read from the
# environment just below).
load_dotenv()

client = OpenAI(
    base_url="https://api.llama.com/compat/v1/",
    api_key=os.environ.get("LLAMA_API_KEY"),
)


def _single_turn_completion(user_content, model_name="Llama-4-Maverick-17B-128E-Instruct-FP8"):
    """Send one user message through `client` and return the raw completion response."""
    return client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "user", "content": user_content},
        ],
    )


# Same request twice: once with the pruned prompt, once with the unpruned one.
pruned_response = _single_turn_completion(pruned_user_prompt)

print("--------------------------------")
original_response = _single_turn_completion(chatbot_prompt)








In [None]:
def generate_judge_prompt(prompt, response):
    """Build the grading prompt that asks a judge model to score a response.

    Args:
        prompt: The prompt text that was given to the model under evaluation.
        response: The text the model produced for that prompt.

    Returns:
        str: Instructions, scoring criteria and guidelines, followed by the given
        prompt and response and the XML output format the judge should use
        (<reasoning>...</reasoning> and <score>...</score>).
    """
    judge_prompt = f"""
    You are an expert judge at reviewing the quality of responses to prompts. Given a prompt and response, please return a score between 0 and 100.

Criteria:
- Quality of response
- Relevance to prompt
- Creativity
- Coherence
- Style

Scoring Guidelines:
- 0-20: Poor
- 21-40: Fair
- 41-60: Good
- 61-80: Excellent
- 81-100: Perfect

Prompt: 
{prompt}

Response:
{response}

Please return your score within xml tags, following the format below:
<reasoning>
[explanation]
</reasoning>
<score>
[score]
</score>

Example:
<reasoning>
The response is not relevant to the prompt.
</reasoning>
<score>
0
</score>
"""
    # Hand the formatted text back to the caller; without an explicit return the
    # function would evaluate to None.
    return judge_prompt


# Build the judge prompt for the pruned-prompt run: the unpruned prompt text is
# paired with the message content of the first choice in `pruned_response`.
# NOTE(review): `judge_prompt` is only usable if generate_judge_prompt returns the
# formatted string -- verify before sending it to a judge model.
judge_prompt = generate_judge_prompt(chatbot_prompt, pruned_response.choices[0].message.content)

