<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/mlflow/custom/Microsoft_phi3_128k_gguf_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phi3
https://github.com/microsoft/Phi-3CookBook

In [None]:
%pip install flash_attn einops timm mlflow pyngrok accelerate --quiet
! CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.8/25.8 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m29.5 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import userdata
import os
MLFLOW_TRACKING_URI="databricks"
# Specify the workspace hostname and token
DATABRICKS_HOST="https://dbc-4156d1ac-fdf5.cloud.databricks.com"
DATABRICKS_TOKEN=userdata.get('DATABRCKS_TTOKEN')

In [None]:
if "MLFLOW_TRACKING_URI" not in os.environ:
    os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_TRACKING_URI
if "DATABRICKS_HOST" not in os.environ:
    os.environ["DATABRICKS_HOST"] = DATABRICKS_HOST
if "DATABRICKS_TOKEN" not in os.environ:
    os.environ["DATABRICKS_TOKEN"] = DATABRICKS_TOKEN

In [62]:
# Disable tokenizers warnings when constructing pipelines
%env TOKENIZERS_PARALLELISM=false

import warnings

# Disable a few less-than-useful UserWarnings from setuptools and pydantic
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=Warning)

env: TOKENIZERS_PARALLELISM=false


In [64]:
import mlflow
import accelerate
import torch
import transformers
from huggingface_hub import snapshot_download
from llama_cpp import Llama

In [65]:
mlflow.__version__

'2.14.1'

In [66]:
# Download the  instruct model and tokenizer to a local directory cache
snapshot_location = "/content/drive/MyDrive/models/home_made/phi-3-mini-128k-instruct_q8_0.gguf"

In [67]:
class Phi3(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        """
        This method initializes the tokenizer and language model
        using the specified model snapshot directory.
        """
        # If you are running this in a system that has a sufficiently powerful GPU with available VRAM,
        # uncomment the configuration setting below to leverage triton.
        # Note that triton dramatically improves the inference speed performance

        #config.attn_config["attn_impl"] = "triton"

        self.model =Llama(
          model_path=context.artifacts["snapshot"],  # Download the model file first
          n_ctx=2048,  # The max sequence length to use - note that longer sequence lengths require much more resources
          n_threads=8,            # The number of CPU threads to use, tailor to your system and the resulting performance
          n_gpu_layers=100         # The number of layers to offload to GPU, if you have GPU acceleration available
        )


    def _build_prompt(self, instruction):
        """
        This method generates the prompt for the model.
        """
        chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'

        prompt = f'{chat_template.format(input=instruction)}'
        return prompt

    def predict(self, context, model_input, params=None):
        """
        This method generates prediction for the given input.
        """
        prompt = model_input["prompt"][0]
         # Retrieve or use default values for temperature and max_tokens
        temperature = params.get("temperature", 0.7) if params else 0.7
        max_tokens = params.get("max_tokens", 2048) if params else 2048

        output = self.model(
          f"{prompt}", # Prompt
          temperature = temperature, # Controls randomness in output
          max_tokens=max_tokens,  # Generate up to 512 tokens
          stop=["</s>"],   # Example stop token - not necessarily correct for this specific model! Please check before using.
          echo=False        # Whether to echo the prompt
        )
        generated_response = output['choices'][0]['text']
        return {"candidates": [generated_response]}

In [68]:
import llama_cpp

In [69]:
llama_cpp.__version__

'0.2.79'

In [None]:
import numpy as np
import pandas as pd

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types import ColSpec, DataType, ParamSchema, ParamSpec, Schema

# Define input and output schema
input_schema = Schema(
    [
        ColSpec(DataType.string, "prompt"),
    ]
)
output_schema = Schema([ColSpec(DataType.string, "candidates")])

parameters = ParamSchema(
    [
        ParamSpec("temperature", DataType.float, np.float32(0.1), None),
        ParamSpec("max_tokens", DataType.integer, np.int32(1000), None),
    ]
)

signature = ModelSignature(inputs=input_schema, outputs=output_schema, params=parameters)


# Define input example
input_example = pd.DataFrame({"prompt": ["What is Neo4J?"]})

In [70]:


mlflow.set_experiment(experiment_name="/Users/olonok@hotmail.com/phi3-instruct-128k-gguf")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/1584291425451977', creation_time=1719841728446, experiment_id='1584291425451977', last_update_time=1719843997937, lifecycle_stage='active', name='/Users/olonok@hotmail.com/phi3-instruct-128k-gguf', tags={'mlflow.experiment.sourceName': '/Users/olonok@hotmail.com/phi3-instruct-128k-gguf',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'olonok@hotmail.com',
 'mlflow.ownerId': '8086550068642416'}>

In [71]:
# Get the current base version of torch that is installed, without specific version modifiers
torch_version = torch.__version__.split("+")[0]

# Start an MLflow run context and log the PHi3 model wrapper along with the param-included signature to
# allow for overriding parameters at inference time
with mlflow.start_run():
    model_info = mlflow.pyfunc.log_model(
        "phi3-instruct-cpp",
        python_model=Phi3(),
        # NOTE: the artifacts dictionary mapping is critical! This dict is used by the load_context() method in our PHi3() class.
        artifacts={"snapshot": snapshot_location},
        pip_requirements=[
            f"torch=={torch_version}",
            f"transformers=={transformers.__version__}",
            f"accelerate=={accelerate.__version__}",
            f"llama-cpp-python=={llama_cpp.__version__}",

        ],
        input_example=input_example,
        signature=signature,
    )

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Uploading /tmp/tmph_136261/model/artifacts/phi-3-mini-128k-instruct_q8_0.gguf:   0%|          | 0.00/3.78G [00…



In [72]:
model_info.model_uri

'runs:/d0b9f021cc1b410696cb6dc0a55f4cc3/phi3-instruct-cpp'

In [73]:
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Downloading /tmp/tmpt3mdj7rj/phi3-instruct-cpp/artifacts/phi-3-mini-128k-instruct_q8_0.gguf:   0%|          | …

llama_model_loader: loaded meta data with 27 key-value pairs and 197 tensors from /tmp/tmpt3mdj7rj/phi3-instruct-cpp/artifacts/phi-3-mini-128k-instruct_q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.name str              = Phi3
llama_model_loader: - kv   2:                        phi3.context_length u32              = 131072
llama_model_loader: - kv   3:  phi3.rope.scaling.original_context_length u32              = 4096
llama_model_loader: - kv   4:                      phi3.embedding_length u32              = 3072
llama_model_loader: - kv   5:                   phi3.feed_forward_length u32              = 8192
llama_model_loader: - kv   6:                           phi3.block_count u32              = 32
llama_model_loader: - kv   7:     

In [74]:
text = """Crossrail link 'to get go-ahead' The £10bn Crossrail transport plan, backed by business groups, is to get the go-ahead this month, according to The Mail on Sunday.
It says the UK Treasury has allocated £7.5bn ($13.99bn) for the project and that talks with business groups on raising the rest will begin shortly.
The much delayed Crossrail Link Bill would provide for a fast cross-London rail link. The paper says it will go before the House of Commons on 23 February.
A second reading could follow on 16 or 17 March. We've always said we are going to introduce a hybrid Bill for Crossrail in the Spring and this remains the case,
the Department for Transport said on Sunday. Jeremy de Souza, a spokesman for Crossrail, said on Sunday he could not confirm whether the Treasury was planning to invest £7.5bn
or when the bill would go before Parliament. However, he said some impetus may have been provided by the proximity of an election.
The new line would go out as far as Maidenhead, Berkshire, to the west of London, and link Heathrow to Canary Wharf via the City.
Heathrow to the City would take 40 minutes, dramatically cutting journey times for business travellers, and reducing overcrowding on the tube.
The line has the support of the Mayor of London, Ken Livingstone, business groups and the government, but there have been three years of arguments over how it should be funded.
The Mail on Sunday's Financial Mail said the £7.5bn of Treasury money was earmarked for spending in £2.5bn instalments in 2010, 2011 and 2012."""

text2 = """           Celeste Barrios-Cruz
(312) 208-6505 | Celestebarrios35@gmail.com | LinkedIn | GitHub | Chicago, IL

PROFESSIONAL SUMMARY
●
Innovative thinker with extensive knowledge of SQL, experience utilizing Python, object oriented
programming: C++, front end knowledge of JavaScript
●
Excellent communication skills (English and Spanish) including teamwork and collaboration
●
Outstanding organization ability including problem-solving, and time management skills.

SKILLS
Programming Language: Python, C++, JavaScript, HTML5, CSS3, TypeScript
Web Technologies/Development Frameworks: NumPy, Pandas, MATLAB, Flask, jQuery, AJAX, JASON,
BootstrapUI, Angular7.0
Database: SQL, PostgreSQL, SQLite
Software: Microsoft Office (Word, Excel, PowerPoint), Google Developer Tools
Tools/Methodologies: Data Structures, Algorithms, GitHub, GIT, Heroku, Scrum, Agile Methodology, Agile
Software Development, Project Management, Anaconda, Jupyter Notebook, Visual Studio, Software Development,
Data Modeler, Tableau
Languages: Spanish (fluent conversational skills)

EDUCATION
University of Illinois at Chicago (UIC), Chicago, IL
Bachelor of Science in Math & Computer Science                                               December 2019

PROFESSIONAL EXPERIENCE
Empower Saturday School, Chicago IL


Co-Director of Technology

          November 2020-Present
●
Volunteer in a Non-Profit foundation to provide tutoring for underprivileged youth in Chicago implementing
WordPress for a student portal and website with upcoming donation features built-in.

Coding Temple, Chicago IL
Software Engineer                                                                                               May 2020-July 2020
●
Participated in intensive professional development experience in code production.
●
Collaborated with a team to utilize Flask to revamp a law firm’s website from a previous HTML/CSS draft
and then deployed the new website on Heroku from GitHub.
●
Created an Entity Relationship Diagram (ERD) using lucidchart.com to create a database; also used SQL
to export and import data between different data sources.
●
Utilized Object-Oriented Programming (OOP) concepts with Python to create a parking garage system.
●
Oversaw Full Web UI Development on 5+ projects using Angular 4 and above, AngularJS, JavaScript,
HTML, CSS, third party Angular frameworks, JQuery and JSON.
●
Used 5+ Python libraries and SQL queries/subqueries to create several datasets which produced statistics,
tables, figures, charts and graphs.
●
Completed case study problem sets using Python, NumPy, SciPy, Pandas packages in order to enhance
understanding of the functionality of each program and how to get concrete results.

PROJECTS
Avengers Phone Book


●
Created phone numbers for Avengers Phone Book using Flask and displayed them to your front page.
●
Designed a project so that characters could create,read, update their phone number from the phone book;
the project is hosted on Heroku.
Good Send



●
Collaborated with 2+ developers to design both administrator and client web portals using Python, Flask,
SQLite and multiple APIs; Utilized Github for version control and deployed the final product on Heroku.
●
Individually designed and deployed a SQLite database to allow organization to run a more secure,
organized, automated and efficient operation resulting in higher client satisfaction.
●
Incorporated Flask-Admin and Flask-Login to allow the administrator to view, create, update and delete.
MyMoviePoster

●
Linked Spotify playlist to movie poster using an API.
"""


classes = ["cv", "non-cv"]


In [75]:
import json
import datetime

In [76]:
prompt_1 =  f"""you are an expert document classifier
Classify the following text using these {len(classes)} classes: {classes}
Only use the labels provided: {classes}

Confidence score: float of 0-1.
for example:
0 means you are completely sure that the document does not belongs to class x
1 means you are completely sure that the document belong to class x
output: only respond with a json with these 2 attrubutes 'label' = class predicted, 'score': float

#begin text
{text2}.
#end text
"""

In [85]:
time1=  datetime.datetime.now()
response = loaded_model.predict(pd.DataFrame(
    {"prompt": [prompt_1]}), params={"temperature": 0.8, "max_tokens": 750}
)
time2=  datetime.datetime.now()
print(time2-time1)

Llama.generate: prefix-match hit

llama_print_timings:        load time =     170.40 ms
llama_print_timings:      sample time =     338.46 ms /   570 runs   (    0.59 ms per token,  1684.09 tokens per second)
llama_print_timings: prompt eval time =     386.73 ms /   973 tokens (    0.40 ms per token,  2515.94 tokens per second)
llama_print_timings:        eval time =   12133.76 ms /   569 runs   (   21.32 ms per token,    46.89 tokens per second)
llama_print_timings:       total time =   13384.63 ms /  1542 tokens


0:00:13.399654


In [78]:
print(response['candidates'][0])

===
To classify the provided text into one of two classes—'cv' for a curriculum vitae (CV) or 'non-cv' for not a CV—we need to analyze the content and structure of the text. The given text appears to be an extensive professional summary, including personal details, education, skills, work experiences, projects, and other relevant information typically found in a curriculum vitae.

Key elements indicating this is indeed a CV include:
- Personal contact information (email, phone number, social profiles).
- A detailed professional summary highlighting skills and experience.
- Educational background with specific details about degrees and completion dates.
- Clearly stated professional experiences with responsibilities and achievements.
- Mention of projects that align with the individual's skill set and work history.
- List of programming languages, tools, methodologies, software development experience, etc., which are common components of a CV under skills or technical expertise sections

In [79]:
prompt_2 = f"""you are an expert document classifier
Classify the following text using these {len(classes)} classes: {classes}
Only use the labels provided: {classes}

Confidence score: float of 0-1.
for example:
0 means you are completely sure that the document does not belongs to class x
1 means you are completely sure that the document belong to class x
output: only respond with a json with these 2 attrubutes 'label' = class predicted, 'score': float

#begin text
{text}.
#end text
"""

In [83]:
time1=  datetime.datetime.now()
response = loaded_model.predict(pd.DataFrame(
    {"prompt": [prompt_2]}), params={"temperature": 0.7}
)
time2=  datetime.datetime.now()
print(time2-time1)

Llama.generate: prefix-match hit

llama_print_timings:        load time =     170.40 ms
llama_print_timings:      sample time =     167.80 ms /   282 runs   (    0.60 ms per token,  1680.60 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)
llama_print_timings:        eval time =    5690.66 ms /   282 runs   (   20.18 ms per token,    49.55 tokens per second)
llama_print_timings:       total time =    6058.10 ms /   282 tokens


0:00:06.071007


In [84]:
print(response['candidates'][0])

===
The provided text discusses a significant infrastructure project related to transportation within London. Specifically, it details the Crossrail link's progress towards approval, its financial aspects as reported by The Mail on Sunday, and its implications for business travelers and overcrowding issues in the city. Given that this content is centered around an important transport infrastructure project, which falls under topics related to news articles, reports, or informational texts about such projects, it can be classified as "cv" (corporate/business).

Therefore, based on the analysis of the text's subject matter and aligning with the classification options provided, the output should reflect this categorization. Additionally, while a confidence score isn't explicitly calculated here due to the nature of manual evaluation in this context, it would typically be derived from how definitively the content matches one class over another based on its relevance to that category.

Outp