<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/mlflow/custom/Microsoft_phi3_128k_gguf_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

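# Phi3
This notebook wraps a GGUF build of Phi-3 Mini 128k Instruct as a custom MLflow `pyfunc` model (served through `llama-cpp-python`), logs it to an MLflow tracking server, and uses the loaded model to classify sample documents.

Reference: [Microsoft Phi-3 Cookbook](https://github.com/microsoft/Phi-3CookBook)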

In [None]:
# Install the Python packages used by this notebook into the kernel's environment.
%pip install flash_attn einops timm mlflow pyngrok accelerate --quiet
# Install llama-cpp-python with CMAKE_ARGS requesting the LLAMA_CUBLAS build option.
# NOTE(review): presumably this is what enables GPU offload for n_gpu_layers; newer
# llama.cpp releases renamed this option — confirm against the llama-cpp-python version in use.
! CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python --quiet

In [None]:
# Mount Google Drive at /content/drive; the model file path used later in this notebook
# points below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from google.colab import userdata
import os
# Value exported as the MLFLOW_TRACKING_URI environment variable in the next cell.
MLFLOW_TRACKING_URI="databricks"
# Specify the workspace hostname and token
DATABRICKS_HOST="https://dbc-4156d1ac-fdf5.cloud.databricks.com"
# The token is fetched through google.colab's userdata helper instead of being hard-coded.
# NOTE(review): the secret name 'DATABRCKS_TTOKEN' looks misspelled — confirm it matches
# the name under which the secret is actually stored before changing it.
DATABRICKS_TOKEN=userdata.get('DATABRCKS_TTOKEN')

In [None]:
# Export the MLflow / Databricks settings as environment variables.
# A variable that is already present in the environment is left untouched.
for env_name, env_value in {
    "MLFLOW_TRACKING_URI": MLFLOW_TRACKING_URI,
    "DATABRICKS_HOST": DATABRICKS_HOST,
    "DATABRICKS_TOKEN": DATABRICKS_TOKEN,
}.items():
    if env_name not in os.environ:
        os.environ[env_name] = env_value

In [None]:
# Disable tokenizers warnings when constructing pipelines
%env TOKENIZERS_PARALLELISM=false

import warnings

# Disable UserWarnings (e.g. the less-than-useful ones from setuptools and pydantic)
warnings.filterwarnings("ignore", category=UserWarning)
# NOTE: Warning is the base class of every warning category, so this filter hides ALL
# warnings (DeprecationWarning, FutureWarning, ...) and makes the UserWarning filter
# above redundant. Drop this line if only UserWarnings are meant to be silenced.
warnings.filterwarnings("ignore", category=Warning)

In [None]:
import mlflow
import accelerate
import torch
import transformers
from huggingface_hub import snapshot_download
from llama_cpp import Llama

In [None]:
# Display the installed MLflow version.
mlflow.__version__

In [None]:
# Path of the GGUF model file (q8_0 per its file name) on the mounted Google Drive.
# Nothing is downloaded here: the file must already exist at this location. The path is
# handed to MLflow as the "snapshot" artifact when the model is logged.
snapshot_location = "/content/drive/MyDrive/models/home_made/phi-3-mini-128k-instruct_q8_0.gguf"

In [None]:
class Phi3(mlflow.pyfunc.PythonModel):
    """
    MLflow pyfunc wrapper that serves a Phi-3 instruct GGUF file through llama-cpp-python.

    The GGUF file is supplied as the ``snapshot`` artifact when the model is logged.
    """

    def load_context(self, context):
        """
        Load the GGUF model referenced by the ``snapshot`` artifact.

        Args:
            context: MLflow model context; ``context.artifacts["snapshot"]`` must be
                the local path of the GGUF model file.
        """
        self.model = Llama(
            model_path=context.artifacts["snapshot"],  # GGUF file logged with the model
            n_ctx=2048,  # The max sequence length to use - note that longer sequence lengths require much more resources
            n_threads=8,  # The number of CPU threads to use, tailor to your system and the resulting performance
            n_gpu_layers=100,  # The number of layers to offload to GPU, if you have GPU acceleration available
        )

    def _build_prompt(self, instruction):
        """
        Wrap a plain instruction in the Phi-3 single-turn chat format.

        Args:
            instruction: The user's instruction text.

        Returns:
            The instruction enclosed in ``<|user|>`` ... ``<|end|>`` followed by the
            ``<|assistant|>`` tag that cues the model to answer.
        """
        chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'
        return chat_template.format(input=instruction)

    def predict(self, context, model_input, params=None):
        """
        Generate one completion for every prompt in ``model_input``.

        Args:
            context: MLflow model context (unused here).
            model_input: Mapping-like input with a ``"prompt"`` entry, e.g. a pandas
                DataFrame with a ``prompt`` column; a single string is also accepted.
            params: Optional dict of inference parameters. ``temperature`` defaults to
                0.7 and ``max_tokens`` to 2048 when absent.

        Returns:
            ``{"candidates": [...]}`` with one generated text per input prompt, in
            input order. A single-row input therefore yields a one-element list.
        """
        # Retrieve or use default values for temperature and max_tokens
        params = params or {}
        temperature = params.get("temperature", 0.7)
        max_tokens = params.get("max_tokens", 2048)

        prompts = model_input["prompt"]
        if isinstance(prompts, str):
            prompts = [prompts]

        candidates = []
        # Iterate over the values rather than indexing with [0]: this handles every row
        # and does not depend on the DataFrame index starting at label 0.
        for raw_prompt in prompts:
            prompt = str(raw_prompt)
            # Instruct checkpoints expect the chat format; only wrap prompts that the
            # caller has not already formatted with Phi-3 role tags.
            if not prompt.lstrip().startswith(("<|user|>", "<|system|>")):
                prompt = self._build_prompt(prompt)

            output = self.model(
                prompt,
                temperature=temperature,  # Controls randomness in output
                max_tokens=max_tokens,  # Upper bound on the number of generated tokens
                # "<|end|>" terminates a turn in the Phi-3 chat format; "</s>" is kept
                # so existing behaviour is not narrowed.
                stop=["<|end|>", "</s>"],
                echo=False,  # Do not repeat the prompt in the returned text
            )
            candidates.append(output['choices'][0]['text'])
        return {"candidates": candidates}

In [None]:
import llama_cpp

In [None]:
# Display the installed llama-cpp-python version; the same value is pinned in the
# pip requirements when the model is logged.
llama_cpp.__version__

In [None]:
import numpy as np
import pandas as pd

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types import ColSpec, DataType, ParamSchema, ParamSpec, Schema

# Model signature: a single string column in, a single string column out.
input_schema = Schema([ColSpec(DataType.string, "prompt")])
output_schema = Schema([ColSpec(DataType.string, "candidates")])

# Inference-time parameters and their default values; the trailing None is the
# shape argument of ParamSpec.
temperature_param = ParamSpec("temperature", DataType.float, np.float32(0.1), None)
max_tokens_param = ParamSpec("max_tokens", DataType.integer, np.int32(1000), None)
parameters = ParamSchema([temperature_param, max_tokens_param])

signature = ModelSignature(
    inputs=input_schema,
    outputs=output_schema,
    params=parameters,
)

# Example payload that is logged together with the model
input_example = pd.DataFrame({"prompt": ["What is Neo4J?"]})

In [None]:


# Select the MLflow experiment that the run below is recorded under.
# NOTE(review): the experiment path sits in one specific user's folder — adjust it when
# running under a different account.
mlflow.set_experiment(experiment_name="/Users/olonok@hotmail.com/phi3-instruct-128k-gguf")

In [None]:
# Installed torch version with any local build suffix (the part after "+") removed
torch_version = torch.__version__.partition("+")[0]

# Packages pinned in the logged model's environment, at the versions installed in this kernel
pinned_versions = {
    "torch": torch_version,
    "transformers": transformers.__version__,
    "accelerate": accelerate.__version__,
    "llama-cpp-python": llama_cpp.__version__,
}
pinned_requirements = [f"{package}=={version}" for package, version in pinned_versions.items()]

# NOTE: the "snapshot" key is critical — Phi3.load_context() looks the model file up under it.
model_artifacts = {"snapshot": snapshot_location}

# Log the Phi3 wrapper inside an MLflow run; the signature carries the params schema,
# which allows overriding parameters at inference time.
with mlflow.start_run():
    model_info = mlflow.pyfunc.log_model(
        "phi3-instruct-cpp",
        python_model=Phi3(),
        artifacts=model_artifacts,
        pip_requirements=pinned_requirements,
        input_example=input_example,
        signature=signature,
    )

In [None]:
# Display the URI of the model that was just logged.
model_info.model_uri

In [None]:
# Load the logged model back through the generic pyfunc interface.
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

In [None]:
text = """Crossrail link 'to get go-ahead' The £10bn Crossrail transport plan, backed by business groups, is to get the go-ahead this month, according to The Mail on Sunday.
It says the UK Treasury has allocated £7.5bn ($13.99bn) for the project and that talks with business groups on raising the rest will begin shortly.
The much delayed Crossrail Link Bill would provide for a fast cross-London rail link. The paper says it will go before the House of Commons on 23 February.
A second reading could follow on 16 or 17 March. We've always said we are going to introduce a hybrid Bill for Crossrail in the Spring and this remains the case,
the Department for Transport said on Sunday. Jeremy de Souza, a spokesman for Crossrail, said on Sunday he could not confirm whether the Treasury was planning to invest £7.5bn
or when the bill would go before Parliament. However, he said some impetus may have been provided by the proximity of an election.
The new line would go out as far as Maidenhead, Berkshire, to the west of London, and link Heathrow to Canary Wharf via the City.
Heathrow to the City would take 40 minutes, dramatically cutting journey times for business travellers, and reducing overcrowding on the tube.
The line has the support of the Mayor of London, Ken Livingstone, business groups and the government, but there have been three years of arguments over how it should be funded.
The Mail on Sunday's Financial Mail said the £7.5bn of Treasury money was earmarked for spending in £2.5bn instalments in 2010, 2011 and 2012."""

text2 = """           Celeste Barrios-Cruz
(312) 208-6505 | Celestebarrios35@gmail.com | LinkedIn | GitHub | Chicago, IL

PROFESSIONAL SUMMARY
●
Innovative thinker with extensive knowledge of SQL, experience utilizing Python, object oriented
programming: C++, front end knowledge of JavaScript
●
Excellent communication skills (English and Spanish) including teamwork and collaboration
●
Outstanding organization ability including problem-solving, and time management skills.

SKILLS
Programming Language: Python, C++, JavaScript, HTML5, CSS3, TypeScript
Web Technologies/Development Frameworks: NumPy, Pandas, MATLAB, Flask, jQuery, AJAX, JASON,
BootstrapUI, Angular7.0
Database: SQL, PostgreSQL, SQLite
Software: Microsoft Office (Word, Excel, PowerPoint), Google Developer Tools
Tools/Methodologies: Data Structures, Algorithms, GitHub, GIT, Heroku, Scrum, Agile Methodology, Agile
Software Development, Project Management, Anaconda, Jupyter Notebook, Visual Studio, Software Development,
Data Modeler, Tableau
Languages: Spanish (fluent conversational skills)

EDUCATION
University of Illinois at Chicago (UIC), Chicago, IL
Bachelor of Science in Math & Computer Science                                               December 2019

PROFESSIONAL EXPERIENCE
Empower Saturday School, Chicago IL


Co-Director of Technology

          November 2020-Present
●
Volunteer in a Non-Profit foundation to provide tutoring for underprivileged youth in Chicago implementing
WordPress for a student portal and website with upcoming donation features built-in.

Coding Temple, Chicago IL
Software Engineer                                                                                               May 2020-July 2020
●
Participated in intensive professional development experience in code production.
●
Collaborated with a team to utilize Flask to revamp a law firm’s website from a previous HTML/CSS draft
and then deployed the new website on Heroku from GitHub.
●
Created an Entity Relationship Diagram (ERD) using lucidchart.com to create a database; also used SQL
to export and import data between different data sources.
●
Utilized Object-Oriented Programming (OOP) concepts with Python to create a parking garage system.
●
Oversaw Full Web UI Development on 5+ projects using Angular 4 and above, AngularJS, JavaScript,
HTML, CSS, third party Angular frameworks, JQuery and JSON.
●
Used 5+ Python libraries and SQL queries/subqueries to create several datasets which produced statistics,
tables, figures, charts and graphs.
●
Completed case study problem sets using Python, NumPy, SciPy, Pandas packages in order to enhance
understanding of the functionality of each program and how to get concrete results.

PROJECTS
Avengers Phone Book


●
Created phone numbers for Avengers Phone Book using Flask and displayed them to your front page.
●
Designed a project so that characters could create,read, update their phone number from the phone book;
the project is hosted on Heroku.
Good Send



●
Collaborated with 2+ developers to design both administrator and client web portals using Python, Flask,
SQLite and multiple APIs; Utilized Github for version control and deployed the final product on Heroku.
●
Individually designed and deployed a SQLite database to allow organization to run a more secure,
organized, automated and efficient operation resulting in higher client satisfaction.
●
Incorporated Flask-Admin and Flask-Login to allow the administrator to view, create, update and delete.
MyMoviePoster

●
Linked Spotify playlist to movie poster using an API.
"""


# Labels that the classification prompts ask the model to choose between.
classes = ["cv", "non-cv"]


In [None]:
import json
import datetime

In [None]:
# Classification prompt for the CV sample (text2). The instruction wording is sent to the
# model verbatim, so it is kept free of spelling and grammar slips.
prompt_1 = f"""you are an expert document classifier
Classify the following text using these {len(classes)} classes: {classes}
Only use the labels provided: {classes}

Confidence score: float of 0-1.
for example:
0 means you are completely sure that the document does not belong to class x
1 means you are completely sure that the document belongs to class x
output: only respond with a json with these 2 attributes 'label' = class predicted, 'score': float

#begin text
{text2}.
#end text
"""

In [None]:
# Time a single prediction for prompt_1, overriding both inference parameters.
time1 = datetime.datetime.now()
response = loaded_model.predict(
    pd.DataFrame({"prompt": [prompt_1]}),
    params={"temperature": 0.8, "max_tokens": 750},
)
time2 = datetime.datetime.now()
# Elapsed wall-clock time of the call
print(time2 - time1)

In [None]:
# Show the text of the first returned candidate.
print(response['candidates'][0])

In [None]:
# Classification prompt for the news-article sample (text). The instruction wording is
# sent to the model verbatim, so it is kept free of spelling and grammar slips.
prompt_2 = f"""you are an expert document classifier
Classify the following text using these {len(classes)} classes: {classes}
Only use the labels provided: {classes}

Confidence score: float of 0-1.
for example:
0 means you are completely sure that the document does not belong to class x
1 means you are completely sure that the document belongs to class x
output: only respond with a json with these 2 attributes 'label' = class predicted, 'score': float

#begin text
{text}.
#end text
"""

In [None]:
# Time a single prediction for prompt_2; only the temperature is overridden here.
time1 = datetime.datetime.now()
response = loaded_model.predict(
    pd.DataFrame({"prompt": [prompt_2]}),
    params={"temperature": 0.7},
)
time2 = datetime.datetime.now()
# Elapsed wall-clock time of the call
print(time2 - time1)

In [None]:
# Show the text of the first returned candidate.
print(response['candidates'][0])