# Get Data

This notebook fetches the latest data for Gradio fine-tuning from various sources.

In [1]:
import os

In [2]:
# Remember the directory the notebook was started from; later cells call
# os.chdir and build their data paths relative to this location.
this_dir = os.getcwd()

In [3]:
import shutil

In [8]:
import requests
from urllib.parse import urljoin

In [5]:
# Optional HTTP headers handed to the GitHub API requests further down.
# None sends those requests without an Authorization header.
headers = None

In [38]:
# headers = {
#     "Authorization": f"Bearer {'your_token_here'}"
# }
# I got rate limited, so I had to get an auth token at https://github.com/settings/tokens, but you should be OK to run once without this.

# get latest gradio repo python code
For this I am using the [github2file](https://github.com/cognitivecomputations/github2file) repo from the inspirational ehartford.

In [2]:
# Switch into the github2file checkout so its script can be run from there.
os.chdir(this_dir+"/../../github2file/") # assumes you cloned that repo next to this one

In [3]:
# Run github2file against the gradio repository; per the recorded output it
# saves the combined Python source to gradio_python.txt.
!python github2file.py https://github.com/gradio-app/gradio

https://github.com/gradio-app/gradio/archive/refs/heads/master.zip
Combined Python source code saved to gradio_python.txt


In [5]:
# Copy the combined source file from the current directory into this
# project's data/latest-repo folder under a descriptive name.
shutil.copy("gradio_python.txt", this_dir + "/../data/latest-repo/gradio_repo_one_file.txt")

'../gradio-fine-tuning/data/latest-repo/gradio_repo_one_file.txt'

# get latest gradio repo docs / guides

In [4]:
# Work inside data/latest-docs so the relative save directory ('./') used by
# the download cells below resolves to that folder.
os.chdir(this_dir+"/../data/latest-docs/")

In [26]:
# Define the base URL for the repository and the sub-directory
# NOTE(review): base_url and sub_directory are not referenced by any other
# visible cell -- confirm whether they are still needed.
base_url = 'https://github.com/gradio-app/gradio/'
sub_directory = 'main/guides/'

# GitHub API base URL for fetching repository contents
# (this is the starting point passed to process_directory below)
api_base_url = 'https://api.github.com/repos/gradio-app/gradio/contents/guides'

# Directory to save downloaded files
# (relative, so it depends on the current working directory)
save_directory = './'

# Create the save directory if it doesn't exist
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

def download_file(file_url, save_path):
    """Fetch ``file_url`` and store the response body at ``save_path``.

    Args:
        file_url: URL to request with a plain GET.
        save_path: Destination path; the body is written in binary mode.

    Returns:
        None. On HTTP 200 the file is written and a "Downloaded" line is
        printed; for any other status nothing is written and a failure line
        is printed instead.
    """
    reply = requests.get(file_url)

    # Anything other than 200 is reported and skipped.
    if reply.status_code != 200:
        print(f"Failed to download {file_url}")
        return

    with open(save_path, 'wb') as out_file:
        out_file.write(reply.content)
    print(f"Downloaded: {save_path}")

def get_files_from_github(api_url, headers=None):
    """Request a GitHub contents-API URL and return the decoded JSON payload.

    Args:
        api_url: URL of the API endpoint to query.
        headers: Optional dict of HTTP headers (e.g. an Authorization token).
            ``None`` sends the request without extra headers.

    Returns:
        The parsed JSON body when the response status is 200; otherwise an
        empty list, after printing the failing status code.
    """
    # requests treats headers=None the same as omitting the argument, so no
    # branching on None is needed (the original `if headers = None:` was a
    # SyntaxError: assignment instead of comparison).
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        return response.json()

    print(f"Failed to fetch repository contents: {response.status_code}")
    return []

In [42]:
def process_directory(api_url, local_dir,headers=None):
    """Walk a GitHub contents listing and download its markdown files.

    Args:
        api_url: Contents-API URL of the directory to list.
        local_dir: Local directory that receives the downloaded files.
        headers: Optional HTTP headers forwarded to the listing requests.

    Directories are recreated locally and processed recursively, except those
    whose name contains "cn" or "assets". Only files ending in ``.md`` are
    downloaded.
    """
    for entry in get_files_from_github(api_url, headers):
        kind = entry['type']
        name = entry['name']

        # Skip the Chinese version (fine-tuning is English only) and asset folders.
        excluded = ("cn" in name) or ("assets" in name)

        if kind == 'dir' and not excluded:
            # Mirror the remote directory locally, then descend into it.
            target_dir = os.path.join(local_dir, name)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            process_directory(entry['url'], target_dir, headers)
        elif kind == 'file' and name.endswith('.md'):
            download_file(entry['download_url'], os.path.join(local_dir, name))


In [46]:
# Start the recursive download at the guides listing, saving into the
# current working directory (save_directory is './').
process_directory(api_base_url, save_directory,headers)

Downloaded: ./01_getting-started/01_quickstart.md
Downloaded: ./01_getting-started/02_key-features.md
Downloaded: ./02_building-interfaces/00_the-interface-class.md
Downloaded: ./02_building-interfaces/01_more-on-examples.md
Downloaded: ./02_building-interfaces/02_flagging.md
Downloaded: ./02_building-interfaces/03_interface-state.md
Downloaded: ./02_building-interfaces/04_reactive-interfaces.md
Downloaded: ./02_building-interfaces/05_four-kinds-of-interfaces.md
Downloaded: ./03_additional-features/01_queuing.md
Downloaded: ./03_additional-features/02_streaming-outputs.md
Downloaded: ./03_additional-features/03_alerts.md
Downloaded: ./03_additional-features/04_styling.md
Downloaded: ./03_additional-features/05_progress_bars.md
Downloaded: ./03_additional-features/06_batch-functions.md
Downloaded: ./03_additional-features/07_sharing-your-app.md
Downloaded: ./04_building-with-blocks/01_blocks-and-event-listeners.md
Downloaded: ./04_building-with-blocks/02_controlling-layout.md
Downloaded

# Chat and Existing User Queries
### TODO

In [None]:
import pandas as pd

In [102]:
from IPython.display import display, Markdown

In [30]:
import re

In [20]:
# Fetch the raw README of the gradio-app/awesome-demos repository; later
# cells parse its text into tables.
response = requests.get("https://raw.githubusercontent.com/gradio-app/awesome-demos/main/README.md")

In [None]:
# Raw markdown text of the awesome-demos README fetched above.
readme_content = response.text

# The former `html_content = markdown(readme_content)` step was removed: no
# `markdown` callable is imported in any visible cell (only
# IPython.display.Markdown is), so the line raised NameError on a fresh
# kernel, and `html_content` was not used by any later visible cell.

In [69]:
# Function to convert markdown table to pandas DataFrame
def markdown_table_to_dataframe(table):
    """Convert a pipe-delimited markdown table into a pandas DataFrame.

    Args:
        table: Table text whose first line is the header row and whose second
            line is the separator row; every following line is a data row.

    Returns:
        A DataFrame with one column per header cell and one row per data line
        that contains at least one non-empty cell. All values are strings.
    """
    def _split_cells(line):
        # Remove only the single outer border pipe on each side. Interior
        # empty cells must survive so later values stay in their columns;
        # filtering out blank cells shifted them to the left.
        line = line.strip()
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        return [cell.strip() for cell in line.split('|')]

    rows = table.strip().split('\n')

    columns = _split_cells(rows[0])
    if not any(columns):
        # A blank header line yields no columns at all.
        columns = []

    data = []
    for row in rows[2:]:  # Skip the header and separator rows
        values = _split_cells(row)
        # Lines with no content at all (blank lines, a lone '|') are ignored.
        if any(values):
            data.append(values)
    return pd.DataFrame(data, columns=columns)

In [72]:
# Build one DataFrame per README section, keyed by the section title.
all_links = {}
for section in readme_content.split('##')[1:]:
    # A section without any '|' contains no table; unpacking the split below
    # would raise ValueError, so skip it.
    if '|' not in section:
        continue

    # Everything before the first pipe is the title, the rest is the table.
    title, table = section.split('|',1)
    title=title.strip()
    # split() consumed the table's leading pipe, so put it back.
    table="|"+table

    all_links[title] = markdown_table_to_dataframe(table)

In [79]:
# Stack the per-section tables into one frame. pd.concat on a dict uses the
# keys (section titles) as the outer index level, and reset_index turns the
# index levels into ordinary columns.
all_demo_links = pd.concat(all_links).reset_index()

In [87]:
# Matches a markdown link `[text](target)`; group 1 captures the target
# inside the parentheses (non-greedy, so the first closing ')' ends it).
link_regex=re.compile(r'\[.*?\]\((.*?)\)')

In [105]:
def get_link(link_row):
    """Download the ``app.py`` source behind a demo's markdown link.

    Args:
        link_row: Mapping (e.g. a DataFrame row) whose
            'Demo name (link to demo)' entry holds a markdown link.

    Returns:
        The response text of ``<demo url>/raw/main/app.py``, or the string
        ``'FAILED <url>'`` when the request itself fails.
    """
    demo_url = link_regex.search(link_row['Demo name (link to demo)'])[1]
    raw_link = demo_url + "/raw/main/app.py"

    try:
        # A timeout keeps one unresponsive host from stalling the whole
        # DataFrame.apply run; a timeout is reported like any other failure.
        return requests.get(raw_link, timeout=30).text
    except requests.RequestException:
        # Only request-level failures are treated as best-effort misses. A
        # bare `except:` also swallowed KeyboardInterrupt, which made the
        # long-running apply impossible to interrupt.
        print(raw_link)
        return 'FAILED ' + raw_link
        

In [106]:
# Fetch the app.py text for every demo row (one HTTP request per row).
all_demo_links['python_str'] = all_demo_links.apply(get_link, axis=1)

In [108]:
#display(Markdown(f"```python\n{all_demo_links.iloc[10]['python_str']}\n```"))

## Capture static status

In [115]:
# Sample row kept for trying get_status interactively.
# NOTE(review): no later visible cell reads this variable -- looks like leftover scratch state.
link_row = all_demo_links.iloc[0]

In [125]:
def get_status(link_row):
    """Classify a demo as 'up' or 'drama' based on its status badge.

    Args:
        link_row: Mapping (e.g. a DataFrame row) whose 'status badge' entry
            holds a markdown link/image pointing at the badge.

    Returns:
        'up' when the fetched badge body contains "demo status: up",
        otherwise 'drama'.
    """
    badge_url = link_regex.search(link_row['status badge'])[1]
    badge_reply = requests.get(badge_url)

    # The check runs against the str() form of the raw bytes.
    is_up = "demo status: up" in str(badge_reply.content)
    return 'up' if is_up else 'drama'

In [126]:
# Record each demo's badge status at the time of this run (one HTTP request per row).
all_demo_links['static_status'] = all_demo_links.apply(get_status, axis=1)

In [127]:
# Persist the demo table (links, fetched source text, status) as parquet;
# the path is anchored on this_dir because the working directory was changed earlier.
all_demo_links.to_parquet( this_dir + "/../data/latest-demos/all_demo_python.parquet")

In [130]:
# Spot-check one downloaded demo by rendering its source as a fenced Python block.
display(Markdown(f"```python\n{all_demo_links.iloc[62]['python_str']}\n```"))

```python
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
import gpytorch
import torch
import sys

import gpytorch

# We will use the simplest form of GP model, exact inference
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

def get_model(x, y, hyperparameters):
    likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=gpytorch.constraints.GreaterThan(1.e-9))
    model = ExactGPModel(x, y, likelihood)
    model.likelihood.noise = torch.ones_like(model.likelihood.noise) * hyperparameters["noise"]
    model.covar_module.outputscale = torch.ones_like(model.covar_module.outputscale) * hyperparameters["outputscale"]
    model.covar_module.base_kernel.lengthscale = torch.ones_like(model.covar_module.base_kernel.lengthscale) * \
                                                 hyperparameters["lengthscale"]
    return model, likelihood



excuse = "Please only specify numbers, x values should be in [0,1] and y values in [-1,1]."
excuse_max_examples = "This model is trained to work with up to 4 input points."
hyperparameters = {'noise': 1e-4, 'outputscale': 1., 'lengthscale': .1, 'fast_computations': (False,False,False)}


conf = .5

def mean_and_bounds_for_gp(x,y,test_xs):
    gp_model, likelihood = get_model(x,y,hyperparameters)
    gp_model.eval()
    l = likelihood(gp_model(test_xs))
    means = l.mean.squeeze()
    varis = torch.diagonal(l.covariance_matrix.squeeze())
    stds = varis.sqrt()
    return means, means-stds, means+stds


def mean_and_bounds_for_pnf(x,y,test_xs, choice):
    sys.path.append('prior-fitting/')
    model = torch.load(f'onefeature_gp_ls.1_pnf_{choice}.pt')

    logits = model((torch.cat([x,test_xs],0).unsqueeze(1),y.unsqueeze(1)),single_eval_pos=len(x))
    bounds = model.criterion.quantile(logits,center_prob=.682).squeeze(1)
    return model.criterion.mean(logits).squeeze(1), bounds[:,0], bounds[:,1]

def plot_w_conf_interval(ax_or_plt, x, m, lb, ub, color, label_prefix):
    ax_or_plt.plot(x.squeeze(-1),m, color=color, label=label_prefix+' mean')
    ax_or_plt.fill_between(x.squeeze(-1), lb, ub, alpha=.1, color=color, label=label_prefix+' conf. interval')




@torch.no_grad()
def infer(table, choice):
    vfunc = np.vectorize(lambda s: len(s))
    non_empty_row_mask = (vfunc(table).sum(1) != 0)
    table = table[non_empty_row_mask]

    try:
        table = table.astype(np.float32)
    except ValueError:
        return excuse, None
    x = torch.tensor(table[:,0]).unsqueeze(1)
    y = torch.tensor(table[:,1])
    fig = plt.figure(figsize=(8,4),dpi=1000)

    if len(x) > 4:
        return excuse_max_examples, None
    if (x<0.).any() or (x>1.).any() or (y<-1).any() or (y>1).any():
        return excuse, None

    plt.scatter(x,y, color='black', label='Examples in given dataset')


    
    test_xs = torch.linspace(0,1,100).unsqueeze(1)
    
    plot_w_conf_interval(plt, test_xs, *mean_and_bounds_for_gp(x,y,test_xs), 'green', 'GP')
    plot_w_conf_interval(plt, test_xs, *mean_and_bounds_for_pnf(x,y,test_xs, choice), 'blue', 'PFN')
    
    plt.legend(ncol=2,bbox_to_anchor=[0.5,-.14],loc="upper center")
    plt.xlabel('x')
    plt.ylabel('y')
    plt.tight_layout()

    
    return 'There you go, your plot. 📈', plt.gcf()

iface = gr.Interface(fn=infer,
                     title='GP Posterior Approximation with Transformers',
                     description='''This is a demo of PFNs as we describe them in our recent paper (https://openreview.net/forum?id=KSugKcbNf9).
Lines represent means and shaded areas are the confidence interval (68.2% quantile). In green, we have the ground truth GP posterior and in blue we have our approximation.
We provide three models that are architecturally the same, but with different training budgets.
The GP (approximated) uses an RBF Kernel with a little noise (1e-4), 0 mean and a length scale of 0.1.
                     ''',
                     article="<p style='text-align: center'><a href='https://arxiv.org/abs/2112.10510'>Paper: Transformers Can Do Bayesian Inference</a></p>",
                     inputs=[
                         gr.inputs.Dataframe(headers=["x", "y"], datatype=["number", "number"], type='numpy', default=[['.25','.1'],['.75','.4']], col_count=2, label='The data: you can change this and increase the number of data points using the `enter` key.'),
                         gr.inputs.Radio(['160K','800K','4M'], type="value", default='4M', label='Number of Sampled Datasets in Training (Training Costs), higher values yield better results')
                     ], outputs=["text",gr.outputs.Plot(type="matplotlib")])
iface.launch()




```