<a href="https://colab.research.google.com/github/muhammad-mobeen/FYP-DGAN/blob/main/DGAN_FASTAPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install torch numpy pandas matplotlib scikit-learn
!pip install gretel-synthetics
!pip install fastapi uvicorn[standard] ngrok nest-asyncio python-multipart

# !pip install git+https://github.com/gretelai/gretel-synthetics.git
# !pip install gunicorn
# !pip install uvicorn[standard]
# !pip install httpx
# !pip install pydantic
# !pip install requests
# !pip install pypi-json
# !pip install pyngrok
# !pip install ngrok
# !pip install nest-asyncio

In [None]:
# Model Dependencies
from gretel_synthetics.timeseries_dgan.config import DGANConfig, OutputType
from gretel_synthetics.timeseries_dgan.structures import ProgressInfo
from gretel_synthetics.timeseries_dgan.dgan import DGAN
from sklearn.preprocessing import OrdinalEncoder
from dateutil.parser import parse
import matplotlib.pyplot as plt
import matplotlib.dates as md
import pandas as pd
import numpy as np
import torch
import pickle
import json

In [None]:
# API Dependencies
import uvicorn
from fastapi import FastAPI, File, UploadFile, HTTPException, Depends, BackgroundTasks
from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
from fastapi.openapi.models import HTTPBase
from concurrent.futures import ThreadPoolExecutor
from pydantic import BaseModel
from typing import Optional
from io import BytesIO
import nest_asyncio
import ngrok
from google.colab import userdata
import uuid
import shutil
import os

In [None]:
import pandas as pd
from dateutil.parser import parse

def detect_datetime_column(csv_path):
    """Guess which column of a CSV file holds datetime values.

    Every distinct non-null value of every column is passed to
    ``dateutil.parser.parse`` in strict mode (``fuzzy=False``); the column
    with the highest share of parseable values is reported.

    Note that digit-only values are accepted by the parser, so purely numeric
    columns can score highly as well (the integer index column of the sample
    data scores ~0.999).

    Args:
        csv_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(column_name, success_rate)`` tuple. ``(None, None)`` is returned
        when the file has no usable column or when not a single value in any
        column could be parsed as a date.
    """
    df = pd.read_csv(csv_path)

    # Maps column name -> fraction of distinct values that parsed as a date.
    datetime_success_rate = {}

    for column in df.columns:
        unique_values = df[column].dropna().unique()
        if len(unique_values) == 0:
            # Nothing to judge this column by.
            continue

        success_count = 0
        for value in unique_values:
            try:
                parse(str(value), fuzzy=False)
            except (ValueError, TypeError, OverflowError):
                # dateutil documents OverflowError for values that exceed the
                # platform's integer range; treat it like any other failure.
                continue
            success_count += 1

        datetime_success_rate[column] = success_count / len(unique_values)

    print(datetime_success_rate)

    datetime_column = max(datetime_success_rate, key=datetime_success_rate.get, default=None)

    # A best score of zero means no column looked like a date at all; do not
    # report an arbitrary column as the "most likely" datetime in that case.
    if datetime_column is None or datetime_success_rate[datetime_column] == 0:
        return None, None

    return datetime_column, datetime_success_rate[datetime_column]

# Example usage (expects btc.csv in the working directory).
# The function returns a (column_name, success_rate) tuple, so despite its
# name `datetime_column_name` holds that whole tuple and is printed as one.
datetime_column_name = detect_datetime_column("btc.csv")
print(f"The column most likely representing a datetime is: {datetime_column_name}")


{'Unnamed: 0': 0.9989235737351991, 'Currency': 0.0, 'Date': 1.0, 'Closing Price (USD)': 0.6781485468245425, '24h Open (USD)': 0.6781485468245425, '24h High (USD)': 0.6706135629709364, '24h Low (USD)': 0.696588868940754}
The column most likely representing a datetime is: ('Date', 1.0)


In [None]:
def detect_numeric_columns(csv_path):
    """List the columns of a CSV file whose values are all plain numbers.

    A column qualifies when every value converts with ``pandas.to_numeric``
    (so a single missing or non-numeric cell disqualifies it) and it is not a
    disguised non-number: boolean columns, text such as ``"1e5"`` or
    ``"inf"``, and columns containing infinities are rejected.

    Args:
        csv_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        list[str]: Names of the numeric columns, in file order (possibly
        empty).
    """
    df = pd.read_csv(csv_path)

    numeric_columns = []

    for column in df.columns:
        series = df[column]

        # errors='coerce' turns every non-convertible value into NaN, so any
        # NaN here means "missing or not a number".
        numeric_series = pd.to_numeric(series, errors='coerce')
        if numeric_series.isnull().any():
            continue

        already_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if already_numeric:
            # pandas already parsed these as numbers. Do NOT run the letter
            # check on their string form: small/large floats render as
            # "1e-05" / "1e+20" and would be rejected because of the "e".
            if numeric_series.abs().eq(float("inf")).any():
                continue
        elif series.astype(str).str.contains('[a-zA-Z]').any():
            # Text or boolean column that merely converts to numbers.
            continue

        numeric_columns.append(column)

    return numeric_columns

# Example usage (expects btc.csv in the working directory).
# Rebinds the module-level name `numeric_columns` to the returned list.
numeric_columns = detect_numeric_columns("btc.csv")
print(f"Columns containing numeric and non-alphabetic data are: {numeric_columns}")

Columns containing numeric and non-alphabetic data are: ['Unnamed: 0', 'Closing Price (USD)', '24h Open (USD)', '24h High (USD)', '24h Low (USD)']


In [None]:
def detect_string_columns(csv_path):
    """List the columns of a CSV file that hold predominantly textual values.

    Only columns pandas stores as text (``object`` or a string dtype) are
    considered. Such a column is reported when at least half of its distinct
    non-null values are strings that are not simple unsigned decimals (digits
    with at most one ``.``).

    Args:
        csv_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        list[str]: Names of the string-like columns, in file order (possibly
        empty).
    """
    df = pd.read_csv(csv_path)

    string_columns = []

    for column in df.columns:
        series = df[column]

        is_text_dtype = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text_dtype:
            # Numeric/boolean/datetime dtypes are never string-like.
            continue

        unique_values = series.dropna().unique()
        if len(unique_values) == 0:
            # No data at all (e.g. a header-only file): nothing to classify,
            # and dividing by the count below would raise ZeroDivisionError.
            continue

        non_numeric_count = sum(
            1
            for value in unique_values
            if isinstance(value, str) and not value.replace('.', '', 1).isdigit()
        )

        # Keep the column when the majority of its values are non-numeric.
        if non_numeric_count / len(unique_values) >= 0.5:
            string_columns.append(column)

    return string_columns

# Example usage (expects btc.csv in the working directory).
# Rebinds the module-level name `string_columns` to the returned list.
string_columns = detect_string_columns("btc.csv")
print(f"Columns containing string-like data are: {string_columns}")

Columns containing string-like data are: ['Currency', 'Date']


In [None]:
class AutoSyntheticConfigurator:
    """Derive a starting DGAN configuration from the columns of a CSV file.

    The file is read once in ``__init__``; the ``detect_*`` helpers classify
    its columns and ``get_dgan_config`` assembles the resulting configuration
    dictionary.
    """

    def __init__(self, file_path):
        """Load the CSV at ``file_path`` into ``self.data_df``."""
        self.data_df = pd.read_csv(file_path)

    def get_dgan_config(self):
        """Build the configuration dictionary for this dataset.

        Starts from fixed defaults and fills in:

        * ``time_column`` - best datetime candidate, if any;
        * ``feature_columns`` - numeric columns followed by the encodable
          (string-like) columns;
        * ``encodable_columns`` - string-like columns other than the time
          column.

        Returns:
            dict: The configuration. ``feature_columns`` stays ``None`` when
            neither numeric nor encodable columns were found.
        """
        dgan_main_config = {
            # Training Configs
            "df_style": "long",
            "time_column": None,
            "feature_columns": None,
            "discrete_columns": None,
            "encodable_columns": None,
            "attribute_columns": None,
            "example_id_column": None,
            # Model Configs
            "max_sequence_len": 'default',
            "sample_len" : 'default',
            "batch_size": 'default',
            "apply_feature_scaling" : True,
            "apply_example_scaling": False,
            "use_attribute_discriminator": False,
            "generator_learning_rate": 1e-4,
            "discriminator_learning_rate": 1e-4,
            "epochs": 500,
            "cuda": True
        }

        # Datetime column: detect_datetime_column returns (name, rate) and
        # the name is None when nothing looked like a date.
        datetime_candidate = self.detect_datetime_column()
        if datetime_candidate:
            dgan_main_config["time_column"] = datetime_candidate[0]

        # Numeric columns become features as-is.
        features_candidates = self.detect_numeric_columns()
        if features_candidates:
            dgan_main_config["feature_columns"] = features_candidates

        # String-like columns are features too, but need ordinal encoding.
        encodable_candidates = self.detect_string_columns()
        if encodable_candidates:
            if dgan_main_config["time_column"] in encodable_candidates:
                encodable_candidates.remove(dgan_main_config["time_column"])
            dgan_main_config["encodable_columns"] = encodable_candidates
            # feature_columns is still None when the file has no numeric
            # column; treat that as an empty list instead of failing on
            # `None += list`.
            combined = (dgan_main_config["feature_columns"] or []) + encodable_candidates
            dgan_main_config["feature_columns"] = combined or None

        return dgan_main_config

    def detect_datetime_column(self):
        """Find the column whose values most often parse as dates.

        Every distinct non-null value of each column is passed to
        ``dateutil.parser.parse`` in strict mode (``fuzzy=False``).

        Returns:
            tuple: ``(column_name, success_rate)`` for the best column, or
            ``(None, None)`` when there is no column with data or no value in
            any column parsed as a date.
        """
        df = self.data_df
        # Maps column name -> fraction of distinct values that parsed.
        datetime_success_rate = {}
        for column in df.columns:
            unique_values = df[column].dropna().unique()
            if len(unique_values) == 0:
                continue
            success_count = 0
            for value in unique_values:
                try:
                    parse(str(value), fuzzy=False)
                except (ValueError, TypeError, OverflowError):
                    # dateutil documents OverflowError for values beyond the
                    # platform's integer range; count it as a failed parse.
                    continue
                success_count += 1
            datetime_success_rate[column] = success_count / len(unique_values)

        datetime_column = max(datetime_success_rate, key=datetime_success_rate.get, default=None)
        # A best score of zero means nothing looked like a date; do not pick
        # an arbitrary column as the time column in that case.
        if datetime_column is None or datetime_success_rate[datetime_column] == 0:
            return None, None
        return datetime_column, datetime_success_rate[datetime_column]

    def detect_numeric_columns(self):
        """List the columns whose values are all plain numbers.

        A column qualifies when every value converts with
        ``pandas.to_numeric`` and it is not a disguised non-number (boolean
        columns, text such as ``"1e5"``/``"inf"``, or infinities).

        Returns:
            list[str] | None: Column names in frame order, or ``None`` when
            no column qualifies.
        """
        df = self.data_df
        numeric_columns = []

        for column in df.columns:
            series = df[column]
            # errors='coerce' turns non-convertible values into NaN, so any
            # NaN means "missing or not a number".
            numeric_series = pd.to_numeric(series, errors='coerce')
            if numeric_series.isnull().any():
                continue

            already_numeric = (
                pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)
            )
            if already_numeric:
                # Skip the letter check for real numbers: floats such as
                # 1e-05 contain an "e" in their string form and would be
                # rejected by it.
                if numeric_series.abs().eq(float("inf")).any():
                    continue
            elif series.astype(str).str.contains('[a-zA-Z]').any():
                continue

            numeric_columns.append(column)

        return numeric_columns or None

    def detect_string_columns(self):
        """List the columns that hold predominantly textual values.

        Only text-typed columns (``object`` or a pandas string dtype) are
        considered; one is reported when at least half of its distinct
        non-null values are strings that are not simple unsigned decimals.

        Returns:
            list[str] | None: Column names in frame order, or ``None`` when
            no column qualifies.
        """
        df = self.data_df
        string_columns = []

        for column in df.columns:
            series = df[column]
            is_text_dtype = (
                pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series)
            )
            if not is_text_dtype:
                continue

            unique_values = series.dropna().unique()
            if len(unique_values) == 0:
                # Empty column: nothing to classify, and the ratio below
                # would divide by zero.
                continue

            non_numeric_count = sum(
                1
                for value in unique_values
                if isinstance(value, str) and not value.replace('.', '', 1).isdigit()
            )
            if non_numeric_count / len(unique_values) >= 0.5:
                string_columns.append(column)

        return string_columns or None

In [None]:
class DGANER:
    """Wrapper that trains, saves, reloads and samples a DGAN model for a CSV.

    Two construction modes exist:

    * training mode - ``main_config`` is a configuration dictionary (the keys
      read in ``__init__`` must all be present);
    * load mode - ``main_config`` is the string ``"load_mode"`` and the
      configuration, encoding mappings and model weights are read from
      ``project_directory_path`` (files written by ``save``).
    """

    def __init__(self, file_path, main_config="load_mode", project_directory_path=None) -> None:
        """Read the CSV and build (or reload) the underlying DGAN model.

        Args:
            file_path: CSV file holding the training data.
            main_config: Configuration dictionary, or ``"load_mode"`` to
                restore a previously saved project.
            project_directory_path: Directory containing ``dgan_config.json``,
                ``encoding_mappings.pkl`` and ``model.pt``; required in load
                mode.

        Raises:
            ValueError: In load mode when ``project_directory_path`` is
                missing.
        """
        if main_config != "load_mode":
            self.main_config = main_config
            self.encodable_encoding_mappings = {}
        else:
            if project_directory_path is None:
                raise ValueError(
                    "project_directory_path is required when main_config is 'load_mode'"
                )
            dgan_config_path = os.path.join(project_directory_path, "dgan_config.json")
            model_encoding_mappings_path = os.path.join(project_directory_path, "encoding_mappings.pkl")
            with open(dgan_config_path, "r") as json_file:
                self.main_config = json.load(json_file)
            # SECURITY: pickle.load executes arbitrary code from the file.
            # Only load mappings this application wrote itself via save().
            with open(model_encoding_mappings_path, "rb") as pickle_file:
                self.encodable_encoding_mappings = pickle.load(pickle_file)

        self.df_style = self.main_config["df_style"]
        self.example_id_column = self.main_config["example_id_column"]
        self.feature_columns = self.main_config["feature_columns"]
        self.attribute_columns = self.main_config["attribute_columns"]
        self.discrete_columns = self.main_config["discrete_columns"]
        self.encodable_columns = self.main_config["encodable_columns"]
        self.time_column = self.main_config["time_column"]

        self.data_df = pd.read_csv(file_path)

        # 'default' placeholders are resolved from the data: half the row
        # count for the sequence length, 1 for sample_len.
        # NOTE(review): the default batch size is capped by shape[1] (the
        # number of columns), not by a row/example count - confirm intended.
        self.model = DGAN(DGANConfig(
            max_sequence_len = self.data_df.shape[0]//2 if self.main_config["max_sequence_len"] == 'default' else self.main_config["max_sequence_len"],
            sample_len = 1 if self.main_config["sample_len"] == 'default' else self.main_config["sample_len"],
            batch_size = min(100, self.data_df.shape[1]) if self.main_config["batch_size"] == 'default' else self.main_config["batch_size"],
            apply_feature_scaling = self.main_config["apply_feature_scaling"],
            apply_example_scaling = self.main_config["apply_example_scaling"],
            use_attribute_discriminator = self.main_config["use_attribute_discriminator"],
            generator_learning_rate = self.main_config["generator_learning_rate"],
            discriminator_learning_rate = self.main_config["discriminator_learning_rate"],
            epochs = self.main_config["epochs"],
            cuda = self.main_config["cuda"]
            ))

        if main_config == "load_mode":
            model_path = os.path.join(project_directory_path, "model.pt")
            self.model = self.model.load(model_path)

    def train(self):
        """Ordinal-encode the encodable columns, then train the model.

        The encoding is applied to ``self.data_df`` in place (``show_df``
        reflects it afterwards) and the category list of each encoded column
        is stored in ``self.encodable_encoding_mappings`` so generated values
        can be decoded later. Because the frame is overwritten, calling
        ``train`` a second time on the same instance would encode the already
        encoded values.
        """
        encoder = OrdinalEncoder()
        self.encodable_encoding_mappings = {}
        # The configuration may carry None when there is nothing to encode.
        for column in self.encodable_columns or []:
            self.data_df[column] = encoder.fit_transform(self.data_df[[column]])
            # categories_[0] lists the original values; position == code.
            self.encodable_encoding_mappings[column] = encoder.categories_[0]

        self.model.train_dataframe(
            self.data_df,
            df_style = self.df_style,
            example_id_column = self.example_id_column,
            feature_columns = self.feature_columns,
            attribute_columns = self.attribute_columns,
            discrete_columns = self.discrete_columns,
            time_column = self.time_column,
            progress_callback = self.progress_callbacker
            )

    def generate_synthetic_data_df(self, num_examples):
        """Generate synthetic examples and decode the encoded columns.

        Args:
            num_examples: Forwarded to the model's ``generate_dataframe``.

        Returns:
            pandas.DataFrame: The generated frame. When encodable columns are
            configured, each column in the stored mappings has its numeric
            codes replaced by the original category values.
        """
        synthetic_df = self.model.generate_dataframe(num_examples)
        if not self.encodable_columns:
            return synthetic_df

        for column, mapping in self.encodable_encoding_mappings.items():
            # Generated codes are not guaranteed to be exact category
            # indices: round to the nearest code and clamp into the valid
            # range so every value decodes to a real category instead of
            # being truncated or turned into NaN.
            codes = synthetic_df[column].round().clip(0, len(mapping) - 1).astype(int)
            synthetic_df[column] = codes.map(dict(enumerate(mapping)))

        return synthetic_df

    def generate_synthetic_data_csv(self, filename, num_examples, index=False, encoding='utf-8'):
        """Generate ``num_examples`` synthetic examples and write them to ``filename`` as CSV."""
        self.generate_synthetic_data_df(num_examples).to_csv(filename, index = index, encoding=encoding)

    def progress_callbacker(self, progress_callback: "ProgressInfo"):
        """Print and return a one-line training progress summary.

        Passed to ``train_dataframe`` as ``progress_callback``.
        """
        progress = f"Epoch {progress_callback.epoch}/{progress_callback.total_epochs}, Batch {progress_callback.batch}/{progress_callback.total_batches}: {int(progress_callback.frac_completed * 100)}%"
        print(progress)
        return progress

    def show_df(self):
        """Return the working DataFrame (ordinal-encoded once ``train`` has run)."""
        return self.data_df

    def save(self, directory_path):
        """Persist model weights, configuration and encoding mappings.

        Writes ``model.pt``, ``dgan_config.json`` and
        ``encoding_mappings.pkl`` into ``directory_path``, creating the
        directory when it does not exist yet.
        """
        os.makedirs(directory_path, exist_ok=True)
        model_path = os.path.join(directory_path, "model.pt")
        dgan_config_path = os.path.join(directory_path, "dgan_config.json")
        model_encoding_mappings_path = os.path.join(directory_path, "encoding_mappings.pkl")
        self.model.save(model_path)
        with open(dgan_config_path, 'w') as fp:
            json.dump(self.main_config, fp)
        with open(model_encoding_mappings_path, 'wb') as handle:
            pickle.dump(self.encodable_encoding_mappings, handle, protocol=pickle.HIGHEST_PROTOCOL)



# Testing Rig

In [None]:
# Load the sample dataset for inspection (expects btc.csv in the working directory).
data_df = pd.read_csv("btc.csv")

In [None]:
# Show the dtype pandas inferred for each column of the raw CSV.
data_df.dtypes

Unnamed: 0               int64
Currency                object
Date                    object
Closing Price (USD)    float64
24h Open (USD)         float64
24h High (USD)         float64
24h Low (USD)          float64
dtype: object

In [None]:
# Auto-detect a DGAN configuration for the sample file and display it.
con = AutoSyntheticConfigurator("btc.csv")
dgan_config = con.get_dgan_config()
dgan_config

{'df_style': 'long',
 'time_column': 'Date',
 'feature_columns': ['Unnamed: 0',
  'Closing Price (USD)',
  '24h Open (USD)',
  '24h High (USD)',
  '24h Low (USD)',
  'Currency'],
 'discrete_columns': None,
 'encodable_columns': ['Currency'],
 'attribute_columns': None,
 'example_id_column': None,
 'max_sequence_len': 'default',
 'sample_len': 'default',
 'batch_size': 'default',
 'apply_feature_scaling': True,
 'apply_example_scaling': False,
 'use_attribute_discriminator': False,
 'generator_learning_rate': 0.0001,
 'discriminator_learning_rate': 0.0001,
 'epochs': 2000,
 'cuda': True}

In [None]:
# Manual override of the auto-detected config: mark 'Currency' as an attribute column.
# NOTE(review): 'Currency' is still listed in feature_columns and
# encodable_columns in the config displayed above - confirm DGAN accepts a
# column in both roles.
# dgan_config["example_id_column"] = "Currency"
dgan_config["attribute_columns"] = ["Currency"]
dgan_config

In [None]:
# Build the wrapper in training mode: the second positional argument is the
# configuration dictionary (DGANER's `main_config` parameter).
# agent = DGANER('btc.csv', time_column='Date', feature_columns=['Closing Price (USD)', '24h Open (USD)', '24h High (USD)', '24h Low (USD)'])
agent = DGANER('btc.csv', dgan_config)

In [None]:
# Encode the encodable columns and train the model; one progress line is
# printed per callback invocation.
agent.train()



Epoch 0/2000, Batch 0/1: 0%
Epoch 1/2000, Batch 0/1: 0%
Epoch 2/2000, Batch 0/1: 0%
Epoch 3/2000, Batch 0/1: 0%
Epoch 4/2000, Batch 0/1: 0%
Epoch 5/2000, Batch 0/1: 0%
Epoch 6/2000, Batch 0/1: 0%
Epoch 7/2000, Batch 0/1: 0%
Epoch 8/2000, Batch 0/1: 0%
Epoch 9/2000, Batch 0/1: 0%
Epoch 10/2000, Batch 0/1: 0%
Epoch 11/2000, Batch 0/1: 0%
Epoch 12/2000, Batch 0/1: 0%
Epoch 13/2000, Batch 0/1: 0%
Epoch 14/2000, Batch 0/1: 0%
Epoch 15/2000, Batch 0/1: 0%
Epoch 16/2000, Batch 0/1: 0%
Epoch 17/2000, Batch 0/1: 0%
Epoch 18/2000, Batch 0/1: 1%
Epoch 19/2000, Batch 0/1: 1%
Epoch 20/2000, Batch 0/1: 1%
Epoch 21/2000, Batch 0/1: 1%
Epoch 22/2000, Batch 0/1: 1%
Epoch 23/2000, Batch 0/1: 1%
Epoch 24/2000, Batch 0/1: 1%
Epoch 25/2000, Batch 0/1: 1%
Epoch 26/2000, Batch 0/1: 1%
Epoch 27/2000, Batch 0/1: 1%
Epoch 28/2000, Batch 0/1: 1%
Epoch 29/2000, Batch 0/1: 1%
Epoch 30/2000, Batch 0/1: 1%
Epoch 31/2000, Batch 0/1: 1%
Epoch 32/2000, Batch 0/1: 1%
Epoch 33/2000, Batch 0/1: 1%
Epoch 34/2000, Batch 0/1

In [None]:
# Inspect the wrapper's working frame; after train() the encodable columns
# hold their ordinal codes instead of the original strings.
agent.show_df()

Unnamed: 0.1,Unnamed: 0,Currency,Date,Closing Price (USD),24h Open (USD),24h High (USD),24h Low (USD)
0,0,0.0,2014-03-14,124.654990,125.304660,125.751660,123.563490
1,1,0.0,2014-03-15,126.455000,124.654990,126.758500,124.633830
2,2,0.0,2014-03-16,109.584830,126.455000,126.665660,84.328330
3,3,0.0,2014-03-17,119.674660,109.584830,119.675000,108.058160
4,4,0.0,2014-03-18,122.338660,119.674660,122.936330,119.005660
...,...,...,...,...,...,...,...
2782,2782,0.0,2021-10-25,49765.132082,49597.778891,51449.798576,46295.720180
2783,2783,0.0,2021-10-26,50033.693137,49718.354353,51579.312545,48945.346536
2784,2784,0.0,2021-10-27,47886.625255,49927.035067,50691.802950,47006.102292
2785,2785,0.0,2021-10-28,45605.615754,46806.537852,49671.414174,43869.638969


In [None]:
# Generate a single synthetic example with encoded columns decoded back.
synthetic_features = agent.generate_synthetic_data_df(1)

In [None]:
# Generate two synthetic examples and write them to synthetic.csv.
agent.generate_synthetic_data_csv("synthetic.csv",2)

In [None]:
# Display the example generated above.
synthetic_features

Unnamed: 0.1,Unnamed: 0,Currency,Date,Closing Price (USD),24h Open (USD),24h High (USD),24h Low (USD),example_id
0,1333.028198,BTC,2014-03-14,30583.269531,28919.636719,31984.599609,30959.642578,0
1,1353.775879,BTC,2014-03-15,29249.781250,27672.162109,29421.876953,29326.025391,0
2,1338.593018,BTC,2014-03-16,26737.576172,25198.707031,26922.900391,26402.537109,0
3,1255.883301,BTC,2014-03-17,23901.357422,22615.970703,24172.000000,23572.962891,0
4,1226.581299,BTC,2014-03-18,19311.685547,17913.941406,19164.884766,19108.710938,0
...,...,...,...,...,...,...,...,...
1388,1397.899780,BTC,2017-12-31,6393.935547,6494.145508,6573.514160,6147.239746,0
1389,1406.934937,BTC,2018-01-01,6458.743164,6423.962891,6581.070801,6251.406250,0
1390,1413.824829,BTC,2018-01-02,6590.161133,6499.413574,6701.803223,6353.321777,0
1391,1409.743652,BTC,2018-01-03,6510.794434,6443.762695,6680.108398,6137.025391,0


# Ordinal encoder example

In [None]:

def encode_string_columns_with_mapping(csv_path):
    """Ordinal-encode every object-dtype column of a CSV file.

    Args:
        csv_path: Location of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        tuple: ``(frame, mappings)`` where ``frame`` is the loaded DataFrame
        with its object columns replaced by ordinal codes and ``mappings``
        maps each encoded column name to the encoder's category array for
        that column (position in the array == code).
    """
    frame = pd.read_csv(csv_path)
    ordinal = OrdinalEncoder()
    mappings = {}

    # The column list is taken once, before any column is overwritten.
    for name in frame.select_dtypes(include=['object']).columns:
        frame[name] = ordinal.fit_transform(frame[[name]])
        # The encoder is refit per column, so categories_[0] always belongs
        # to the column just processed.
        mappings[name] = ordinal.categories_[0]

    return frame, mappings

# Example usage (expects btc.csv in the working directory).
# `df` is the encoded frame, `enc` the per-column category arrays; both are
# consumed by the revert_encoding call further down.
df, enc = encode_string_columns_with_mapping("btc.csv")
print(enc)
df

{'Currency': array(['BTC'], dtype=object), 'Date': array(['2014-03-14', '2014-03-15', '2014-03-16', ..., '2021-10-27',
       '2021-10-28', '2021-10-29'], dtype=object)}


Unnamed: 0.1,Unnamed: 0,Currency,Date,Closing Price (USD),24h Open (USD),24h High (USD),24h Low (USD)
0,0,0.0,0.0,124.654990,125.304660,125.751660,123.563490
1,1,0.0,1.0,126.455000,124.654990,126.758500,124.633830
2,2,0.0,2.0,109.584830,126.455000,126.665660,84.328330
3,3,0.0,3.0,119.674660,109.584830,119.675000,108.058160
4,4,0.0,4.0,122.338660,119.674660,122.936330,119.005660
...,...,...,...,...,...,...,...
2782,2782,0.0,2782.0,49765.132082,49597.778891,51449.798576,46295.720180
2783,2783,0.0,2783.0,50033.693137,49718.354353,51579.312545,48945.346536
2784,2784,0.0,2784.0,47886.625255,49927.035067,50691.802950,47006.102292
2785,2785,0.0,2785.0,45605.615754,46806.537852,49671.414174,43869.638969


In [None]:
def revert_encoding(encoded_df, encoding_mappings):
    """Undo ordinal encoding on a copy of ``encoded_df``.

    Args:
        encoded_df: DataFrame whose encoded columns hold ordinal codes.
        encoding_mappings: Maps column name to the sequence of original
            values, where a value's position is its code.

    Returns:
        pandas.DataFrame: A copy of ``encoded_df`` in which every column named
        in ``encoding_mappings`` has its codes replaced by the original
        values. The input frame is left untouched.
    """
    restored = encoded_df.copy()

    for name in encoding_mappings:
        # position -> original value
        code_to_value = dict(enumerate(encoding_mappings[name]))
        restored[name] = restored[name].map(code_to_value)

    return restored

# Round-trip check: decode the frame encoded in the previous example.
revert_encoding(df, enc)

Unnamed: 0.1,Unnamed: 0,Currency,Date,Closing Price (USD),24h Open (USD),24h High (USD),24h Low (USD)
0,0,BTC,2014-03-14,124.654990,125.304660,125.751660,123.563490
1,1,BTC,2014-03-15,126.455000,124.654990,126.758500,124.633830
2,2,BTC,2014-03-16,109.584830,126.455000,126.665660,84.328330
3,3,BTC,2014-03-17,119.674660,109.584830,119.675000,108.058160
4,4,BTC,2014-03-18,122.338660,119.674660,122.936330,119.005660
...,...,...,...,...,...,...,...
2782,2782,BTC,2021-10-25,49765.132082,49597.778891,51449.798576,46295.720180
2783,2783,BTC,2021-10-26,50033.693137,49718.354353,51579.312545,48945.346536
2784,2784,BTC,2021-10-27,47886.625255,49927.035067,50691.802950,47006.102292
2785,2785,BTC,2021-10-28,45605.615754,46806.537852,49671.414174,43869.638969


# Old FastAPI

In [None]:
# Application object for the legacy ("Old FastAPI") endpoints defined in this cell.
app = FastAPI()

# In-memory registry: model id -> DGANER instance. Filled by train_model and
# read by get_dgan_model.
dgan_models = {}

# HTML Template as a string.
# The "{{ status }}" and "{{ synthetic_data }}" markers are substituted with
# plain str.replace calls in get_ui.
html_template = """
<!DOCTYPE html>
<html>
<head>
    <title>DGAN FastAPI App</title>
</head>
<body>
    <h1>DGAN FastAPI App</h1>
    <h2>Status: {{ status }}</h2>
    <p>Synthetic Data:</p>
    {{ synthetic_data }}
</body>
</html>
"""

class TrainModelRequest(BaseModel):
    # Request body describing how a model should be trained. Field names
    # mirror the keys of the DGAN configuration dictionary consumed by DGANER.

    # --- column roles -----------------------------------------------------
    df_style: str = "long"
    # Column selections are optional; None means "not specified". (They were
    # previously annotated as bare `None`, which made them required fields
    # that could only ever be null.)
    example_id_column: str | None = None
    feature_columns: list[str] | None = None
    attribute_columns: list[str] | None = None
    discrete_columns: list[str] | None = None
    # Columns to ordinal-encode before training; DGANER reads this key.
    encodable_columns: list[str] | None = None
    time_column: str | None = None

    # --- model settings ---------------------------------------------------
    # 'default' lets DGANER derive the value from the data.
    max_sequence_len: str | int = 'default'
    sample_len: str | int = 'default'
    batch_size: str | int = 'default'
    apply_feature_scaling: bool = True
    apply_example_scaling: bool = False
    use_attribute_discriminator: bool = False
    # Learning rates are fractional, so they must be floats (an `int` field
    # cannot hold a value such as 1e-4).
    generator_learning_rate: float = 1e-4
    discriminator_learning_rate: float = 1e-4
    epochs: int = 5000
    cuda: bool = True


# Response body of GET /status/{model_id}: a single state string
# ("Training" or "Idle" as produced by the status endpoint).
class StatusResponse(BaseModel):
    status: str

# Response body of GET /download/{model_id}: `content` carries the generated
# CSV text encoded to bytes by the download endpoint.
class DownloadResponse(BaseModel):
    content: bytes

def get_dgan_model(model_id: str) -> DGANER:
    """Look up a registered model wrapper by id.

    Raises:
        HTTPException: 404 ("Model not found") when ``model_id`` is not a key
            of the ``dgan_models`` registry.
    """
    try:
        return dgan_models[model_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Model not found") from None

def train_model(model_id: str, filename: str, request: TrainModelRequest):
    """Create a DGANER for ``filename``, register it and train it.

    DGANER's constructor takes a single configuration dictionary
    (``main_config``) rather than one keyword per setting, so the request
    fields are packed into that dictionary here.

    Args:
        model_id: Key under which the wrapper is stored in ``dgan_models``.
        filename: Path of the CSV file to train on.
        request: Training settings supplied by the client.
    """
    main_config = {
        "df_style": request.df_style,
        "example_id_column": request.example_id_column,
        "feature_columns": request.feature_columns,
        "attribute_columns": request.attribute_columns,
        "discrete_columns": request.discrete_columns,
        # DGANER requires this key and iterates over it during training, so
        # fall back to an empty list when the request does not provide it.
        "encodable_columns": getattr(request, "encodable_columns", None) or [],
        "time_column": request.time_column,
        "max_sequence_len": request.max_sequence_len,
        "sample_len": request.sample_len,
        "batch_size": request.batch_size,
        "apply_feature_scaling": request.apply_feature_scaling,
        "apply_example_scaling": request.apply_example_scaling,
        "use_attribute_discriminator": request.use_attribute_discriminator,
        "generator_learning_rate": request.generator_learning_rate,
        "discriminator_learning_rate": request.discriminator_learning_rate,
        "epochs": request.epochs,
        "cuda": request.cuda,
    }
    dgan_model = DGANER(file_path=filename, main_config=main_config)
    # Register before training so the model can be looked up while it trains.
    dgan_models[model_id] = dgan_model
    dgan_model.train()

def generate_synthetic_data(model_id: str, num_examples: int) -> pd.DataFrame:
    """Generate ``num_examples`` synthetic examples from a registered model.

    The lookup goes through ``get_dgan_model``, so an unknown ``model_id``
    results in its 404 ``HTTPException``.
    """
    return get_dgan_model(model_id).generate_synthetic_data_df(num_examples)

# @app.on_event("startup")
# async def startup_event():
#     app.executor = ThreadPoolExecutor()

# @app.on_event("shutdown")
# async def shutdown_event():
#     app.executor.shutdown()

# @app.post("/uploadfile/")
# async def create_upload_file(file: UploadFile = File(...)):
#     # Process the uploaded CSV file
#     content = await file.read()
#     # You can perform further processing, parsing, or save the file as needed
#     # For simplicity, let's just return the content in this example
#     return {"filename": file.filename, "content": content.decode("utf-8")}

@app.post("/train")
async def train(
        request: TrainModelRequest,
        background_tasks: BackgroundTasks,
        file: UploadFile = File(...)
):
    """Store the uploaded CSV and start training in a background task.

    The upload is saved as ``<uuid>.csv`` in the working directory and the
    same uuid is used as the model id for the other endpoints.

    NOTE(review): this signature combines a JSON body model with a multipart
    file upload; FastAPI's "Request Forms and Files" documentation states the
    two cannot be received in one request - confirm how clients are expected
    to call this endpoint.

    Returns:
        dict: ``message`` (human-readable) and ``model_id`` (the generated id).
    """
    train_id = str(uuid.uuid4())
    filename = train_id + ".csv"

    # Stream the upload to disk in chunks instead of reading it into memory
    # in one piece.
    with open(filename, "wb") as f:
        shutil.copyfileobj(file.file, f)

    background_tasks.add_task(train_model, train_id, filename, request)

    # Return the id as its own field so clients do not have to parse it out
    # of the message text.
    return {"message": f"Training started for id {train_id}", "model_id": train_id}

@app.get("/status/{model_id}", response_model=StatusResponse)
async def status(model_id: str):
    """Report whether the model registered under ``model_id`` is training.

    The state is derived from the truthiness of the wrapped model's
    ``training`` attribute; unknown ids produce the 404 raised by
    ``get_dgan_model``.
    """
    wrapper = get_dgan_model(model_id)
    if wrapper.model.training:
        state = "Training"
    else:
        state = "Idle"
    return {"status": state}

@app.get("/download/{model_id}", response_model=DownloadResponse)
async def download(
        model_id: str,
        num_examples: int,
):
    """Generate synthetic data and return it as CSV bytes.

    Args:
        model_id: Id of a registered model; unknown ids yield the 404 raised
            inside ``generate_synthetic_data``.
        num_examples: Number of synthetic examples to generate.

    Returns:
        DownloadResponse: ``content`` holds the CSV (no index column),
        encoded to bytes.
    """
    # generate_synthetic_data performs the registry lookup (and the 404) on
    # its own, so no separate lookup is needed here.
    synthetic_data = generate_synthetic_data(model_id, num_examples)
    content = synthetic_data.to_csv(index=False).encode()
    return DownloadResponse(content=content)

@app.get("/ui/{model_id}", response_class=HTMLResponse)
async def get_ui(model_id: str, num_examples: Optional[int] = 100):
    """Render a small HTML page with the model status and a data sample.

    Args:
        model_id: Id of a registered model (404 when unknown).
        num_examples: Number of synthetic examples to generate for the page.

    Returns:
        HTMLResponse: ``html_template`` with its two placeholders filled in.
    """
    # Local imports: neither name is imported at the top of the notebook.
    import html
    from fastapi.concurrency import run_in_threadpool

    dgan_model = get_dgan_model(model_id)

    # Same state strings as the /status endpoint. The previous implementation
    # called `app.run_in_threadpool(dgan_model.progress_callbacker)`, but the
    # FastAPI app object has no such method and progress_callbacker needs a
    # ProgressInfo argument, so the request could never succeed.
    status = "Training" if getattr(dgan_model.model, "training", False) else "Idle"

    # Generation is blocking work; keep it off the event loop.
    synthetic_data = await run_in_threadpool(generate_synthetic_data, model_id, num_examples)

    # Escape the status text and render the frame as an HTML table (to_html
    # escapes cell contents by default) rather than injecting raw text.
    rendered_html = (
        html_template
        .replace("{{ status }}", html.escape(status))
        .replace("{{ synthetic_data }}", synthetic_data.to_html())
    )

    return HTMLResponse(content=rendered_html, status_code=200)


# New FastAPI

In [None]:
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import uuid
import os

# Fresh application object for the endpoints below. Rebinding `app` means
# routes registered on the earlier FastAPI instance are not part of this one.
app = FastAPI()

@app.post("/upload_file/")
async def upload_file(file: UploadFile = File(...)):
    """Store an uploaded file in a new project folder and return its key.

    A fresh uuid names the folder ``client/<uuid>/``; the uuid is returned as
    ``key`` and identifies the project in the other endpoints.

    Returns:
        JSONResponse: ``{"key": <uuid>}`` with status 200.
    """
    folder_id = str(uuid.uuid4())
    folder_path = os.path.join("client", folder_id)
    os.makedirs(folder_path, exist_ok=True)

    # The filename is supplied by the client: keep only its final path
    # component so it cannot point outside the project folder.
    raw_name = (file.filename or "").replace("\\", "/")
    safe_name = os.path.basename(raw_name)
    if safe_name in ("", ".", ".."):
        safe_name = "upload.csv"

    # The other endpoints look for a lowercase ".csv" suffix; normalise the
    # extension only (".CSV", ".Csv", ...) and leave the rest of the name as is.
    stem, extension = os.path.splitext(safe_name)
    if extension.lower() == ".csv":
        safe_name = stem + ".csv"

    file_path = os.path.join(folder_path, safe_name)

    with open(file_path, "wb") as file_object:
        file_object.write(await file.read())

    return JSONResponse(status_code=200, content={"key": folder_id})

@app.get("/config/{key}")
def get_config(key: str):
    """Return the auto-detected DGAN configuration for an uploaded project.

    Args:
        key: Project key as returned by the upload endpoint (a uuid string).

    Raises:
        HTTPException: 404 when the key is not a valid project key, the
            project folder does not exist, or it contains no ``.csv`` file.
    """
    # Keys are generated with uuid4; anything else (e.g. "..") must not be
    # turned into a filesystem path.
    try:
        is_canonical = str(uuid.UUID(key)) == key
    except ValueError:
        is_canonical = False
    if not is_canonical:
        raise HTTPException(status_code=404, detail="Key not found")

    folder_path = os.path.join("client", key)
    if not os.path.isdir(folder_path):
        raise HTTPException(status_code=404, detail="Key not found")

    # First CSV in the project folder (sorted for a deterministic choice).
    csv_file = next(
        (name for name in sorted(os.listdir(folder_path)) if name.endswith('.csv')),
        None,
    )
    if csv_file is None:
        raise HTTPException(status_code=404, detail="CSV file not found under the provided key")

    file_path = os.path.join(folder_path, csv_file)

    configurator = AutoSyntheticConfigurator(file_path)
    dgan_config = configurator.get_dgan_config()

    return JSONResponse(status_code=200, content=dgan_config)

@app.post("/train_model/")
def train_model(background_tasks: BackgroundTasks, key: str, config: dict):
    """Start training for an uploaded project in a background task.

    Args:
        background_tasks: FastAPI background task queue.
        key: Project key as returned by the upload endpoint (a uuid string).
        config: DGAN configuration dictionary (request body).

    Raises:
        HTTPException: 404 for an invalid/unknown key or a project without a
            ``.csv`` file; 422 when ``config`` lacks keys DGANER reads.

    Returns:
        dict: ``{"message": "Training started", "key": key}``.
    """
    # Keys are generated with uuid4; refuse anything else before building a
    # filesystem path from it.
    try:
        is_canonical = str(uuid.UUID(key)) == key
    except ValueError:
        is_canonical = False
    if not is_canonical:
        raise HTTPException(status_code=404, detail="Project key not found")

    folder_path = os.path.join("client", key)
    if not os.path.isdir(folder_path):
        raise HTTPException(status_code=404, detail="Project key not found")

    # Every key DGANER.__init__ reads from the configuration. Checking here
    # reports the problem to the client; otherwise the KeyError would only
    # surface inside the background task, after the response was sent.
    required_keys = (
        "df_style", "example_id_column", "feature_columns", "attribute_columns",
        "discrete_columns", "encodable_columns", "time_column",
        "max_sequence_len", "sample_len", "batch_size",
        "apply_feature_scaling", "apply_example_scaling",
        "use_attribute_discriminator", "generator_learning_rate",
        "discriminator_learning_rate", "epochs", "cuda",
    )
    missing_keys = [name for name in required_keys if name not in config]
    if missing_keys:
        raise HTTPException(
            status_code=422,
            detail=f"Missing config keys: {', '.join(missing_keys)}",
        )

    # First CSV in the project folder (sorted for a deterministic choice).
    csv_file = next(
        (name for name in sorted(os.listdir(folder_path)) if name.endswith('.csv')),
        None,
    )
    if csv_file is None:
        raise HTTPException(status_code=404, detail="CSV file not found under the provided key")

    file_path = os.path.join(folder_path, csv_file)

    def train_and_save_model(file_path, config, folder_path):
        # Runs after the response is sent: train, then persist the model,
        # configuration and encoding mappings into the project folder.
        dganer = DGANER(file_path, config)
        dganer.train()
        dganer.save(folder_path)

    background_tasks.add_task(train_and_save_model, file_path, config, folder_path)

    return {"message": "Training started", "key": key}

@app.post("/generate_synthetic_data/{key}")
def generate_synthetic_data(key: str, num_examples: int=1):
    """Generate synthetic data for a trained project and return it as a file.

    The saved model is reloaded from ``client/<key>/``, ``num_examples``
    examples are generated and written to
    ``client/<key>/exports/<original name>_<version>.csv`` (first unused
    version number), and that file is sent back.

    Args:
        key: Project key as returned by the upload endpoint (a uuid string).
        num_examples: Number of synthetic examples to generate (>= 1).

    Returns:
        FileResponse with the CSV on success; a JSONResponse with a
        ``message`` field and status 404 (unknown key, untrained model,
        missing CSV) or 422 (invalid ``num_examples``) otherwise.
    """
    # Keys are generated with uuid4; refuse anything else before building a
    # filesystem path from it.
    try:
        is_canonical = str(uuid.UUID(key)) == key
    except ValueError:
        is_canonical = False
    if not is_canonical:
        return JSONResponse(status_code=404, content={"message": "Project key not found"})

    if num_examples < 1:
        return JSONResponse(status_code=422, content={"message": "num_examples must be a positive integer"})

    project_path = os.path.join("client", key)
    model_path = os.path.join(project_path, "model.pt")
    dgan_config_path = os.path.join(project_path, "dgan_config.json")
    encoding_mappings_path = os.path.join(project_path, "encoding_mappings.pkl")

    # All three artefacts written by DGANER.save must be present.
    required_paths = (model_path, dgan_config_path, encoding_mappings_path)
    if not all(os.path.exists(path) for path in required_paths):
        return JSONResponse(status_code=404, content={"message": "Model is not trained yet or missing files"})

    # The original upload determines the export name (exports live in a
    # sub-folder, so they are not picked up by this listing).
    original_csv = next(
        (name for name in sorted(os.listdir(project_path)) if name.endswith('.csv')),
        None,
    )
    if original_csv is None:
        return JSONResponse(status_code=404, content={"message": "Original CSV file not found"})

    exports_path = os.path.join(project_path, "exports")
    os.makedirs(exports_path, exist_ok=True)

    # Pick the first version number whose file does not exist yet.
    base_name = original_csv.rsplit('.', 1)[0]
    version = 1
    new_filename = f"{base_name}_{version}.csv"
    while os.path.exists(os.path.join(exports_path, new_filename)):
        version += 1
        new_filename = f"{base_name}_{version}.csv"
    export_path = os.path.join(exports_path, new_filename)

    # Reload the trained model together with its config and mappings.
    original_csv_path = os.path.join(project_path, original_csv)
    dganer = DGANER(file_path=original_csv_path, main_config="load_mode", project_directory_path=project_path)

    dganer.generate_synthetic_data_csv(export_path, num_examples=num_examples)

    return FileResponse(path=export_path, filename=new_filename, media_type="text/csv")

# NGROK

In [None]:
# Debug aid: inspect the encoding mappings saved for one specific project
# (the key in the path is hard-coded and only exists on the author's runtime).
# Caution: pickle.load executes code embedded in the file; only open files
# written by this application.
with open("client/8d49e029-4b37-4a0b-981f-01f32760bbb8/encoding_mappings.pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)

In [None]:
# Category values recorded for that project's "notes" column (index == ordinal code).
data["notes"]

array(['COVID_lockdown', 'vacation', 'weekday', 'weekend'], dtype=object)

In [None]:
# Serve `app` on port 8000 from inside the notebook.
# nest_asyncio.apply() is presumably needed so uvicorn.run can start while
# the notebook's own event loop is already running - confirm.
nest_asyncio.apply()
uvicorn.run(app, port=8000)

In [None]:
# Expose local port 8000 through ngrok; the auth token is read from the
# Colab secret named NGROK_AUTHTOKEN rather than being hard-coded.
# NOTE(review): the next cell's recorded output ('Task-1') suggests this call
# returned an asyncio Task rather than a ready listener - it may need to be
# awaited; confirm.
# ngrok_tunnel = ngrok.connect(8000)
ngrokou = ngrok.forward(8000, authtoken=userdata.get('NGROK_AUTHTOKEN'))

In [None]:
# Show the name of the object returned by ngrok.forward above.
ngrokou.get_name()

'Task-1'