## Version-2

In [4]:
%%writefile streamlit_app.py
import asyncio
import base64
import os
import re
from io import BytesIO
from io import StringIO

import numpy as np
import pandas as pd
import pingouin as pg
import streamlit as st
from factor_analyzer import FactorAnalyzer
from openai import AsyncOpenAI
from openai import OpenAI
from scipy.stats import ks_2samp

# Set page config with page title and layout
st.set_page_config(page_title='DataWeave', layout='wide')

# Location of the logo shown next to the title. The path is machine specific,
# so the image is only rendered when the file is actually present.
LOGO_PATH = '/Users/maniveena/Desktop/Gen_AI/Synthetic DATA/logo.png'

# Adjust the layout to move the title up
col_logo, col_title = st.columns([1, 3])

with col_logo:
    # Skipping a missing logo keeps the rest of the page usable on machines
    # that do not have the file at LOGO_PATH.
    if os.path.isfile(LOGO_PATH):
        st.image(LOGO_PATH, width=200)

with col_title:
    st.markdown("<h1 style='text-align: center;'>DataWeave - Synthetic Data Generator</h1>", unsafe_allow_html=True)
    st.markdown("<p style='text-align: center;'>Generate synthetic data based on your custom requirements with AI.</p>", unsafe_allow_html=True)

# Initialize session state variables if they don't exist.
# Both lists are kept index-aligned: entry i of each describes field i.
for _state_key in ('field_names', 'field_types'):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = []

# Field types offered in the manual specification form
options = [
    'Text', 'integer', 'Float', 'Date', 'Boolean', 'Enum', 'Email',
    'IPAddress', 'URL', 'PhoneNumber', 'Currency', 'TextBlob', 'Name',
    'Country', 'City', 'State', 'ZipCode', 'Latitude', 'Longitude'
]

# Domains offered in both the manual and the CSV upload form
domains = [
    'Medicine', 'Finance', 'E-commerce', 'Logistics', 'Travel',
    'Social Media', 'Education', 'Real Estate', 'Gaming',
    'Automotive', 'Energy', 'Technology', 'Retail', 'Entertainment',
    'Telecommunications'
]

# Define add_field function to add a new field to the session state
def add_field():
    """Append one blank entry to both field lists kept in the session state."""
    for state_key in ('field_names', 'field_types'):
        st.session_state[state_key].append('')

# Custom CSS to create boxes around sections.
# The stylesheet is kept in a named constant and injected once per script run.
_BOX_CSS = """
    <style>
    .box {
        border: none;
        border-radius: 5px;
        padding: 10px;
        margin: 10px 0;
    }
    </style>
    """
st.markdown(_BOX_CSS, unsafe_allow_html=True)
# DEFINING ALL THE RELEVANT FUNCTIONS

# Function to capture and validate user inputs
def capture_and_validate_inputs():
    """Read the manual specification out of the Streamlit session state.

    Returns:
        A ``(field_names, field_types, domain, description)`` tuple. Domain and
        description fall back to ``'Not Specified'`` when their keys are absent.
        All four entries are ``None`` when either field list is missing from
        the session state.
    """
    state = st.session_state

    # Guard clause: without both lists there is nothing to capture.
    if not ('field_names' in state and 'field_types' in state):
        print("Required inputs are not fully provided.")
        return None, None, None, None

    captured = (
        state['field_names'],
        state['field_types'],
        state.get('domain_info', 'Not Specified'),
        state.get('description_info', 'Not Specified'),
    )

    # Echo what was captured to the server console for validation.
    print("Captured User Inputs:")
    labels = ("Field Names:", "Field Types:", "Domain:", "Description:")
    for label, value in zip(labels, captured):
        print(label, value)

    return captured
    
# Function to capture the input and display it on the website
def display_formatted_data(field_names, field_types, domain, description):
    """Show the captured specification on the page in three columns.

    The first column holds a table of field names and types (or a placeholder
    message when either list is empty); the second and third columns show the
    domain and the description.
    """
    st.subheader("Captured Data for Validation")

    # Relative widths: the field table gets three times the space of the others.
    fields_col, domain_col, description_col = st.columns([3, 1, 1])

    with fields_col:
        st.markdown("**Field Names and Types**")
        if not (field_names and field_types):
            st.write("No field data captured.")
        else:
            fields_table = pd.DataFrame({'Field Name': field_names, 'Field Type': field_types})
            header_style = [{'selector': 'th', 'props': [('max-width', '200px')]}]
            st.dataframe(fields_table.style.set_table_styles(header_style), width=500)

    with domain_col:
        st.markdown("**Domain**")
        st.write(domain)

    with description_col:
        st.markdown("**Description**")
        st.write(description)
        
# Function to construct the prompt for the LLM
def construct_prompt(field_names, field_types, domain, description, num_rows=None):
    """Build the instruction text sent to the LLM.

    Args:
        field_names: Column names for the table. Non-string names are
            converted with ``str`` so they can be joined safely.
        field_types: Type label for each column, paired with ``field_names``
            by position.
        domain: Subject area the data should belong to.
        description: Free-text description of the desired data.
        num_rows: Optional number of rows to request. When ``None`` no row
            count is added to the prompt.

    Returns:
        The prompt as a single string: the table request followed by a numbered
        list of rules, one rule per line.
    """
    names = [str(name) for name in field_names]
    column_list = ", ".join(names)
    typed_columns = ", ".join(
        f"{name} ({f_type})" for name, f_type in zip(names, field_types)
    )

    # Specify the tabular format in the prompt
    request = [
        f"Create a table of data in the domain of {domain}. ",
        f"The table should have the following columns: {column_list}. ",
        f"Each column should follow the type specified: {typed_columns}. ",
        f"Description: {description}. ",
    ]
    if num_rows is not None:
        # Rule 1 below refers to a user-specified row count, so state it here.
        request.append(f"Generate exactly {num_rows} rows of data. ")
    request.append("Format the data as a table with each row representing a unique entry.\n\n")

    # Every rule ends with a newline so consecutive rules never run together.
    rules = [
        "These are STRICT RULES which you should never violate:\n",
        "1. Only generate tabular data AND make sure you generate data based on the number of rows the USER SPECIFIED. Not more data or less data.\n",
        "2. DO NOT add any TEXT whatsoever apart from the data you generate. for ex \"Note: This table shows only a few sample entries. You can add more rows to complete the dataset.\"\n",
        "3. REMEMBER the data you generated will be converted to TXT, JSON, or CSV as per the user requirement. So Any unwanted data will cause unnecessary data conversion problems. YOU SHOULD ENSURE your data COMPLIES to the above 2 rules.\n",
        "4. When you generate personal data like Email, IPAddress, URL, PhoneNumber, Zip Code etc. ENSURE THAT they are FAKE i.e. by giving made up domains, URLs etc.\n",
        "5. THE MOST IMPORTANT RULE: Format the data as a table with columns separated by '|' and rows separated by new lines.\n\n",
    ]

    return "".join(request) + "".join(rules)




# Function to call OpenAI API to generate synthetic data

# The key is taken from the OPENAI_API_KEY environment variable so that real
# credentials do not have to be written into this file. The placeholder is
# only a fallback and must be replaced (or the variable set) before calling
# the API.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'your-api-key-xxxxxxxxx-xxx234356%%&*^')

# Asynchronous client used by generate_synthetic_data
client = AsyncOpenAI(api_key=OPENAI_API_KEY)

async def generate_synthetic_data(prompt, model="gpt-4-1106-preview"):
    """Ask the chat-completions endpoint for synthetic data.

    Args:
        prompt: Instruction text sent as a single user message.
        model: Name of the chat model to call.

    Returns:
        The text content of the first choice in the response. Any failure,
        including an empty prompt, is reported as a string starting with
        ``"An error occurred while generating data: "`` instead of being
        raised, so callers must inspect the returned text.
    """
    try:
        # Ensure the prompt is not empty
        if not prompt:
            raise ValueError("Prompt is empty")

        response = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberately broad: this is the boundary to the remote service and
        # callers rely on receiving a string rather than an exception.
        return f"An error occurred while generating data: {e}"

# Function to parse the output of LLM


def parse_llm_output(output):
    """Turn a pipe-delimited table produced by the LLM into a DataFrame.

    The first non-blank line is the header. A Markdown separator row
    (``|---|---|``) directly below the header is skipped when present; when it
    is absent the second line is kept as data. Blank lines and code-fence
    lines are ignored, and rows shorter than the header are padded with
    ``None``.

    Args:
        output: Raw text returned by the model. ``None`` or an empty string
            yields an empty DataFrame.

    Returns:
        A DataFrame whose cells are the stripped cell texts.

    Raises:
        ValueError: Raised by pandas when a row has more cells than the header.
    """
    def split_row(line):
        # Remove at most one border pipe per side so that empty first/last
        # cells are not swallowed together with the border.
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        return [cell.strip() for cell in line.split('|')]

    rows = []
    for raw_line in (output or '').splitlines():
        # Strip whitespace first: a trailing space after the last pipe would
        # otherwise leave that pipe in place and create a phantom column.
        line = raw_line.strip()
        if not line or line.startswith('```'):
            continue
        rows.append(split_row(line))

    if not rows:
        return pd.DataFrame()

    headers, *data = rows

    # Only drop the line under the header when it really is a separator row.
    if data and all(re.fullmatch(r':?-+:?', cell) for cell in data[0]):
        data = data[1:]

    width = len(headers)
    data = [row + [None] * (width - len(row)) for row in data]

    return pd.DataFrame(data, columns=headers)



# Function to convert data to different formats
def convert_data_format(data, format_type):
    """Serialise a DataFrame to bytes in the requested download format.

    Args:
        data: DataFrame to serialise.
        format_type: One of ``'CSV'``, ``'JSON'``, ``'Excel'`` or ``'TXT'``.
            ``'TXT'`` is written as tab-separated values.

    Returns:
        The serialised table as ``bytes``.

    Raises:
        ValueError: If ``format_type`` is not one of the supported names.
    """
    if format_type == 'Excel':
        # The workbook is written into an in-memory buffer.
        buffer = BytesIO()
        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
            data.to_excel(writer, index=False)
        buffer.seek(0)
        return buffer.getvalue()

    if format_type == 'JSON':
        return data.to_json(orient='records').encode('utf-8')

    if format_type in ('CSV', 'TXT'):
        separator = ',' if format_type == 'CSV' else '\t'
        return data.to_csv(index=False, sep=separator).encode('utf-8')

    raise ValueError("Unsupported format type")


# Function to download data in selected format
def modified_download_button(data, format_type, file_name):
    """Render a download button for already-converted data.

    The button is only drawn when the session-state flag
    ``show_download_button_<format_type>`` is truthy.

    Args:
        data: Payload handed to the download button.
        format_type: ``'Excel'``, ``'CSV'``, ``'JSON'`` or ``'TXT'``; selects
            the MIME type.
        file_name: Name suggested for the downloaded file.

    Raises:
        ValueError: If ``format_type`` is not one of the supported names.
    """
    mime_by_format = {
        'Excel': "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        'CSV': "text/csv",
        'JSON': "application/json",
        'TXT': "text/plain",
    }
    mime_type = mime_by_format.get(format_type)
    if mime_type is None:
        raise ValueError("Unsupported format type")

    # Check for the state to show the download button
    flag_key = f"show_download_button_{format_type}"
    if not st.session_state.get(flag_key, False):
        return

    st.download_button(
        label=f"Download data as {format_type}",
        data=data,
        file_name=file_name,
        mime=mime_type
    )
def analyze_data(data):
    """Compute summary statistics for a DataFrame.

    Returns:
        A dict mapping a report title to a DataFrame. Depending on the columns
        present it may contain descriptive and distribution (skewness,
        kurtosis) statistics for numeric fields, descriptive statistics for
        object/category fields, and a correlation matrix when there is more
        than one numeric column. If anything fails, a dict with the single key
        ``"Error"`` and a message string is returned instead.
    """
    try:
        findings = {}

        # Descriptive statistics and distribution checks for numeric fields
        numeric_part = data.select_dtypes(include=[np.number])
        if not numeric_part.empty:
            findings['Descriptive Statistics for Numeric Fields'] = numeric_part.describe()
            shape_stats = [
                numeric_part.skew().to_frame(name='Skewness'),
                numeric_part.kurtosis().to_frame(name='Kurtosis'),
            ]
            findings['Distribution Statistics for Numeric Fields'] = pd.concat(shape_stats, axis=1)

        # Descriptive statistics for categorical fields
        label_part = data.select_dtypes(include=['object', 'category'])
        if not label_part.empty:
            findings['Descriptive Statistics for Categorical Fields'] = label_part.describe()

        # Correlation matrix needs at least two numeric columns
        if numeric_part.shape[1] > 1:
            findings['Correlation Matrix'] = numeric_part.corr()
    except Exception as e:
        print("Error in analyze_data:", e)
        return {"Error": f"Error occurred during data analysis: {e}"}
    else:
        return findings
    
def calculate_icc(df, rating_col, group_col, rater_col):
    """Return pingouin's intraclass-correlation result rounded to 3 decimals.

    ``group_col`` is passed as the targets column, ``rater_col`` as the raters
    column and ``rating_col`` as the ratings column.
    """
    return pg.intraclass_corr(
        data=df,
        targets=group_col,
        raters=rater_col,
        ratings=rating_col,
    ).round(3)


def perform_factor_analysis(df, num_factors):
    """Fit an unrotated principal factor model on the numeric columns.

    Returns:
        A DataFrame of factor loadings indexed by numeric column name, or the
        string ``"Not enough numeric columns for factor analysis."`` when there
        are fewer numeric columns than requested factors.
    """
    numeric_only = df.select_dtypes(include=[np.number])

    if len(numeric_only.columns) < num_factors:
        return "Not enough numeric columns for factor analysis."

    analyzer = FactorAnalyzer(n_factors=num_factors, method='principal', rotation=None)
    analyzer.fit(numeric_only)
    loadings = pd.DataFrame(analyzer.loadings_, index=numeric_only.columns)
    return loadings

def distribution_comparison(df1, df2):
    """Run a two-sample Kolmogorov-Smirnov test on each shared numeric column.

    A column is compared when it exists in both frames and is numeric (but not
    boolean) in ``df1``. Values from both frames are coerced to numbers, so a
    ``df2`` column holding numeric text or a nullable integer type is still
    usable; entries that cannot be converted are dropped. Columns left without
    any value on either side are skipped.

    Args:
        df1: Reference DataFrame; its column order determines the result order.
        df2: DataFrame to compare against ``df1``.

    Returns:
        A DataFrame with one column per compared field and the rows
        ``'KS Statistic'`` and ``'P-Value'``. It is empty when no column
        qualifies.
    """
    results = {}

    # Iterate in df1's column order so the output is deterministic.
    for column in df1.columns:
        if column not in df2.columns:
            continue

        reference = df1[column]
        if pd.api.types.is_bool_dtype(reference) or not pd.api.types.is_numeric_dtype(reference):
            continue

        sample1 = pd.to_numeric(reference, errors='coerce').dropna().astype(float)
        sample2 = pd.to_numeric(df2[column], errors='coerce').dropna().astype(float)

        # ks_2samp cannot work with an empty sample.
        if sample1.empty or sample2.empty:
            continue

        stat, p_value = ks_2samp(sample1, sample2)
        results[column] = {'KS Statistic': stat, 'P-Value': p_value}

    return pd.DataFrame(results)

def analyze_data_with_additional_tests(original_data, synthetic_data, num_factors=5):
    """Analyse an uploaded dataset together with its synthetic counterpart.

    Each analysis step runs on its own, so a failure in one of them (for
    example the factor analysis) does not discard the results of the others.

    Args:
        original_data: The uploaded reference DataFrame.
        synthetic_data: The generated DataFrame.
        num_factors: Number of factors requested from the factor analysis.

    Returns:
        A dict keyed by report title. Values are whatever the individual step
        returned, or an error message string for a step that raised.
    """
    # The ICC analysis is not part of the pipeline: it needs dedicated
    # rating / group / rater columns that arbitrary uploads do not have.
    steps = (
        ('Original Data Analysis', lambda: analyze_data(original_data)),
        ('Synthetic Data Analysis', lambda: analyze_data(synthetic_data)),
        ('Factor Analysis', lambda: perform_factor_analysis(df=original_data, num_factors=num_factors)),
        ('Distribution Comparison', lambda: distribution_comparison(original_data, synthetic_data)),
    )

    reports = {}
    for title, run_step in steps:
        try:
            reports[title] = run_step()
        except Exception as e:
            # Broad on purpose: the statistics libraries raise many different
            # exception types and one bad step should not hide the others.
            print(f"Error in combined analysis ({title}):", e)
            reports[title] = f"Error occurred during combined data analysis: {e}"

    return reports


    

# Maps a normalised user-facing field type (lower case, no spaces) to the
# pandas dtype family the column should be converted to.
type_mapping = {
    'text': 'object',
    'integer': 'int64',
    'float': 'float64',
    'date': 'datetime64',
    'boolean': 'bool',
    'enum': 'category',
    'email': 'object',
    'ipaddress': 'object',
    'url': 'object',
    'phonenumber': 'object',
    'currency': 'float64',  # Assuming currency values are to be treated as numerical values
    'textblob': 'object',
    'name': 'object',
    'country': 'object',
    'city': 'object',
    'state': 'object',
    'zipcode': 'object',
    'latitude': 'float64',
    'longitude': 'float64'
}

# Spellings recognised when turning cell text into booleans
_BOOLEAN_LITERALS = {
    'true': True, 't': True, 'yes': True, 'y': True, '1': True, '1.0': True,
    'false': False, 'f': False, 'no': False, 'n': False, '0': False, '0.0': False,
}


def _to_boolean(series):
    """Convert a column to the nullable boolean dtype by reading its text."""
    if pd.api.types.is_bool_dtype(series):
        return series
    normalized = series.astype(str).str.strip().str.lower()
    # Unrecognised text becomes missing rather than silently True.
    parsed = [_BOOLEAN_LITERALS.get(value, pd.NA) for value in normalized]
    return pd.Series(pd.array(parsed, dtype='boolean'), index=series.index)


def standardize_and_convert_data_types(df, field_names, field_types):
    """Convert the named columns of ``df`` to the dtypes implied by their types.

    Conversion is best effort: a column whose type has no mapping, or whose
    conversion raises, is left unchanged and a message is printed.

    Args:
        df: DataFrame to convert; it is modified in place.
        field_names: Column names to convert.
        field_types: User-facing type label for each name, matched by position.
            Labels are compared case-insensitively with spaces removed.

    Returns:
        The same DataFrame object that was passed in.
    """
    for field_name, user_type in zip(field_names, field_types):
        # str() keeps non-string labels (e.g. NaN read from a CSV) from crashing.
        type_key = str(user_type).lower().replace(" ", "")
        standardized_type = type_mapping.get(type_key)
        if not standardized_type:
            print(f"No mapping found for type '{user_type}', field '{field_name}' will remain unchanged.")
            continue

        try:
            if standardized_type == 'int64':
                df[field_name] = pd.to_numeric(df[field_name], errors='coerce').astype('Int64')
            elif standardized_type == 'float64':
                df[field_name] = pd.to_numeric(df[field_name], errors='coerce').astype('float')
            elif standardized_type == 'datetime64':
                df[field_name] = pd.to_datetime(df[field_name], errors='coerce')
            elif standardized_type == 'bool':
                # astype('bool') would turn every non-empty string, including
                # "False", into True; parse the text instead.
                df[field_name] = _to_boolean(df[field_name])
            elif standardized_type == 'category':
                df[field_name] = df[field_name].astype('category')
            # 'object' columns are intentionally left as they are.
        except Exception as e:
            print(f"Error converting {field_name} to {standardized_type}: {e}")
    return df

def infer_field_details_from_dataset(df):
    """Derive field names and user-facing field types from a DataFrame.

    Types are decided from dtype families rather than exact dtype names, so
    nullable, unsigned and timezone-aware dtypes are recognised as well.

    Args:
        df: DataFrame to inspect.

    Returns:
        A ``(field_names, field_types)`` pair of lists. Each type is one of
        ``'integer'``, ``'float'``, ``'boolean'``, ``'date'``, ``'enum'`` or
        ``'text'``; anything unrecognised (including timedeltas) is ``'text'``.
    """
    type_checks = pd.api.types

    def label_for(dtype):
        # Categorical is tested first because pandas' boolean check looks at a
        # categorical's categories rather than at the column itself.
        if isinstance(dtype, pd.CategoricalDtype):
            return 'enum'
        # Booleans come before the numeric checks.
        if type_checks.is_bool_dtype(dtype):
            return 'boolean'
        if type_checks.is_integer_dtype(dtype):
            return 'integer'
        if type_checks.is_float_dtype(dtype):
            return 'float'
        if type_checks.is_datetime64_any_dtype(dtype):
            return 'date'
        return 'text'

    field_names = df.columns.tolist()
    field_types = [label_for(dtype) for dtype in df.dtypes]

    return field_names, field_types



# Create two blocks for Manual Data Specification and Upload CSV File
col_manual_data_spec, col_csv_upload = st.columns([3, 2])

with col_manual_data_spec:
    st.markdown("<div class='box'>", unsafe_allow_html=True)
    st.subheader('Manual Data Specification')

    # The button lives outside the form so a click reruns the script
    # immediately and the new (blank) field row appears.
    if st.button('Add Field'):
        add_field()

    with st.form(key='manual_data_form'):
        saved_names = st.session_state['field_names']
        saved_types = st.session_state['field_types']

        # At least one field is required for a meaningful specification.
        all_fields_filled = bool(saved_names)
        for i, (saved_name, saved_type) in enumerate(zip(saved_names, saved_types)):
            col1, col2 = st.columns([3, 2])
            with col1:
                field_name = st.text_input(f'Field Name {i+1}', value=saved_name, key=f'field_name_{i}')
                saved_names[i] = field_name
                if not field_name:
                    all_fields_filled = False
            with col2:
                # Fall back to the first option for a field whose type is still blank.
                default_index = options.index(saved_type) if saved_type in options else 0
                field_type = st.selectbox(f'Field Type {i+1}', options, index=default_index, key=f'field_type_{i}')
                saved_types[i] = field_type
                if not field_type:
                    all_fields_filled = False

        domain = st.selectbox('Domain Info (Mandatory)', [''] + domains, key='domain_info')
        description = st.text_area('Description (Mandatory)', key='description_info')
        if not domain or not description:
            all_fields_filled = False

        # The button must stay enabled: values of widgets inside a form only
        # reach the script when the form is submitted, so a button disabled
        # based on those values could never become clickable.
        submit_button = st.form_submit_button('Generate Data')

    # Validate after submission instead, and treat an incomplete form as not submitted.
    if submit_button and not all_fields_filled:
        st.warning("Please add at least one field and fill in every field name, the domain and the description.")
        submit_button = False
        

# Holds the parsed upload once a CSV has been read; stays None until then.
csv_data = None

with col_csv_upload:
    st.markdown("<div class='box'>", unsafe_allow_html=True)
    st.subheader('Upload CSV File')

    # Same domain choices as the manual form, with a leading blank entry.
    csv_domain_choices = [''] + domains

    # All upload inputs live in one form so they are submitted together.
    with st.form(key='csv_upload_form'):
        uploaded_file = st.file_uploader(
            "Drag and drop CSV file here",
            type=['csv'],
            key='uploaded_file',
        )
        domain_csv = st.selectbox(
            'Domain Info (Mandatory)',
            csv_domain_choices,
            key='domain_csv_info',
        )
        description_csv = st.text_area(
            'Description (Mandatory)',
            key='description_csv_info',
        )
        submit_csv_button = st.form_submit_button('Generate Data from CSV')

def _render_report(title, report, level=3):
    """Render one analysis result; nested dicts become nested headings."""
    st.markdown(f"{'#' * level} {title}")
    if isinstance(report, dict):
        for sub_title, sub_report in report.items():
            _render_report(sub_title, sub_report, level + 1)
    elif isinstance(report, pd.DataFrame):
        st.table(report)
    else:
        st.write(report)


# Make sure every download flag exists before it is read further down.
for _format_name in ('JSON', 'CSV', 'Excel', 'TXT'):
    _flag_key = f'show_download_button_{_format_name}'
    if _flag_key not in st.session_state:
        st.session_state[_flag_key] = False

manual_requested = bool(submit_button)
csv_requested = bool(submit_csv_button and uploaded_file is not None)

if manual_requested or csv_requested:
    field_names, field_types = [], []
    # Set only when a real dataset (not a metadata file) was uploaded; it is
    # then used as the reference for the comparison reports.
    reference_data = None

    # Manual data specification form submitted
    if manual_requested:
        field_names, field_types, domain, description = capture_and_validate_inputs()

    # CSV upload form submitted
    if csv_requested:
        try:
            csv_data = pd.read_csv(uploaded_file)
            # A metadata file is recognised by its 'Field Type' column; its
            # first two columns hold the field names and field types.
            if 'Field Type' in csv_data.columns:
                metadata = csv_data.iloc[:, :2].dropna()
                field_names = metadata.iloc[:, 0].astype(str).tolist()
                field_types = metadata.iloc[:, 1].astype(str).tolist()
            else:
                field_names, field_types = infer_field_details_from_dataset(csv_data)
                field_names = [str(name) for name in field_names]
                reference_data = csv_data
            # Both kinds of upload use the values entered in the CSV form.
            domain = domain_csv
            description = description_csv
        except Exception as e:
            st.error(f"An error occurred while reading the CSV file: {e}")
            field_names, field_types = [], []

    synthetic_df = pd.DataFrame()
    analysis_reports = {}

    # Construct the prompt and generate synthetic data (for either form)
    if field_names and field_types:
        prompt = construct_prompt(field_names, field_types, domain, description)
        synthetic_data = asyncio.run(generate_synthetic_data(prompt))

        # generate_synthetic_data reports failures as text; show them instead
        # of trying to parse the message as a table.
        if isinstance(synthetic_data, str) and synthetic_data.startswith("An error occurred while generating data:"):
            st.error(synthetic_data)
        else:
            try:
                synthetic_df = parse_llm_output(synthetic_data)
                synthetic_df = standardize_and_convert_data_types(synthetic_df, field_names, field_types)
            except Exception as e:
                st.error(f"Error parsing LLM output: {e}")
                synthetic_df = pd.DataFrame()

        # Perform statistical analysis on the generated data
        if synthetic_df.empty:
            st.write("No synthetic data available for analysis.")
        elif reference_data is not None:
            # Analysis for dataset uploads
            analysis_reports = analyze_data_with_additional_tests(reference_data, synthetic_df)
        else:
            # Analysis for metadata uploads and manual specifications
            analysis_reports = analyze_data(synthetic_df)
    else:
        st.warning("No fields were provided, so no data was generated.")

    # Keep the results across reruns: choosing a download format reruns the
    # script with both submit flags reset.
    st.session_state['generated_df'] = synthetic_df
    st.session_state['generated_reports'] = analysis_reports

generated_df = st.session_state.get('generated_df')
if generated_df is not None and not generated_df.empty:
    for report_title, report_body in st.session_state.get('generated_reports', {}).items():
        _render_report(report_title, report_body)

    # Let the user choose the download format
    download_format = st.selectbox('Select the format for download', ['CSV', 'JSON', 'Excel', 'TXT'])
    try:
        converted_data = convert_data_format(generated_df, download_format)
    except Exception as e:
        converted_data = None
        st.error(f"Failed to convert data to the selected format. ({e})")

    if converted_data is not None:
        # Set the session state to show the download button for the selected format
        st.session_state[f'show_download_button_{download_format}'] = True
        modified_download_button(converted_data, download_format, f"synthetic_data.{download_format.lower()}")
    else:
        st.session_state[f'show_download_button_{download_format}'] = False

# Run this from the command line:
# streamlit run streamlit_app.py


Overwriting streamlit_app.py
