# Generating integer table for easy comparison
Fewer scenarios (no scenario B). And only integers.

In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
# Careful we're transposing the dataframe so output is different as previously
def generate_custom_data(base_values, trend_multipliers):
    data = {}
    metrics = ['units_sold', 'purchasers', 'generated_turnover', 'average_basket']
    
    for country, multipliers in trend_multipliers.items():
        country_data = {}
        for metric in metrics:
            base = base_values[metric]
            multiplier = multipliers.get(metric, 1)
            if metric in ['units_sold', 'purchasers']:
                # Generate integers directly
                country_data[metric] = np.random.randint(int(base * 0.8 * multiplier), int(base * 1.2 * multiplier))
            elif metric == 'generated_turnover':
                # Calculate turnover based on previously generated units_sold
                unit_price = 10
                turnover = country_data['units_sold'] * unit_price
                country_data[metric] = round(np.random.normal(loc=turnover, scale=turnover * 0.05), 2)
            elif metric == 'average_basket':
                # Calculate average basket size based on previously generated values
                if country_data['purchasers'] > 0:
                    avg_basket = country_data['generated_turnover'] / country_data['purchasers']
                    country_data[metric] = round(np.random.normal(loc=avg_basket, scale=avg_basket * 0.1), 2)
        data[country] = country_data

        # Ensuring 'units_sold' is not less than 'purchasers'
        if data[country]['units_sold'] < data[country]['purchasers']:
            data[country]['units_sold'], data[country]['purchasers'] = data[country]['purchasers'], data[country]['units_sold']
            # modifying average basket 
            print("avg_basket BEFORE: ", avg_basket)
            avg_basket = data[country]['generated_turnover'] / data[country]['purchasers']
            print("avg_basket AFTER: ", avg_basket)
            data[country]['average_basket'] = round(np.random.normal(loc=avg_basket, scale=avg_basket * 0.1), 2)

    # Convert the dictionary to a DataFrame and transpose it
    df = pd.DataFrame(data).T

    # Correct data types
    df['units_sold'] = df['units_sold'].astype(int)
    df['purchasers'] = df['purchasers'].astype(int)
    df['generated_turnover'] = df['generated_turnover'].astype(int)
    df['average_basket'] = df['average_basket'].astype(int)
    # Changed 2 lines above to have integers everywhere.

    return df

### TEST 
base_values = {'units_sold': 5000, 'purchasers': 1500, 'generated_turnover': 50000, 'average_basket': 20}
trend_multipliers = {
    'France': {'units_sold': 1.1, 'purchasers': 1.1, 'generated_turnover': 1.1, 'average_basket': 1.1},
    'Spain': {'units_sold': 1.2, 'purchasers': 1.2, 'generated_turnover': 1.2, 'average_basket': 1.2},
    'Italy': {'units_sold': 0.9, 'purchasers': 0.9, 'generated_turnover': 0.9, 'average_basket': 0.9}
}
df = generate_custom_data(base_values, trend_multipliers)
df

Unnamed: 0,units_sold,purchasers,generated_turnover,average_basket
France,6053,1512,63492,51
Spain,5833,1717,53905,28
Italy,5096,1476,54186,34


In [2]:
def adjust_base_values(digits_units, digits_purchasers):
    return {
        'units_sold': 5 * 10**(digits_units-1),     # on multiplie par 5 pour centrer et éviter d'avoir 1 digit de moins lors de la génération lorsqu'on multiplie par 0.8 par exemple...
        'purchasers': 5 * 10**(digits_purchasers-1),
        'generated_turnover': 50000,
        'average_basket': 20
    }

scenarios = {
    "Scenario1integer": (1, 1),
    "Scenario2integer": (2, 2),
    "Scenario3integer": (3, 3),
    "Scenario4integer": (4, 4),
    "Scenario5integer": (5, 5),
    "Scenario6integer": (6, 6),
    "Scenario7integer": (7, 7)
}

multiplier_ranges = {
    'units_sold': (0.8, 1.2),
    'purchasers': (0.8, 1.2),
    'generated_turnover': (0.5, 2.0),
    'average_basket': (0.8, 1.5)
}

In [3]:
import os
os.makedirs('data/multiCSV_integers', exist_ok=True)

for scenario_name, (digits_units, digits_purchasers) in scenarios.items():
    base_values = adjust_base_values(digits_units, digits_purchasers)
    trend_multipliers = {
        'France': {metric: np.random.uniform(*multiplier_ranges[metric]) for metric in ['units_sold', 'purchasers', 'generated_turnover', 'average_basket']},
        'Spain': {metric: np.random.uniform(*multiplier_ranges[metric]) for metric in ['units_sold', 'purchasers', 'generated_turnover', 'average_basket']},
        'Italy': {metric: np.random.uniform(*multiplier_ranges[metric]) for metric in ['units_sold', 'purchasers', 'generated_turnover', 'average_basket']}
    }
    df = generate_custom_data(base_values, trend_multipliers)
    df.to_csv(f'data/multiCSV_integers/{scenario_name}.csv')

avg_basket BEFORE:  6.381836734693877
avg_basket AFTER:  9.19735294117647
avg_basket BEFORE:  6.968541033434651
avg_basket AFTER:  9.8186295503212
avg_basket BEFORE:  8.261881100266192
avg_basket AFTER:  9.564601951720595
avg_basket BEFORE:  7.298315044247788
avg_basket AFTER:  10.02808365758755
avg_basket BEFORE:  8.807306714877795
avg_basket AFTER:  10.878053507478407
avg_basket BEFORE:  7.656871738829287
avg_basket AFTER:  10.095571790969052
avg_basket BEFORE:  9.628057426444522
avg_basket AFTER:  10.188212764050034
avg_basket BEFORE:  6.432269006860725
avg_basket AFTER:  9.453469245495818


In [4]:
# Directory containing the CSV files
directory = 'data/multiCSV_integers'
data_frames = {}    # Initialize an empty dictionary to hold the DataFrames

# Iterate over each file in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)   # Construct the full file path
    df = pd.read_csv(file_path, index_col=0)        # Read the CSV file into a DataFrame, don't need to use with open()
    scenario_name = filename.replace('.csv', '').replace('_', ' ')  # Extract scenario name from the filename, e.g., 'Scenario1.csv' -> 'Scenario 1'
    data_frames[scenario_name] = df     # Add the DataFrame to the dictionary

In [5]:
# Saving to TXT
directoryT = "data/multiTXT_integers"
os.makedirs(directoryT, exist_ok=True)  # Ensure the directory exists

for key, frame in data_frames.items():
    table_string = frame.to_string(index=True)
    file_path = os.path.join(directoryT, f"{key}.txt")
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(table_string)

## Generating target

In [9]:
prompts = [
    ##### Niveau 1: Récuparation
    ["How many clients are there in France?",1],    # name of variable is different, possible higher complexity
    ["How many purchasers are there in France?",1],
    ["What's the generated turnover of Spain?",1],
    ["What's the average basket in Italy?",1],
    ["How many units are sold in France?",1],
    # add more yes / no 
    # 5 more (1 additional above):
    # how many countries / variables / KPIs etc.
    ["How many units are sold in Spain?",1],
    ["What's the average basket in Spain?",1],
    ["How many columns are there?",1],  # changed
    ["How many countries are there?",1],
    ["How many KPIs are there?",1],

    ##### Niveau 2: Comparaison
    ["Which country has the lowest turnover", 2],
    ["Which country has the highest average basket", 2],
    ["Name the country with the highest number of purchasers", 2],
    ["Which country sold the most units?", 2],
    ["Can you rank the countries in descending order per turnover", 2],
    ["Rank the country names in descending order per turnover", 2],
    ["Do purchasers in France spend more than in Italy?", 2],   # changed to get rid of (answer by yes or no)
    # 3 more:
    ["Do purchasers in Italy spend more than in France?", 2],
    ["Is the generated turnover higher in Spain than in France?", 2],
    ["Did Italy sell more units than Spain?", 2],

    ##### Niveau 3: Opération élémentaire
    ["What is the total generated turnover across all countries?", 3],
    ["What is the total number of purchasers across all countries?", 3],
    ["What is the total number of units sold across all countries?", 3],
    ["What is the average generated turnover across all countries?", 3],
    ["What is the average number of purchasers across all countries?", 3],
    ["What is the average basket size across all countries?", 3],
    ["How many more clients are there in Italy compared to France", 3],
    #############################
    ["What's the absolute difference between number of purchasers in Italy compared to number of purchasers in France?", 3],
    ["Which is the average turnover per client in Italy", 3],
    ["Which is the average turnover per client in France?", 3],

    ##### Niveau 4: Opération complexe
    ["Which share of clients in France among all countries", 4],
    ["Do clients in France spend more on average than in Italy?", 4],
    ["What is the average number of articles each client buys", 4],
    ["What is the average number of articles each purchaser buys", 4]
]

In [10]:
# Final target calculation with reordered prompts

# Initialize the DataFrame for prompts and answers
targetDF = pd.DataFrame(columns=["Prompt", "Complexité"] + [f"Target {scenario}" for scenario in data_frames.keys()])

# Extend the DataFrame for new prompts
for promptL in prompts:
    targetDF.loc[len(targetDF), ['Prompt', 'Complexité']] = [promptL[0], promptL[1]]  # promptL[1] is "Complexité"

# Iterate through all the prompts to populate target columns
for idx, prompt_row in targetDF.iterrows():
    prompt = prompt_row['Prompt']
    for scenario, df_to_measure in data_frames.items():

        # Niveau 1: Récuparation
        if prompt == "How many clients are there in France?":
            value = df_to_measure.at['France', 'purchasers']

        elif prompt == "How many purchasers are there in France?":
            value = df_to_measure.at['France', 'purchasers']

        elif prompt == "What's the generated turnover of Spain?":
            value = df_to_measure.at['Spain', 'generated_turnover']

        elif prompt == "What's the average basket in Italy?":
            value = df_to_measure.at['Italy', 'average_basket']

        elif prompt == "How many units are sold in France?":
            value = df_to_measure.at['France', 'units_sold']

        elif prompt == "How many units are sold in Spain?":
            value = df_to_measure.at['Spain', 'units_sold']

        elif prompt == "What's the average basket in Spain?":
            value = df_to_measure.at['Spain', 'average_basket']

        elif prompt == "How many columns are there?":
            value = df_to_measure.shape[1]  # columns

        elif prompt == "How many countries are there?":
            value = df_to_measure.shape[0]  # Assuming that 'countries' are the rows of the DataFrame

        elif prompt == "How many KPIs are there?":
            value = df_to_measure.shape[1]  # Assuming that 'KPIs' are the columns of the DataFrame
        

        # Niveau 2: Comparaison
        elif prompt == "Which country has the lowest turnover":
            value = df_to_measure['generated_turnover'].idxmin()

        elif prompt == "Which country has the highest average basket":
            value = df_to_measure['average_basket'].idxmax()

        elif prompt == "Name the country with the highest number of purchasers":
            value = df_to_measure['purchasers'].idxmax()

        elif prompt == "Which country sold the most units?":
            value = df_to_measure['units_sold'].idxmax()

        elif prompt == "Can you rank the countries in descending order per turnover":
            value = df_to_measure['generated_turnover'].sort_values(ascending=False).index.tolist()

        elif prompt == "Rank the country names in descending order per turnover":
            value = df_to_measure['generated_turnover'].sort_values(ascending=False).index.tolist()

        elif prompt == "Do purchasers in France spend more than in Italy?":
            turnover_france = df_to_measure.at['France', 'generated_turnover']
            turnover_italy = df_to_measure.at['Italy', 'generated_turnover']
            value = "Yes" if turnover_france > turnover_italy else "No"

        elif prompt == "Do purchasers in Italy spend more than in France?":
            turnover_france = df_to_measure.at['France', 'generated_turnover']
            turnover_italy = df_to_measure.at['Italy', 'generated_turnover']
            value = "Yes" if turnover_france < turnover_italy else "No"

        elif prompt == "Is the generated turnover higher in Spain than in France?":
            turnover_france = df_to_measure.at['France', 'generated_turnover']
            turnover_spain = df_to_measure.at['Spain', 'generated_turnover']
            value = "Yes" if turnover_france < turnover_spain else "No"

        elif prompt == "Did Italy sell more units than Spain?":
            units_italy = df_to_measure.at['Italy', 'units_sold']
            units_spain = df_to_measure.at['Spain', 'units_sold']
            value = "Yes" if units_italy > units_spain else "No"

        # Niveau 3: Opération élémentaire
        elif prompt == "What is the total generated turnover across all countries?":
            value = int(df_to_measure['generated_turnover'].sum())

        elif prompt == "What is the total number of purchasers across all countries?":
            value = int(df_to_measure['purchasers'].sum())

        elif prompt == "What is the total number of units sold across all countries?":
            value = int(df_to_measure['units_sold'].sum())

        elif prompt == "What is the average generated turnover across all countries?":
            value = int(df_to_measure['generated_turnover'].mean())

        elif prompt == "What is the average number of purchasers across all countries?":
            value = int(df_to_measure['purchasers'].mean())

        elif prompt == "What is the average basket size across all countries?":
            value = int(df_to_measure['average_basket'].mean())

        elif prompt == "How many more clients are there in Italy compared to France":
            value = df_to_measure.at['Italy', 'purchasers'] - df_to_measure.at['France', 'purchasers']

        ###################################################
        elif prompt == "What's the absolute difference between number of purchasers in Italy compared to number of purchasers in France?":
            value = abs(df_to_measure.at['Italy', 'purchasers'] - df_to_measure.at['France', 'purchasers'])

        elif prompt == "Which is the average turnover per client in Italy":
            value = df_to_measure.at['Italy', 'generated_turnover'] / df_to_measure.at['Italy', 'purchasers']

        elif prompt == "Which is the average turnover per client in France?":
            value = df_to_measure.at['France', 'generated_turnover'] / df_to_measure.at['France', 'purchasers']

        # Niveau 4: Opération complexe
        elif prompt == "Which share of clients in France among all countries":
            total_clients = df_to_measure['purchasers'].sum()
            france_clients = df_to_measure.at['France', 'purchasers']
            value = france_clients / total_clients
        
        elif prompt == "Do clients in France spend more on average than in Italy?":
            avg_basket_france = df_to_measure.at['France', 'average_basket']
            avg_basket_italy = df_to_measure.at['Italy', 'average_basket']
            value = "Yes" if avg_basket_france > avg_basket_italy else "No"

        elif prompt == "What is the average number of articles each client buys":
            total_units_sold = df_to_measure['units_sold'].sum()
            total_clients = df_to_measure['purchasers'].sum()
            value = total_units_sold / total_clients

        elif prompt == "What is the average number of articles each purchaser buys":
            total_units_sold = df_to_measure['units_sold'].sum()
            total_clients = df_to_measure['purchasers'].sum()
            value = total_units_sold / total_clients

        # Update the DataFrame
        targetDF.at[idx, f"Target {scenario}"] = value

targetDF.set_index('Prompt', inplace=True)
targetDF

Unnamed: 0_level_0,Complexité,Target Scenario1integer,Target Scenario2integer,Target Scenario3integer,Target Scenario4integer,Target Scenario5integer,Target Scenario6integer,Target Scenario7integer
Prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
How many clients are there in France?,1,5,41,591,3894,47470,495137,4733912
How many purchasers are there in France?,1,5,41,591,3894,47470,495137,4733912
What's the generated turnover of Spain?,1,43,312,5590,41235,497459,4318559,62642201
What's the average basket in Italy?,1,9,11,9,18,10,13,11
How many units are sold in France?,1,5,46,592,4508,58631,514165,5881544
How many units are sold in Spain?,1,4,49,531,5650,64969,448539,6059285
What's the average basket in Spain?,1,14,9,10,9,12,10,13
How many columns are there?,1,4,4,4,4,4,4,4
How many countries are there?,1,3,3,3,3,3,3,3
How many KPIs are there?,1,4,4,4,4,4,4,4


In [11]:
# Saving target to csv
targetDF.to_csv('target/target_digit_integer', sep=';', index=True)