# Create temp tables

Use this script in a notebook to create temporary collections for a project:
- events
- sessions
- flattened_tasks

This is helpful when used together with the PostgreSQL export.

In [None]:
from app.db.mongo import connect_and_init_db, get_mongo_db
from loguru import logger

await connect_and_init_db()

mongo_db = await get_mongo_db()

# Change the project id here
project_id = ...


[32m2024-11-21 17:59:37.558[0m | [1mINFO    [0m | [36mapp.db.mongo[0m:[36mconnect_and_init_db[0m:[36m34[0m - [1mConnected to mongodb (MONGODB_NAME=production)[0m


In [None]:
# Delete the collections if they exist
logger.info("Deleting collections")
await mongo_db[f"events_{project_id}"].drop()
await mongo_db[f"sessions_{project_id}"].drop()
await mongo_db[f"flattened_tasks_{project_id}"].drop()

In [6]:

# Create a new collection with only the events of project
command = [
    {
        "$match": {
            "project_id": project_id
    },
    },
    {
        "$out": f"events_{project_id}"
    }
]

logger.info("Creating events collection")
await mongo_db["events"].aggregate(command).to_list(None)

# Same with sessions

command = [
    {
        "$match": {
            "project_id": project_id
        },
    },
    {
        "$out": f"sessions_{project_id}"
    }
]

logger.info("Creating sessions collection")
await mongo_db["sessions"].aggregate(command).to_list(None)

# Create indexes on the new collections: task_id

logger.info("Creating indexes on events collection")
await mongo_db[f"events_{project_id}"].create_index("task_id", unique=False, background=False)

logger.info("Creating indexes on sessions collection")
await mongo_db[f"sessions_{project_id}"].create_index("task_id",unique=False, background=False)


# Run the command
command = [
  {
    "$match": {
      "project_id": project_id,
    },
  },
  { "$sort": { "created_at": 1 } },
  {
    "$lookup": {
      "from": f"sessions_{project_id}",
      "localField": "id",
      "foreignField": "task_id",
      "as": "sessions",
    },
  },
  {
    "$unwind": {
      "path": "$sessions",
      "preserveNullAndEmptyArrays": True,
    },
  },
  {
    "$lookup": {
      "from": f"events_{project_id}",
      "localField": "id",
      "foreignField": "task_id",
      "as": "events",
    },
  },
  {
    "$addFields": {
      "events": {
        "$filter": {
          "input": "$events",
          "as": "event",
          "cond": {
            "$and": [
              { "$ne": ["$$event.removed", True] },
              {
                "$or": [
                  {
                    "$and": [
                      {
                        "$eq": [
                          "$$event.event_definition.is_last_task",
                          True,
                        ],
                      },
                      {
                        "$eq": [
                          "$is_last_task",
                          True,
                        ],
                      },
                    ],
                  },
                  {
                    "$not": [
                      "$$event.event_definition.is_last_task",
                    ],
                  },
                ],
              },
            ],
          },
        },
      },
    },
  },
  {
    "$set": {
      "events": {
        "$reduce": {
          "input": "$events",
          "initialValue": [],
          "in": {
            "$concatArrays": [
              "$$value",
              {
                "$cond": [
                  {
                    "$in": [
                      "$$this.event_definition.id",
                      "$$value.event_definition.id",
                    ],
                  },
                  [],
                  ["$$this"],
                ],
              },
            ],
          },
        },
      },
    },
  },
  {
    "$unwind": {
      "path": "$events",
      "preserveNullAndEmptyArrays": True,
    },
  },
  {
    "$project": {
        "_id": 0,
      "task_id": "$id",
      "task_input": "$input",
      "task_output": "$output",
      "task_metadata": "$metadata",
      "task_eval": "$flag",
      "task_eval_source": "$last_eval.source",
      "task_eval_at": "$last_eval.created_at",
      "task_created_at": "$created_at",
      "session_id": "$session_id",
      "task_position": "$task_position",
      "session_length": "$sessions.session_length",
      "event_name": "$events.event_name",
      "event_created_at": "$events.created_at",
      "event_confirmed": "$events.confirmed",
      "event_score_range_value":
        "$events.score_range.value",
      "event_score_range_min":
        "$events.score_range.min",
      "event_score_range_max":
        "$events.score_range.max",
      "event_score_range_score_type":
        "$events.score_range.score_type",
      "event_score_range_label":
        "$events.score_range.label",
      "event_source": "$events.source",
      "event_categories":
        "$events.event_definition.score_range_settings.categories",
    },
  },
  # Merge into a new collection
    {
        "$out": f"flattened_tasks_{project_id}"
    },
]

logger.info("Creating flattened_tasks collection")
await mongo_db["tasks"].aggregate(command).to_list(None)

[32m2024-11-21 17:56:31.166[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mCreating events collection[0m
[32m2024-11-21 17:56:41.554[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mCreating sessions collection[0m
[32m2024-11-21 17:56:42.808[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mCreating indexes on events collection[0m
[32m2024-11-21 17:56:44.988[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mCreating indexes on sessions collection[0m
[32m2024-11-21 17:56:45.175[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m182[0m - [1mCreating flattened_tasks collection[0m


[]

In [None]:
from app.services.integrations.postgresql import PostgresqlIntegration

# Integration object used for the export below. Replace every `...`
# placeholder with the real organisation / project values before running.
postgresql = PostgresqlIntegration(
    # Organisation
    org_id=...,
    org_name=...,
    # Project (id is the one set at the top of the notebook)
    project_id=project_id,
    project_name=...,
)


[32m2024-11-21 17:59:16.623[0m | [1mINFO    [0m | [36mapp.core.config[0m:[36m<module>[0m:[36m17[0m - [1mLoading environment variables from .env file[0m
[32m2024-11-21 17:59:16.624[0m | [1mINFO    [0m | [36mapp.core.config[0m:[36m<module>[0m:[36m23[0m - [1mENVIRONMENT: production[0m
[32m2024-11-21 17:59:16.624[0m | [31m[1mERROR   [0m | [36mapp.core.config[0m:[36m<module>[0m:[36m75[0m - [31m[1mPHOSPHO_AI_HUB_URL is missing from the environment variables[0m
[32m2024-11-21 17:59:18.398[0m | [31m[1mERROR   [0m | [36mapp.services.integrations.argilla[0m:[36m<module>[0m:[36m33[0m - [31m[1mYour Api endpoint at http://localhost:6900 is not available or not responding: [Errno 61] Connection refused[0m


In [5]:
await postgresql.push(batch_size=512, only_new=False, fetch_from_flattened_tasks=True)

[32m2024-11-21 18:03:54.472[0m | [1mINFO    [0m | [36mapp.services.integrations.postgresql[0m:[36mload_config[0m:[36m94[0m - [1mCredentials already loaded for a5724a02-a243-4025-9b34-080f40818a31[0m
[32m2024-11-21 18:03:54.474[0m | [1mINFO    [0m | [36mapp.services.integrations.postgresql[0m:[36mpush[0m:[36m398[0m - [1mStarting export of project 9645789f8b98419599ea63b1f2d8bcb8 to dedicated Postgres ep-misty-sky-a2ohgefo.eu-central-1.aws.neon.tech:michelin[0m
[32m2024-11-21 18:03:54.474[0m | [1mINFO    [0m | [36mapp.services.integrations.postgresql[0m:[36mpush[0m:[36m412[0m - [1mExporting all tasks[0m
[32m2024-11-21 18:03:54.475[0m | [1mINFO    [0m | [36mapp.services.integrations.postgresql[0m:[36mpush[0m:[36m427[0m - [1mFetching the existing flattened_tasks_9645789f8b98419599ea63b1f2d8bcb8 from Mongo[0m
[32m2024-11-21 18:03:54.704[0m | [1mINFO    [0m | [36mapp.services.integrations.postgresql[0m:[36mpush[0m:[36m441[0m - [1mTot

'success'