# BGES Project : Questions



In [1]:
import os

# pandas-on-Spark (pyspark.pandas) expects this flag to already be present in
# the environment when it is imported, so it is set in the very first cell,
# ahead of the PySpark imports below.
os.environ.update({"PYARROW_IGNORE_TIMEZONE": "1"})

In [8]:
import pandas as pd
import numpy as np
from datetime import datetime, date

import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import Row

from sqlalchemy import create_engine
from sqlalchemy import text

In [9]:
# Build the Spark session, or reuse the one already running in this kernel.
# The same JVM flag is passed to both the driver and the executors.
# NOTE(review): "-Djava.security.manager=allow" is presumably required to run
# Spark on a recent JDK where the security manager is disallowed by default;
# confirm against the JDK version used for this project.
spark = (
    SparkSession.builder
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow")
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow")
    .getOrCreate()
)

# Set the Spark SQL session time zone to UTC.
spark.conf.set("spark.sql.session.timeZone", "UTC")

## Vérification de l'ETL

In [20]:
# Database connection parameters.
# Each value is read from a BGES_DB_* environment variable when one is set, so
# real credentials never have to be written into the notebook. The fallbacks
# are the local development defaults this notebook was originally using.
DB_USER = os.environ.get('BGES_DB_USER', 'postgres')
DB_PASSWORD = os.environ.get('BGES_DB_PASSWORD', 'postgres')
DB_HOST = os.environ.get('BGES_DB_HOST', 'localhost')
DB_PORT = os.environ.get('BGES_DB_PORT', '5432')
DB_NAME = os.environ.get('BGES_DB_NAME', 'postgres')

# Create SQLAlchemy engine.
# NOTE(review): the values are interpolated verbatim into the URL, so a user
# name or password containing URL-reserved characters (e.g. '@', '/', ':')
# must be percent-encoded before being exported.
engine = create_engine(f'postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Test the connection with a trivial round-trip query. A failure is reported
# but not re-raised, so the rest of the notebook can still be inspected.
try:
    with engine.connect() as conn:
        # Only the success of the round trip matters; the value is discarded.
        conn.execute(text("SELECT 1"))
        print("✓ Successfully connected to the database")
except Exception as e:
    print(f"Error connecting to database: {e}")

✓ Successfully connected to the database


In [22]:
# List every table in the "public" schema to confirm that the ETL created the
# expected dimension and fact tables.
list_tables_query = text("""
        SELECT table_name 
        FROM information_schema.tables 
        WHERE table_schema = 'public'
    """)

with engine.connect() as conn:
    tables = conn.execute(list_tables_query)
    print("\nTables in the database:")
    # Each row holds a single column: the table name.
    for table in tables:
        print(f"- {table[0]}")


Tables in the database:
- dim_transport
- dim_location
- fact_employee_equipment
- fact_business_travel
- dim_employee
- dim_sector
- dim_equipment
- dim_time
- dim_mission_type


In [23]:
# Dump dim_transport to check what the ETL loaded.
# NOTE(review): the recorded output shows emission_factor = 0.0000 on every
# row; confirm whether the ETL is expected to load real emission factors.
select_all = text("SELECT * FROM dim_transport")

with engine.connect() as conn:
    result = conn.execute(select_all)
    rows = result.fetchall()
    # Column labels come from the result metadata.
    df = pd.DataFrame(rows, columns=result.keys())

# The frame is fully materialised, so it can be printed after the connection
# has been released.
print("\nRows from dim_transport:")
print(df)


Rows from dim_transport:
   transport_id        transport_name emission_factor
0             1                 Avion          0.0000
1             2  Transports en commun          0.0000
2             3                 Train          0.0000
3             4                  Taxi          0.0000


In [24]:
# Dump dim_sector to check what the ETL loaded.
select_all = text("SELECT * FROM dim_sector")

with engine.connect() as conn:
    result = conn.execute(select_all)
    rows = result.fetchall()
    # Column labels come from the result metadata.
    df = pd.DataFrame(rows, columns=result.keys())

# The frame is fully materialised, so it can be printed after the connection
# has been released.
print("\nRows from dim_sector:")
print(df)


Rows from dim_sector:
   sector_id         sector_name
0          1       Data Engineer
1          2  Business Executive
2          3           Economist
3          4                 HRD
4          5   Computer Engineer


In [25]:
# Dump dim_location to check what the ETL loaded.
select_all = text("SELECT * FROM dim_location")

with engine.connect() as conn:
    result = conn.execute(select_all)
    rows = result.fetchall()
    # Column labels come from the result metadata.
    df = pd.DataFrame(rows, columns=result.keys())

# The frame is fully materialised, so it can be printed after the connection
# has been released.
print("\nRows from dim_location:")
print(df)


Rows from dim_location:
    location_id            city      country
0             1          Berlin    Allemagne
1             2     Los Angeles          USA
2             3           Paris       France
3             4        New-York          USA
4             5        Shanghai        China
5             6          London      England
6             7       Sao Paulo       Brazil
7             8          Sidney    Australia
8             9           Rabat        Maroc
9            10      Washington          USA
10           11        Montreal       Canada
11           12           Dubaï      Emirats
12           13           Osaka        Japan
13           14            Oslo      Norvège
14           15           Pekin        China
15           16           Alger      Algeria
16           17          Bogota     Colombia
17           18       Vancouver       Canada
18           19            Lima         Peru
19           20        Helsinki     Finlande
20           21       Melbourn

In [26]:
# Dump dim_time to check what the ETL loaded.
select_all = text("SELECT * FROM dim_time")

with engine.connect() as conn:
    result = conn.execute(select_all)
    rows = result.fetchall()
    # Column labels come from the result metadata.
    df = pd.DataFrame(rows, columns=result.keys())

# The frame is fully materialised, so it can be printed after the connection
# has been released.
print("\nRows from dim_time:")
print(df)


Rows from dim_time:
        date_id  day  month  year
0    2024-10-16   16     10  2024
1    2024-10-02    2     10  2024
2    2024-05-26   26      5  2024
3    2024-07-24   24      7  2024
4    2024-07-30   30      7  2024
..          ...  ...    ...   ...
195  2024-07-01    1      7  2024
196  2024-07-29   29      7  2024
197  2024-05-17   17      5  2024
198  2024-05-03    3      5  2024
199  2024-10-27   27     10  2024

[200 rows x 4 columns]


In [27]:
# Dump dim_mission_type to check what the ETL loaded.
select_all = text("SELECT * FROM dim_mission_type")

with engine.connect() as conn:
    result = conn.execute(select_all)
    rows = result.fetchall()
    # Column labels come from the result metadata.
    df = pd.DataFrame(rows, columns=result.keys())

# The frame is fully materialised, so it can be printed after the connection
# has been released.
print("\nRows from dim_mission_type:")
print(df)


Rows from dim_mission_type:
   mission_type_id mission_type_name
0                1        conference
1                2       development
2                3  business meeting
3                4          training
4                5           meeting


In [30]:
# Dump dim_employee to check what the ETL loaded.
# NOTE(review): the recorded output includes a social_security_number column;
# confirm the data is synthetic before sharing the notebook with its outputs.
select_all = text("SELECT * FROM dim_employee")

with engine.connect() as conn:
    result = conn.execute(select_all)
    rows = result.fetchall()
    # Column labels come from the result metadata.
    df = pd.DataFrame(rows, columns=result.keys())

# The frame is fully materialised, so it can be printed after the connection
# has been released.
print("\nRows from dim_employee:")
print(df)


Rows from dim_employee:
                    employee_id last_name   first_name  birth_date  \
0        KeyPers_Berlin_1230000     Name0    FistName0  1993-11-04   
1        KeyPers_Berlin_1230001     Name1    FistName1  1932-11-22   
2        KeyPers_Berlin_1230002     Name2    FistName2  1990-08-12   
3        KeyPers_Berlin_1230003     Name3    FistName3  1965-05-26   
4        KeyPers_Berlin_1230004     Name4    FistName4  1959-01-18   
...                         ...       ...          ...         ...   
20567  KeyPers_Shanghai_1230995   Name995  FistName995  2011-01-13   
20568  KeyPers_Shanghai_1230996   Name996  FistName996  1939-11-10   
20569  KeyPers_Shanghai_1230997   Name997  FistName997  2003-05-11   
20570  KeyPers_Shanghai_1230998   Name998  FistName998  1934-11-22   
20571  KeyPers_Shanghai_1230999   Name999  FistName999  1992-09-21   

       birth_location_id social_security_number phone_country_code  \
0                     13            NS000000000                N