# BGES Project : Questions



In [2]:
import os

# Export the flag before any Spark/pandas-on-Spark module is imported.
# NOTE(review): presumably this is the variable `pyspark.pandas` checks at
# import time when pyarrow is installed — confirm against the PySpark docs.
os.environ.update({"PYARROW_IGNORE_TIMEZONE": "1"})

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, date

import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import Row

from sqlalchemy import create_engine
from sqlalchemy import text

In [4]:
# Build (or reuse) the Spark session for this notebook.
# The same `-Djava.security.manager=allow` JVM flag is handed to both the
# driver and the executor JVMs.
# NOTE(review): presumably required by the local JDK version — confirm.
spark = (
    SparkSession.builder
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow")
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow")
    .getOrCreate()
)

# Pin the SQL session time zone to UTC.
spark.conf.set("spark.sql.session.timeZone", "UTC")

25/04/17 11:40:10 WARN Utils: Your hostname, MacBook-Pro-de-Paul.local resolves to a loopback address: 127.0.0.1; using 172.20.10.3 instead (on interface en0)
25/04/17 11:40:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/17 11:40:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Vérification de l'ETL

In [10]:
# Database connection parameters.
# Each value is read from a BGES_DB_* environment variable so that real
# credentials never have to be written into (or committed with) the notebook.
# The fallbacks are the local development values, so nothing changes when the
# variables are not set.
DB_USER = os.environ.get('BGES_DB_USER', 'postgres')
DB_PASSWORD = os.environ.get('BGES_DB_PASSWORD', 'postgres')
DB_HOST = os.environ.get('BGES_DB_HOST', 'localhost')
DB_PORT = os.environ.get('BGES_DB_PORT', '5432')
DB_NAME = os.environ.get('BGES_DB_NAME', 'postgres')

# Create SQLAlchemy engine
engine = create_engine(f'postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Test the connection with a trivial query. A failure is reported but not
# re-raised, so the rest of the notebook can still be opened and read.
try:
    with engine.connect() as conn:
        # The returned rows are irrelevant; only a successful round trip matters.
        conn.execute(text("SELECT 1"))
        print("✓ Successfully connected to the database")
except Exception as e:
    print(f"Error connecting to database: {e}")

✓ Successfully connected to the database


In [63]:
# ETL sanity check: enumerate the tables that exist in the `public` schema.
with engine.connect() as connection:
    table_rows = connection.execute(text("""
        SELECT table_name 
        FROM information_schema.tables 
        WHERE table_schema = 'public'
    """))
    # The query selects a single column, so the name sits at position 0.
    table_names = [row[0] for row in table_rows]

print("\nTables in the database:")
for table_name in table_names:
    print(f"- {table_name}")


Tables in the database:
- dim_mission_type
- fact_business_travel
- dim_sector
- dim_employee
- fact_employee_equipment
- dim_equipment
- dim_date_time
- dim_transport
- dim_location


In [64]:
# ETL check: pull every row of dim_transport into pandas and print the frame.
with engine.connect() as connection:
    query_result = connection.execute(text("SELECT * FROM dim_transport"))
    fetched_rows = query_result.fetchall()
    # Column labels come straight from the result-set metadata.
    df = pd.DataFrame(fetched_rows, columns=query_result.keys())

print("\nRows from dim_transport:")
print(df)


Rows from dim_transport:
   transport_id    transport_name emission_factor
0             1             plane          0.0000
1             2  public transport          0.0000
2             3             train          0.0000
3             4              taxi          0.0000


In [65]:
# ETL check: pull every row of dim_sector into pandas and print the frame.
with engine.connect() as connection:
    query_result = connection.execute(text("SELECT * FROM dim_sector"))
    fetched_rows = query_result.fetchall()
    # Column labels come straight from the result-set metadata.
    df = pd.DataFrame(fetched_rows, columns=query_result.keys())

print("\nRows from dim_sector:")
print(df)


Rows from dim_sector:
   sector_id         sector_name
0          1       data engineer
1          2  business executive
2          3           economist
3          4                 hrd
4          5   computer engineer


In [66]:
# ETL check: pull every row of dim_location into pandas and print the frame.
with engine.connect() as connection:
    query_result = connection.execute(text("SELECT * FROM dim_location"))
    fetched_rows = query_result.fetchall()
    # Column labels come straight from the result-set metadata.
    df = pd.DataFrame(fetched_rows, columns=query_result.keys())

print("\nRows from dim_location:")
print(df)


Rows from dim_location:
    location_id         city    country
0             1       berlin  allemagne
1             2  los angeles        usa
2             3        paris     france
3             4     new-york        usa
4             5     shanghai      china
..          ...          ...        ...
62           63     helsinki   finlande
63           64   washington        usa
64           65       mexico     mexico
65           66         oslo    norvège
66           67       berlin    germany

[67 rows x 3 columns]


In [67]:
# ETL check: pull every row of dim_date_time into pandas and print the frame.
with engine.connect() as connection:
    query_result = connection.execute(text("SELECT * FROM dim_date_time"))
    fetched_rows = query_result.fetchall()
    # Column labels come straight from the result-set metadata.
    df = pd.DataFrame(fetched_rows, columns=query_result.keys())

print("\nRows from dim_date_time:")
print(df)


Rows from dim_date_time:
                  date_id        date  day  month  year  hour  minute  second
0     2024-10-16 20:35:05  2024-10-16   16     10  2024    20      35       5
1     2024-10-16 05:53:43  2024-10-16   16     10  2024     5      53      43
2     2024-10-16 17:44:25  2024-10-16   16     10  2024    17      44      25
3     2024-10-16 10:11:26  2024-10-16   16     10  2024    10      11      26
4     2024-10-16 06:22:46  2024-10-16   16     10  2024     6      22      46
...                   ...         ...  ...    ...   ...   ...     ...     ...
32355 2024-06-29 11:30:19  2024-06-29   29      6  2024    11      30      19
32356 2024-06-29 14:46:34  2024-06-29   29      6  2024    14      46      34
32357 2024-06-29 16:41:39  2024-06-29   29      6  2024    16      41      39
32358 2024-06-29 15:36:29  2024-06-29   29      6  2024    15      36      29
32359 2024-06-29 14:12:29  2024-06-29   29      6  2024    14      12      29

[32360 rows x 8 columns]


In [68]:
# ETL check: pull every row of dim_mission_type into pandas and print the frame.
with engine.connect() as connection:
    query_result = connection.execute(text("SELECT * FROM dim_mission_type"))
    fetched_rows = query_result.fetchall()
    # Column labels come straight from the result-set metadata.
    df = pd.DataFrame(fetched_rows, columns=query_result.keys())

print("\nRows from dim_mission_type:")
print(df)


Rows from dim_mission_type:
   mission_type_id mission_type_name
0                1        conference
1                2       development
2                3  business meeting
3                4          training
4                5           meeting


In [69]:
# ETL check: pull every row of dim_employee into pandas and print the frame.
# NOTE(review): this table carries personal fields (names, birth dates,
# social security numbers) — make sure the printed output is safe to share.
with engine.connect() as connection:
    query_result = connection.execute(text("SELECT * FROM dim_employee"))
    fetched_rows = query_result.fetchall()
    # Column labels come straight from the result-set metadata.
    df = pd.DataFrame(fetched_rows, columns=query_result.keys())

print("\nRows from dim_employee:")
print(df)


Rows from dim_employee:
                    employee_id last_name   first_name  birth_date  \
0        KeyPers_Berlin_1230000     Name0    FistName0  1993-11-04   
1        KeyPers_Berlin_1230001     Name1    FistName1  1932-11-22   
2        KeyPers_Berlin_1230002     Name2    FistName2  1990-08-12   
3        KeyPers_Berlin_1230003     Name3    FistName3  1965-05-26   
4        KeyPers_Berlin_1230004     Name4    FistName4  1959-01-18   
...                         ...       ...          ...         ...   
20567  KeyPers_Shanghai_1230995   Name995  FistName995  2011-01-13   
20568  KeyPers_Shanghai_1230996   Name996  FistName996  1939-11-10   
20569  KeyPers_Shanghai_1230997   Name997  FistName997  2003-05-11   
20570  KeyPers_Shanghai_1230998   Name998  FistName998  1934-11-22   
20571  KeyPers_Shanghai_1230999   Name999  FistName999  1992-09-21   

        birth_city birth_country social_security_number phone_country_code  \
0            osaka         japan            NS000000000 

In [70]:
# ETL check: pull every row of dim_equipment into pandas and print the frame.
with engine.connect() as connection:
    query_result = connection.execute(text("SELECT * FROM dim_equipment"))
    fetched_rows = query_result.fetchall()
    # Column labels come straight from the result-set metadata.
    df = pd.DataFrame(fetched_rows, columns=query_result.keys())

print("\nRows from dim_equipment:")
print(df)


Rows from dim_equipment:
                          equipment_id      equipment_type  \
0       BERLIN_MATERIEL_INFO_202406300               Ecran   
1       BERLIN_MATERIEL_INFO_202406301  PC fixe sans ecran   
2       BERLIN_MATERIEL_INFO_202406240          Disque dur   
3       BERLIN_MATERIEL_INFO_202406241  PC fixe sans ecran   
4       BERLIN_MATERIEL_INFO_202406242         PC portable   
...                                ...                 ...   
8199  Shanghai_MATERIEL_INFO_202406293  PC fixe sans ecran   
8200  Shanghai_MATERIEL_INFO_202406294         PC portable   
8201  Shanghai_MATERIEL_INFO_202406295   Station d'accueil   
8202  Shanghai_MATERIEL_INFO_202406296  PC fixe sans ecran   
8203  Shanghai_MATERIEL_INFO_202406297          Smartphone   

                     model co2_impact_kg  
0         jusqu'à 23pouces        590.00  
1     Precision tower 3xxx        300.00  
2        modèle par défaut         10.00  
3     Prodesk (Tower, SFF)        300.00  
4         Moye