# BGES Project : Questions



In [79]:
import os
# Set PYARROW_IGNORE_TIMEZONE in the process environment; this runs before the
# pyspark.pandas import further down in the notebook.
# NOTE(review): presumably this silences the pandas-on-Spark / PyArrow
# timezone warning -- confirm against the pandas API on Spark docs.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

In [80]:
import pandas as pd
import numpy as np
from datetime import datetime, date

import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import Row

from sqlalchemy import create_engine
from sqlalchemy import text

In [81]:
# Get or create the Spark session. The same JVM option string is handed to
# both the driver and the executors.
spark = (
    SparkSession.builder
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow")
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow")
    .getOrCreate()
)

# Set the Spark SQL session time zone to UTC.
spark.conf.set("spark.sql.session.timeZone", "UTC")

## Vérification de l'ETL

In [82]:
# Database connection parameters.
# Each value can be overridden with an environment variable of the same name,
# so real credentials do not have to be written into the notebook. When a
# variable is unset, the original local-development value is used.
DB_USER = os.environ.get('DB_USER', 'postgres')
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'postgres')
DB_HOST = os.environ.get('DB_HOST', 'localhost')
DB_PORT = os.environ.get('DB_PORT', '5432')
DB_NAME = os.environ.get('DB_NAME', 'postgres')

# Create SQLAlchemy engine
engine = create_engine(f'postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Test the connection with a trivial query. A failure is reported but not
# re-raised, so the cell itself always completes.
try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT 1"))
        print("✓ Successfully connected to the database")
except Exception as e:
    print(f"Error connecting to database: {e}")

✓ Successfully connected to the database


In [83]:
# List the tables that live in the "public" schema.
with engine.connect() as conn:
    tables = conn.execute(text("""
        SELECT table_name 
        FROM information_schema.tables 
        WHERE table_schema = 'public'
    """))
    # The first column of each row is the table name; collect the names while
    # the connection is still open.
    table_names = [row[0] for row in tables]

print("\nTables in the database:")
for table_name in table_names:
    print(f"- {table_name}")


Tables in the database:
- dim_sector
- dim_employee
- fact_employee_equipment
- fact_business_travel
- dim_equipment
- dim_location
- dim_date_time
- dim_mission_type
- dim_transport


In [97]:
# Read every row of dim_transport into a DataFrame and print the whole frame.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM dim_transport"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from dim_transport:", df, sep="\n")


Rows from dim_transport:
   transport_id    transport_name emission_factor
0             1             plane          0.1994
1             2  public transport          0.1373
2             3             train          0.0106
3             4              taxi          0.1904


In [98]:
# Read every row of dim_sector into a DataFrame and print the whole frame.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM dim_sector"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from dim_sector:", df, sep="\n")


Rows from dim_sector:
   sector_id         sector_name
0          1       data engineer
1          2  business executive
2          3           economist
3          4                 hrd
4          5   computer engineer


In [116]:
# Read dim_location into a DataFrame and print its first rows.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM dim_location"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from dim_location:", df.head(), sep="\n")


Rows from dim_location:
   location_id         city    country
0            1       berlin  allemagne
1            2  los angeles        usa
2            3        paris     france
3            4     new-york        usa
4            5     shanghai      china


In [117]:
# Read dim_date_time into a DataFrame and print its first rows.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM dim_date_time"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from dim_date_time:", df.head(), sep="\n")
    
    


Rows from dim_date_time:
              date_id        date  day  month  year  hour  minute  second
0 2024-10-16 20:35:05  2024-10-16   16     10  2024    20      35       5
1 2024-10-16 05:53:43  2024-10-16   16     10  2024     5      53      43
2 2024-10-16 17:44:25  2024-10-16   16     10  2024    17      44      25
3 2024-10-16 10:11:26  2024-10-16   16     10  2024    10      11      26
4 2024-10-16 06:22:46  2024-10-16   16     10  2024     6      22      46


In [118]:
# Read every row of dim_mission_type into a DataFrame and print the whole frame.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM dim_mission_type"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from dim_mission_type:", df, sep="\n")


Rows from dim_mission_type:
   mission_type_id mission_type_name
0                1        conference
1                2       development
2                3  business meeting
3                4          training
4                5           meeting


In [119]:
# Read dim_employee into a DataFrame and print its first rows.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM dim_employee"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from dim_employee:", df.head(), sep="\n")


Rows from dim_employee:
              employee_id last_name first_name  birth_date  birth_city  \
0  KeyPers_Berlin_1230000     Name0  FistName0  1993-11-04       osaka   
1  KeyPers_Berlin_1230001     Name1  FistName1  1932-11-22  wellington   
2  KeyPers_Berlin_1230002     Name2  FistName2  1990-08-12      sidney   
3  KeyPers_Berlin_1230003     Name3  FistName3  1965-05-26       rabat   
4  KeyPers_Berlin_1230004     Name4  FistName4  1959-01-18    shanghai   

  birth_country social_security_number phone_country_code   phone_number  \
0         japan            NS000000000                NaN  +336##0263188   
1   new zealand            NS000000001                NaN  +336##0401873   
2     australia            NS000000002                NaN  +336##0524126   
3         maroc            NS000000003                NaN  +336##0418484   
4         china            NS000000004                NaN  +336##0986317   

  address_street_number address_street_name address_complement postal_cod

In [120]:
# Read dim_equipment into a DataFrame and print its first rows.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM dim_equipment"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from dim_equipment:", df.head(), sep="\n")


Rows from dim_equipment:
                     equipment_id      equipment_type                 model  \
0  BERLIN_MATERIEL_INFO_202406300               Ecran      jusqu'à 23pouces   
1  BERLIN_MATERIEL_INFO_202406301  PC fixe sans ecran  Precision tower 3xxx   
2  BERLIN_MATERIEL_INFO_202406240          Disque dur     modèle par défaut   
3  BERLIN_MATERIEL_INFO_202406241  PC fixe sans ecran  Prodesk (Tower, SFF)   
4  BERLIN_MATERIEL_INFO_202406242         PC portable      Moyenne 13pouces   

  co2_impact_kg  
0        590.00  
1        300.00  
2         10.00  
3        300.00  
4        365.00  


In [121]:
# Read fact_business_travel into a DataFrame and print its first rows.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM fact_business_travel"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from fact_business_travel:", df.head(), sep="\n")


Rows from fact_business_travel:
          travel_id             employee_id  mission_type_id  \
0  BERLIN_202410160  KeyPers_Berlin_1233137                1   
1  BERLIN_202410161  KeyPers_Berlin_1230804                1   
2  BERLIN_202410162  KeyPers_Berlin_1233236                2   
3  BERLIN_202410163  KeyPers_Berlin_1233797                3   
4  BERLIN_202410164  KeyPers_Berlin_1231232                4   

   departure_location_id  destination_location_id  transport_id  \
0                     62                       49             1   
1                     62                       42             1   
2                     62                       36             1   
3                     62                       37             1   
4                     62                       64             1   

              date_id distance_km  is_round_trip  
0 2024-10-16 20:35:05    12805.02           True  
1 2024-10-16 05:53:43    20461.82           True  
2 2024-10-16 17:44:25    3

In [122]:
# Read fact_employee_equipment into a DataFrame and print its first rows.
with engine.connect() as conn:
    result = conn.execute(text("SELECT * FROM fact_employee_equipment"))
    df = pd.DataFrame(data=result.fetchall(), columns=result.keys())

print("\nRows from fact_employee_equipment:", df.head(), sep="\n")


Rows from fact_employee_equipment:
                      id_materiel                    equipment_id  \
0  BERLIN_MATERIEL_INFO_202406300  BERLIN_MATERIEL_INFO_202406300   
1  BERLIN_MATERIEL_INFO_202406301  BERLIN_MATERIEL_INFO_202406301   
2  BERLIN_MATERIEL_INFO_202406240  BERLIN_MATERIEL_INFO_202406240   
3  BERLIN_MATERIEL_INFO_202406241  BERLIN_MATERIEL_INFO_202406241   
4  BERLIN_MATERIEL_INFO_202406242  BERLIN_MATERIEL_INFO_202406242   

              employee_id  location_id    purchase_date_id  
0  KeyPers_Berlin_1230163           67 2024-06-30 08:28:49  
1  KeyPers_Berlin_1230240           67 2024-06-30 09:13:13  
2  KeyPers_Berlin_1232208           67 2024-06-24 10:50:25  
3  KeyPers_Berlin_1231193           67 2024-06-24 10:54:42  
4  KeyPers_Berlin_1231013           67 2024-06-24 16:21:13  
