In [2]:
# import all necessary package

from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
import pandas as pd

In [3]:
# create session
spark = SparkSession.builder.appName('ExoPySpark').getOrCreate()

In [5]:
# create a temporary table for each csv file
dir = 'Exo_PySpark'

data_path_dict = {
    'traffic':f'{dir}/fichier1_trafic.csv',
    'topologie':f'{dir}/fichier2_topologie.csv',
    'geographique':f'{dir}/fichier3_Donnees_geographique.csv',
    'isis':f'{dir}/fichier4_couts_ISIS.csv'
}

for table_name , path in data_path_dict.items():
    spark.read.option('header',True).csv(path).createOrReplaceTempView(table_name)


In [6]:
# check if all tables are included
spark.catalog.listTables()

[Table(name='geographique', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='isis', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='topologie', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='traffic', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [7]:
# Filter on file 2
topologie = spark.table('topologie')
isis =  spark.table('isis')
geographique = spark.table('geographique')
traffic = spark.table('traffic')

# Issues :
# there is row with admin_state_port = Connexion KO
# Is the filter expression the best way to do the task


sub_topologie = topologie.filter("equipement LIKE 'PTN%' AND client LIKE '%PTN%' AND admin_state_port!='down' AND vlan=='NA' AND type_port_ou_mda == '10G'")\
    .withColumn('id_lien', sf.concat(topologie.equipement, sf.lit('_'), topologie.client))\
    .select('equipement', 'client', 'port', 'portlldp', 'interface_netwk')

In [8]:
# step to follow
# function which add a string value to the column name of a spark dataframe
def add_to_col_name(df, value):
    return [column_name + value for column_name in df.columns]

# 1. merge topology table filtered result with isis using inner join by (nom_equipement, port, interface_netwk)

isis_renamed = isis.withColumnRenamed("nom_equipement","equipement")
merged_result = sub_topologie.join(isis_renamed, on=['equipement', 'port', 'interface_netwk'], how='inner')

# 2. merge result by client = geographique.nom_equipement to give column with _A

geographique_renamed_column_A = geographique.toDF(*add_to_col_name(geographique, '_A'))
merged_result = merged_result.join(geographique_renamed_column_A, merged_result.equipement == geographique_renamed_column_A.nom_equipement_A)

# 3. merge result by client = geographique.nom_equipement to give column with _B

geographique_renamed_column_B = geographique.toDF(*add_to_col_name(geographique, '_B'))
merged_result = merged_result.join(geographique_renamed_column_B, merged_result.equipement == geographique_renamed_column_B.nom_equipement_B)


# 4.

In [9]:
# check the merge result
merged_result.toPandas()

Unnamed: 0,equipement,port,interface_netwk,client,portlldp,isis_level,isis_metric,nom_equipement_A,role_A,configuration_A,region_exploitation_A,nom_equipement_B,role_B,configuration_B,region_exploitation_B
0,PTN00122,1/1/2,TO_PTN00022_LAG22_WDM_FTTA,PTN00022,8/1/2,1,10,PTN00122,B,7750-SR7,IDF,PTN00122,B,7750-SR7,IDF
1,PTN00122,2/1/3,TO_PTN00022_LAG22_WDM_FTTA,PTN00022,8/1/4,1,10,PTN00122,B,7750-SR7,IDF,PTN00122,B,7750-SR7,IDF
2,PTN00820,2/1/3,TO_PTN00821_LAG_21,PTN00821,1/1/3,1,10,PTN00820,B,7750-SR7,WST,PTN00820,B,7750-SR7,WST
3,PTN00820,2/1/4,TO_PTN00821_LAG_21,PTN00821,2/1/1,1,10,PTN00820,B,7750-SR7,WST,PTN00820,B,7750-SR7,WST
4,PTN00651,4/1/2,TO_PTN00042_LAG22,PTN00042,2/1/2,1,10,PTN00651,B,7750-SR7,MED,PTN00651,B,7750-SR7,MED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2825,PTN01404,1/1/2,TO_PTN01731_LAG-20,PTN01731,1/1/2,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2826,PTN01404,1/1/3,TO_PTN01731_LAG-20,PTN01731,1/1/3,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2827,PTN01404,2/1/2,TO_PTN00258_LAG-27,PTN00258,No LLDP,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2828,PTN01404,2/1/3,TO_PTN00258_LAG-27,PTN00258,No LLDP,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
