In [84]:
# import all necessary package

from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
import pandas as pd

In [3]:
# create session
spark = SparkSession.builder.appName('ExoPySpark').getOrCreate()

In [85]:
# create a temporary table for each csv file
dir = 'Exo_PySpark'

data_path_dict = {
    'traffic':f'{dir}/fichier1_trafic.csv',
    'topologie':f'{dir}/fichier2_topologie.csv',
    'geographique':f'{dir}/fichier3_Donnees_geographique.csv',
    'isis':f'{dir}/fichier4_couts_ISIS.csv'
}

for table_name , path in data_path_dict.items():
    spark.read.option('header',True).csv(path).createOrReplaceTempView(table_name)


In [86]:
# check if all tables are included
spark.catalog.listTables()

[Table(name='geographique', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='isis', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='topologie', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='traffic', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [101]:
# Filter on file 2
topologie = spark.table('topologie')
isis =  spark.table('isis')
geographique = spark.table('geographique')
traffic = spark.table('traffic')

# Issues :
# there is row with admin_state_port = Connexion KO
# Is the filter expression the best way to do the task


sub_topologie = topologie.filter("equipement LIKE 'PTN%' AND client LIKE '%PTN%' AND admin_state_port!='down' AND vlan=='NA' AND type_port_ou_mda == '10G'")\
    .withColumn('id_lien', sf.concat(topologie.equipement, sf.lit('_'), topologie.client))\
    .select('equipement', 'client', 'port', 'portlldp', 'interface_netwk', 'port_lag_netwk')

In [102]:
# step to follow
# function which add a string value to the column name of a spark dataframe
def add_to_col_name(df, value):
    return [column_name + value for column_name in df.columns]

# 1. merge topology table filtered result with isis using inner join by (nom_equipement, port, interface_netwk)

isis_renamed = isis.withColumnRenamed("nom_equipement","equipement")
merged_result = sub_topologie.join(isis_renamed, on=['equipement', 'port', 'interface_netwk'], how='inner')

# 2. merge result by client = geographique.nom_equipement to give column with _A

geographique_renamed_column_A = geographique.toDF(*add_to_col_name(geographique, '_A'))
merged_result = merged_result.join(geographique_renamed_column_A, merged_result.equipement == geographique_renamed_column_A.nom_equipement_A)

# 3. merge result by client = geographique.nom_equipement to give column with _B

geographique_renamed_column_B = geographique.toDF(*add_to_col_name(geographique, '_B'))
merged_result = merged_result.join(geographique_renamed_column_B, merged_result.equipement == geographique_renamed_column_B.nom_equipement_B)


# 4.

In [54]:
# check the merge result
merged_result.toPandas()

Unnamed: 0,equipement,port,interface_netwk,client,portlldp,port_lag_netwk,isis_level,isis_metric,nom_equipement_A,role_A,configuration_A,region_exploitation_A,nom_equipement_B,role_B,configuration_B,region_exploitation_B
0,PTN00122,1/1/2,TO_PTN00022_LAG22_WDM_FTTA,PTN00022,8/1/2,lag-22,1,10,PTN00122,B,7750-SR7,IDF,PTN00122,B,7750-SR7,IDF
1,PTN00122,2/1/3,TO_PTN00022_LAG22_WDM_FTTA,PTN00022,8/1/4,lag-22,1,10,PTN00122,B,7750-SR7,IDF,PTN00122,B,7750-SR7,IDF
2,PTN00820,2/1/3,TO_PTN00821_LAG_21,PTN00821,1/1/3,lag-21,1,10,PTN00820,B,7750-SR7,WST,PTN00820,B,7750-SR7,WST
3,PTN00820,2/1/4,TO_PTN00821_LAG_21,PTN00821,2/1/1,lag-21,1,10,PTN00820,B,7750-SR7,WST,PTN00820,B,7750-SR7,WST
4,PTN00651,4/1/2,TO_PTN00042_LAG22,PTN00042,2/1/2,lag-22,1,10,PTN00651,B,7750-SR7,MED,PTN00651,B,7750-SR7,MED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2825,PTN01404,1/1/2,TO_PTN01731_LAG-20,PTN01731,1/1/2,lag-20,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2826,PTN01404,1/1/3,TO_PTN01731_LAG-20,PTN01731,1/1/3,lag-20,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2827,PTN01404,2/1/2,TO_PTN00258_LAG-27,PTN00258,No LLDP,lag-27,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2828,PTN01404,2/1/3,TO_PTN00258_LAG-27,PTN00258,No LLDP,lag-27,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST


In [55]:
# Partie II

Part II

In [60]:
# o	Identifier le maximum du trafic sur cet équipement entre le trafic rentrant et sortant (in & out), on appellera cette variable trafic_max

In [111]:
# change traffic_out and traffic_in type

traffic=traffic.withColumn('trafic_in', sf.regexp_replace('trafic_in', ',', '.'))
traffic=traffic.withColumn('trafic_out', sf.regexp_replace('trafic_out', ',', '.'))

traffic = traffic.withColumn('trafic_in', traffic.trafic_in.cast('float'))
traffic = traffic.withColumn('trafic_out', traffic.trafic_out.cast('float'))

traffic = traffic.filter("interface_equipement LIKE 'lag%'").withColumn('traffic_max',sf.greatest(traffic.trafic_in, traffic.trafic_out))
traffic_result = traffic.groupby(['equipement', 'interface_equipement']).max('traffic_max').show()

+----------+--------------------+----------------+
|equipement|interface_equipement|max(traffic_max)|
+----------+--------------------+----------------+
|  PTN00440|             lag-133|        33825.73|
|  PTN00070|              lag-31|        0.472888|
|  PTN00462|              lag-20|        0.811944|
|  PTN00992|              lag-21|        393.2941|
|  PTN00773|              lag-20|       16094.253|
|  PTN00354|              lag-21|       7224.8213|
|  PTN00218|              lag-23|        726.6768|
|  PTN00485|              lag-21|       2574.4377|
|  PTN00024|              lag-21|       1066.2867|
|  PTN00056|              lag-23|       4915.2656|
|  PTN02295|              lag-22|       1103.6776|
|  PTN00017|              lag-24|       2258.2336|
|  PTN00016|             lag-103|        0.106856|
|  PTN00624|              lag-21|       3573.4937|
|  PTN01404|              lag-20|       2473.3462|
|  PTN00083|              lag-13|       5119.5137|
|  PTN01086|              lag-2

In [110]:
sub_topologie.join(traffic_result)


Unnamed: 0,equipement,client,port,portlldp,interface_netwk,port_lag_netwk
0,PTN00122,PTN00022,1/1/2,8/1/2,TO_PTN00022_LAG22_WDM_FTTA,lag-22
1,PTN00122,PTN00022,2/1/3,8/1/4,TO_PTN00022_LAG22_WDM_FTTA,lag-22
2,PTN00820,PTN00821,2/1/3,1/1/3,TO_PTN00821_LAG_21,lag-21
3,PTN00820,PTN00821,2/1/4,2/1/1,TO_PTN00821_LAG_21,lag-21
4,PTN00651,PTN00042,4/1/2,2/1/2,TO_PTN00042_LAG22,lag-22
...,...,...,...,...,...,...
3232,PTN01404,PTN01731,1/1/2,1/1/2,TO_PTN01731_LAG-20,lag-20
3233,PTN01404,PTN01731,1/1/3,1/1/3,TO_PTN01731_LAG-20,lag-20
3234,PTN01404,PTN00258,2/1/2,No LLDP,TO_PTN00258_LAG-27,lag-27
3235,PTN01404,PTN00258,2/1/3,No LLDP,TO_PTN00258_LAG-27,lag-27


In [99]:
traffic.dtypes

[('equipement', 'string'),
 ('interface_equipement', 'string'),
 ('date', 'string'),
 ('heure', 'string'),
 ('trafic_in', 'float'),
 ('trafic_out', 'float')]