In [84]:
# import all necessary package

from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
import pandas as pd

In [3]:
# create session
spark = SparkSession.builder.appName('ExoPySpark').getOrCreate()

In [85]:
# create a temporary table for each csv file
dir = 'Exo_PySpark'

data_path_dict = {
    'traffic':f'{dir}/fichier1_trafic.csv',
    'topologie':f'{dir}/fichier2_topologie.csv',
    'geographique':f'{dir}/fichier3_Donnees_geographique.csv',
    'isis':f'{dir}/fichier4_couts_ISIS.csv'
}

for table_name , path in data_path_dict.items():
    spark.read.option('header',True).csv(path).createOrReplaceTempView(table_name)


In [86]:
# check if all tables are included
spark.catalog.listTables()

[Table(name='geographique', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='isis', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='topologie', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='traffic', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [199]:
# Filter on file 2
topologie = spark.table('topologie')
isis =  spark.table('isis')
geographique = spark.table('geographique')
traffic = spark.table('traffic')

# Issues :
# there is row with admin_state_port = Connexion KO
# Is the filter expression the best way to do the task


sub_topologie = topologie.filter("equipement LIKE 'PTN%' AND client LIKE '%PTN%' AND admin_state_port!='down' AND vlan=='NA' AND type_port_ou_mda == '10G'")\
    .withColumn('id_lien', sf.concat(topologie.equipement, sf.lit('_'), topologie.client))\
    .select('equipement', 'client', 'port', 'portlldp', 'interface_netwk', 'port_lag_netwk', 'type_port_ou_mda')

In [200]:
# step to follow
# function which add a string value to the column name of a spark dataframe
def add_to_col_name(df, value):
    return [column_name + value for column_name in df.columns]

# 1. merge topology table filtered result with isis using inner join by (nom_equipement, port, interface_netwk)

isis_renamed = isis.withColumnRenamed("nom_equipement","equipement")
merged_result = sub_topologie.join(isis_renamed, on=['equipement', 'port', 'interface_netwk'], how='inner')

# 2. merge result by client = geographique.nom_equipement to give column with _A

geographique_renamed_column_A = geographique.toDF(*add_to_col_name(geographique, '_A'))
merged_result = merged_result.join(geographique_renamed_column_A, merged_result.equipement == geographique_renamed_column_A.nom_equipement_A)

# 3. merge result by client = geographique.nom_equipement to give column with _B

geographique_renamed_column_B = geographique.toDF(*add_to_col_name(geographique, '_B'))
merged_result = merged_result.join(geographique_renamed_column_B, merged_result.equipement == geographique_renamed_column_B.nom_equipement_B)


# 4.

In [201]:
# check the merge result
merged_result.toPandas()

Unnamed: 0,equipement,port,interface_netwk,client,portlldp,port_lag_netwk,type_port_ou_mda,isis_level,isis_metric,nom_equipement_A,role_A,configuration_A,region_exploitation_A,nom_equipement_B,role_B,configuration_B,region_exploitation_B
0,PTN00122,1/1/2,TO_PTN00022_LAG22_WDM_FTTA,PTN00022,8/1/2,lag-22,10G,1,10,PTN00122,B,7750-SR7,IDF,PTN00122,B,7750-SR7,IDF
1,PTN00122,2/1/3,TO_PTN00022_LAG22_WDM_FTTA,PTN00022,8/1/4,lag-22,10G,1,10,PTN00122,B,7750-SR7,IDF,PTN00122,B,7750-SR7,IDF
2,PTN00820,2/1/3,TO_PTN00821_LAG_21,PTN00821,1/1/3,lag-21,10G,1,10,PTN00820,B,7750-SR7,WST,PTN00820,B,7750-SR7,WST
3,PTN00820,2/1/4,TO_PTN00821_LAG_21,PTN00821,2/1/1,lag-21,10G,1,10,PTN00820,B,7750-SR7,WST,PTN00820,B,7750-SR7,WST
4,PTN00651,4/1/2,TO_PTN00042_LAG22,PTN00042,2/1/2,lag-22,10G,1,10,PTN00651,B,7750-SR7,MED,PTN00651,B,7750-SR7,MED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2825,PTN01404,1/1/2,TO_PTN01731_LAG-20,PTN01731,1/1/2,lag-20,10G,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2826,PTN01404,1/1/3,TO_PTN01731_LAG-20,PTN01731,1/1/3,lag-20,10G,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2827,PTN01404,2/1/2,TO_PTN00258_LAG-27,PTN00258,No LLDP,lag-27,10G,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST
2828,PTN01404,2/1/3,TO_PTN00258_LAG-27,PTN00258,No LLDP,lag-27,10G,1,10,PTN01404,B,7750-SR7,WST,PTN01404,B,7750-SR7,WST


In [55]:
# Partie II

Part II

In [60]:
# o	Identifier le maximum du trafic sur cet équipement entre le trafic rentrant et sortant (in & out), on appellera cette variable trafic_max

In [202]:
# change traffic_out and traffic_in type

traffic=traffic.withColumn('trafic_in', sf.regexp_replace('trafic_in', ',', '.'))
traffic=traffic.withColumn('trafic_out', sf.regexp_replace('trafic_out', ',', '.'))

traffic = traffic.withColumn('trafic_in', traffic.trafic_in.cast('float'))
traffic = traffic.withColumn('trafic_out', traffic.trafic_out.cast('float'))

traffic = traffic.filter("interface_equipement LIKE 'lag%'").withColumn('traffic_max',sf.greatest(traffic.trafic_in, traffic.trafic_out))


traffic_result = traffic.groupby(['equipement', 'interface_equipement']).max('traffic_max', 'trafic_out', 'trafic_in').toDF(*['equipement', 'interface_equipement', 'trafic_max', 'trafic_out', 'trafic_in'])

In [203]:
traffic_result.toPandas()

Unnamed: 0,equipement,interface_equipement,trafic_max,trafic_out,trafic_in
0,PTN00440,lag-133,33825.730469,2660.219971,33825.730469
1,PTN00070,lag-31,0.472888,0.472888,0.293776
2,PTN00462,lag-20,0.811944,0.811944,0.170688
3,PTN00992,lag-21,393.294098,74.954163,393.294098
4,PTN00773,lag-20,16094.252930,16094.252930,1219.979126
...,...,...,...,...,...
1436,PTN00104,lag-20,4282.311035,4282.311035,382.316437
1437,PTN00440,lag-10,5130.512695,5130.512695,416.335266
1438,PTN00142,lag-20,22.486240,10.415432,22.486240
1439,PTN00666,lag-22,3610.095703,407.226074,3610.095703


In [231]:
# select unique by 'equipement', 'client', 'port_lag_netwk'
sub_top_unique = sub_topologie.dropDuplicates(['equipement', 'client', 'port_lag_netwk']).select('equipement', 'client', 'port_lag_netwk', 'type_port_ou_mda')

In [232]:
result = sub_top_unique.join(traffic_result, (traffic_result.equipement == sub_top_unique.equipement) & (traffic_result.interface_equipement == sub_top_unique.port_lag_netwk)).drop(traffic_result.equipement)

In [233]:
result.toPandas()

Unnamed: 0,equipement,client,port_lag_netwk,type_port_ou_mda,interface_equipement,trafic_max,trafic_out,trafic_in
0,PTN00462,PTN01077,lag-20,10,lag-20,0.811944,0.811944,0.170688
1,PTN00992,PTN00622,lag-21,10,lag-21,393.294098,74.954163,393.294098
2,PTN00773,PTN00774,lag-20,10,lag-20,16094.252930,16094.252930,1219.979126
3,PTN00354,PTN00352,lag-21,10,lag-21,7224.821289,953.789001,7224.821289
4,PTN00218,PTN00327,lag-23,10,lag-23,726.676819,726.676819,152.194534
...,...,...,...,...,...,...,...,...
887,PTN00247,PTN00246,lag-20,10,lag-20,1443.899902,1443.899902,415.637573
888,PTN00104,PTN00100,lag-20,10,lag-20,4282.311035,4282.311035,382.316437
889,PTN00142,PTN00343,lag-20,10,lag-20,22.486240,10.415432,22.486240
890,PTN00666,PTN00335,lag-22,10,lag-22,3610.095703,407.226074,3610.095703


In [234]:
result.filter("equipement IN ('PTN00216','PTN00218') AND client IN ('PTN00216','PTN00218')").toPandas()

Unnamed: 0,equipement,client,port_lag_netwk,type_port_ou_mda,interface_equipement,trafic_max,trafic_out,trafic_in
0,PTN00218,PTN00216,lag-21,10,lag-21,7034.388184,867.169983,7034.388184
1,PTN00216,PTN00218,lag-21,10,lag-21,7023.918457,7023.918457,869.327148


In [235]:
geographique_renamed_column_A = geographique.toDF(*add_to_col_name(geographique, '_A'))
merged_result = result.join(geographique_renamed_column_A, result.equipement == geographique_renamed_column_A.nom_equipement_A)

# 3. merge result by client = geographique.nom_equipement to give column with _B

geographique_renamed_column_B = geographique.toDF(*add_to_col_name(geographique, '_B'))
merged_result = merged_result.join(geographique_renamed_column_B, merged_result.equipement == geographique_renamed_column_B.nom_equipement_B)


In [236]:
merged_result = merged_result.drop(*['nom_equipement_A', 'nom_equipement_B'])

In [237]:
# get capacite lien
sub_topologie=sub_topologie.withColumn('type_port_ou_mda', sf.regexp_replace('type_port_ou_mda', 'G', ''))
sub_topologie=sub_topologie.withColumn('type_port_ou_mda',sub_topologie.type_port_ou_mda.cast('int'))
capacite_lien = sub_topologie.groupby(['equipement', 'client']).sum('type_port_ou_mda').withColumnRenamed('sum(type_port_ou_mda)', 'capacite_lien')

In [278]:
res = merged_result.join(capacite_lien, on=['equipement', 'client'])

In [279]:
res.toPandas()

Unnamed: 0,equipement,client,port_lag_netwk,type_port_ou_mda,interface_equipement,trafic_max,trafic_out,trafic_in,role_A,configuration_A,region_exploitation_A,role_B,configuration_B,region_exploitation_B,capacite_lien
0,PTN00462,PTN01077,lag-20,10,lag-20,0.811944,0.811944,0.170688,B,7750-SR7,MED,B,7750-SR7,MED,40
1,PTN00992,PTN00622,lag-21,10,lag-21,393.294098,74.954163,393.294098,B,7750-SR7,NOE,B,7750-SR7,NOE,30
2,PTN00773,PTN00774,lag-20,10,lag-20,16094.252930,16094.252930,1219.979126,A,7750-SR12,IDF,A,7750-SR12,IDF,60
3,PTN00354,PTN00352,lag-21,10,lag-21,7224.821289,953.789001,7224.821289,B,7750-SR7,SWT,B,7750-SR7,SWT,20
4,PTN00218,PTN00327,lag-23,10,lag-23,726.676819,726.676819,152.194534,A,7750-SR12,NOE,A,7750-SR12,NOE,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,PTN00247,PTN00246,lag-20,10,lag-20,1443.899902,1443.899902,415.637573,B,7750-SR7,WST,B,7750-SR7,WST,40
888,PTN00104,PTN00100,lag-20,10,lag-20,4282.311035,4282.311035,382.316437,B,7750-SR7,MED,B,7750-SR7,MED,20
889,PTN00142,PTN00343,lag-20,10,lag-20,22.486240,10.415432,22.486240,B,7750-SR7,MED,B,7750-SR7,MED,40
890,PTN00666,PTN00335,lag-22,10,lag-22,3610.095703,407.226074,3610.095703,B,7750-SR7,WST,B,7750-SR7,WST,40


In [272]:
res = res.withColumn('tdc', ((res.trafic_max)/1000)/res.capacite_lien * 100)

In [273]:
def get_intervalle_charge(value):
    if value <=70:
        return 'Inf 70%'
    else:
        if value<=90:
            return 'Sup 70% Inf 90%'
        else:
            if value<100:
                return 'Sup 90%'
            else:
                return 'Erreur'

get_intervalle_charge_udf = sf.udf(get_intervalle_charge)

In [280]:
res.toPandas()

Unnamed: 0,equipement,client,port_lag_netwk,type_port_ou_mda,interface_equipement,trafic_max,trafic_out,trafic_in,role_A,configuration_A,region_exploitation_A,role_B,configuration_B,region_exploitation_B,capacite_lien
0,PTN00462,PTN01077,lag-20,10,lag-20,0.811944,0.811944,0.170688,B,7750-SR7,MED,B,7750-SR7,MED,40
1,PTN00992,PTN00622,lag-21,10,lag-21,393.294098,74.954163,393.294098,B,7750-SR7,NOE,B,7750-SR7,NOE,30
2,PTN00773,PTN00774,lag-20,10,lag-20,16094.252930,16094.252930,1219.979126,A,7750-SR12,IDF,A,7750-SR12,IDF,60
3,PTN00354,PTN00352,lag-21,10,lag-21,7224.821289,953.789001,7224.821289,B,7750-SR7,SWT,B,7750-SR7,SWT,20
4,PTN00218,PTN00327,lag-23,10,lag-23,726.676819,726.676819,152.194534,A,7750-SR12,NOE,A,7750-SR12,NOE,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,PTN00247,PTN00246,lag-20,10,lag-20,1443.899902,1443.899902,415.637573,B,7750-SR7,WST,B,7750-SR7,WST,40
888,PTN00104,PTN00100,lag-20,10,lag-20,4282.311035,4282.311035,382.316437,B,7750-SR7,MED,B,7750-SR7,MED,20
889,PTN00142,PTN00343,lag-20,10,lag-20,22.486240,10.415432,22.486240,B,7750-SR7,MED,B,7750-SR7,MED,40
890,PTN00666,PTN00335,lag-22,10,lag-22,3610.095703,407.226074,3610.095703,B,7750-SR7,WST,B,7750-SR7,WST,40


In [275]:
res = res.withColumn("interval_charge", get_intervalle_charge_udf('tdc'))