In [133]:
from tensorflow import keras

m = 16  # 2^n
model = keras.Sequential([
    keras.Input(shape=(5,)),
    keras.layers.Dense(m, activation='relu'),
    keras.layers.Dense(m, activation='relu'),
    keras.layers.Dense(m, activation='relu'),
    keras.layers.Dense(m, activation='relu'),
    keras.layers.Dense(m, activation='relu'),
    keras.layers.Dense(3, activation='sigmoid')])  # sigmoid a verifier

model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])  #try d'autre loss



In [134]:
import os
import pandas as pd
import math
import numpy as np
from sklearn.model_selection import train_test_split

In [135]:
def distance_haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Rayon de la Terre en kilomètres
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

In [136]:
def create_distance_matrix(data):
    n = len(data)
    matrix = np.zeros((n, n))
    for i, row1 in enumerate(data.itertuples()):
        for j, row2 in enumerate(data.itertuples()):
            if i != j:
                matrix[i][j] = distance_haversine(row1.latitude, row1.longitude, row2.latitude,
                                                  row2.longitude)
    return matrix

In [137]:
def avg_distance(df, nomLine, sens):
    # Filtrer les lignes en fonction de la condition nomcourtligne == 'C1' et sens == 0
    filtered_df = df[df["nomcourtligne"] == nomLine]
    filtered_df = filtered_df[filtered_df["sens"] == sens]

    # Extraire les coordonnées et les diviser en latitude et longitude
    filtered_df['latitude'] = filtered_df['coordonnees'].apply(
        lambda x: float(x.replace("[", "").split(";")[0].strip('[]')))
    filtered_df['longitude'] = filtered_df['coordonnees'].apply(
        lambda x: float(x.replace("]", "").split(";")[1].strip('[]')))

    # Create a distance matrix from the data
    distance_matrix = create_distance_matrix(filtered_df)

    avg_distance = 0
    for line in distance_matrix:
        if sum(line) / len(line) > avg_distance:
            avg_distance = sum(line) / len(line)
    return avg_distance

In [138]:
def avg_time_diff(df, nomLine, sens):
    # Filtrer les lignes en fonction de la condition nomcourtligne == 'C1' et sens == 0
    filtered_df = df[df["nomcourtligne"] == nomLine]
    filtered_df = filtered_df[filtered_df["sens"] == sens]

    # Extract the "ecartsecondes" column and convert it to a list
    time_diffs = filtered_df['ecartsecondes'].tolist()

    # Calculate the average time difference
    if time_diffs:
        avg_time_diff = sum(time_diffs) / len(time_diffs)
    else:
        avg_time_diff = None
    return avg_time_diff

In [139]:
def bus_count(df, nomLine, sens):
    # Filtrer les lignes en fonction de la condition nomcourtligne == 'C1' et sens == 0
    filtered_df = df[df["nomcourtligne"] == nomLine]
    filtered_df = filtered_df[filtered_df["sens"] == sens]

    # Obtenir le nombre de bus
    bus_count = len(filtered_df)
    return bus_count

In [140]:
def lenOfLine(nomLine, sens):
    with open('tco-bus-topologie-parcours-td.csv', 'r') as f:
        lines = f.readlines()
        lines = [line.strip().replace('"', '') for line in lines]
        lines = [line.split(';') for line in lines]
    #convert the list to a dataframe with line 0 the header
    dfTopo = pd.DataFrame(lines[1:], columns=lines[0])
    #get the value the column Longueur if Ligne (nom court) == nomLine and Code du sens == sens
    return dfTopo.loc[(dfTopo['Ligne (nom court)'] == nomLine.replace("'", "").replace(" ", "")) & (
            dfTopo['Code du sens'] == str(sens)) & (
                              dfTopo['Type'] == 'Principal'), 'Longueur'].iloc[0]

In [141]:
def getDay(df):
    #get date from df and convert format YYYY-MM-DD into format YYYY-MM-DD
    D = pd.to_datetime(df['date']).dt.day_name().iloc[0]
    D = D.replace('Monday', '0').replace('Tuesday', '0').replace('Wednesday', '0').replace('Thursday',
                                                                                           '0').replace(
        'Friday', '0').replace('Saturday', '1').replace('Sunday', '2')
    return int(D)

In [142]:
def getMaxFreq(df, nomLine, level):
    with open('mkt-frequentation-niveau-freq-max-ligne.csv', 'r') as f:
        lines = f.readlines()
        lines = [line.strip().replace('"', '') for line in lines]
        lines = [line.split(';') for line in lines]
    #convert the list to a dataframe with line 0 the header
    dfFreq = pd.DataFrame(lines[1:], columns=lines[0])

    # get houre from df and convert format HH:MM:SS into format HH:MM and round to the nearest 30 minutes
    H = pd.to_datetime(df['heure']).dt.round('30min').dt.time.iloc[0]
    H = str(H).replace(":00:00", ":00").replace(":30:00", ":30")
    # get date from df and convert format YYYY-MM-DD into format lundi, mardi, mercredi, jeudi, vendredi, samedi, dimanche
    D = pd.to_datetime(df['date']).dt.day_name().iloc[0]
    # convert to french
    D = D.replace('Monday', 'Lundi-vendredi').replace('Tuesday', 'Lundi-vendredi').replace(
        'Wednesday', 'Lundi-vendredi').replace('Thursday', 'Lundi-vendredi').replace('Friday',
                                                                                     'Lundi-vendredi').replace(
        'Saturday', 'Samedi').replace('Sunday', 'Dimanche')

    # get the value the column Niveau_fréquentation if Ligne == nomLine and Tranche_horaire == H and Jour == D
    tmp = dfFreq.loc[(dfFreq['Ligne'] == nomLine.replace("'", "").replace(" ", "")) & (
            dfFreq['Tranche_horaire'] == str(H)) & (
                             dfFreq['Jour_semaine'] == D), 'Niveau_fréquentation'].iloc[0]

    if level == tmp:
        return 1
    else:
        return 0



In [144]:
# Chemin du dossier contenant les fichiers CSV
data_folder = "dataTest"

nomLine = " 'C1'"
sens = 0

dfTotal = pd.DataFrame(
    columns=['avg_distance', 'avg_time_diff', 'bus_count', 'length', 'day', 'maxFreqL', 'maxFreqM', 'maxFreqH'])

# Parcourir tous les fichiers du dossier
# Convert the data type to float32
for file in os.listdir(data_folder):
    # Vérifier si le fichier est un fichier CSV
    if file.endswith(".csv"):
        # Construire le chemin complet du fichier
        file_path = os.path.join(data_folder, file)

        # Lire le fichier CSV et le convertir en dataframe
        df = pd.read_csv(file_path)

        # Create a new dataframe with the average distance difference, average time difference, and the number of buses, the length of the line , the max frequency
        data = {'avg_distance': [avg_distance(df, nomLine, sens)],
                'avg_time_diff': [avg_time_diff(df, nomLine, sens)],
                'bus_count': [bus_count(df, nomLine, sens)], 'length': [lenOfLine(nomLine, sens)],
                'day': [getDay(df)],
                'maxFreqL': [getMaxFreq(df, nomLine, 1)],
                'maxFreqM': [getMaxFreq(df, nomLine, 2)],
                'maxFreqH': [getMaxFreq(df, nomLine, 3)]}
        summary_df = pd.DataFrame(data, columns=['avg_distance', 'avg_time_diff', 'bus_count', 'length',
                                                 'day', 'maxFreqL', 'maxFreqM', 'maxFreqH'])

        # add the new dataframe to the total dataframe with concat
        dfTotal = pd.concat([dfTotal, summary_df], ignore_index=True)

dfTotal = dfTotal.astype(np.float32)
print(dfTotal.head())

# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    dfTotal[['avg_distance', 'avg_time_diff', 'bus_count', 'length', 'day']],
    dfTotal[['maxFreqL', 'maxFreqM', 'maxFreqH']], test_size=0.2)


   avg_distance  avg_time_diff  bus_count        length  day  maxFreqL  \
0    212.054642     145.500000        6.0  13219.599609  1.0       0.0   
1    212.054642     145.500000        6.0  13219.599609  1.0       0.0   
2    210.000168     140.166672        6.0  13219.599609  1.0       0.0   
3    210.000168     140.166672        6.0  13219.599609  1.0       0.0   
4    213.290405     156.333328        6.0  13219.599609  1.0       0.0   

   maxFreqM  maxFreqH  
0       0.0       0.0  
1       0.0       0.0  
2       0.0       0.0  
3       0.0       0.0  
4       0.0       0.0  


In [145]:
model.fit(X_train, y_train, epochs=500, validation_data=(X_test, y_test))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7f07ea3b5f60>

In [146]:
model.evaluate(X_test, y_test)



[0.0, 0.0]