In [2]:
import pandas as pd
from datetime import datetime
from datetime import timedelta
import statistics
from math import sin, cos, sqrt, atan2, radians

In [6]:
cities_df= pd.read_csv("/content/cities.csv")
providers_df = pd.read_csv("/content/providers.csv")
stations_df = pd.read_csv("/content/stations.csv")
ticket_data_df = pd.read_csv("/content/ticket_data.csv")

In [24]:
#on crée des dictionnaires pour rendre l'accès plus rapide par la suite

def df_dic(df,keys,values):
    dic={}
    for _,row in df.iterrows():
        dic[row[keys]]= row[values]
    return dic

dic_city_to_id = df_dic(cities_df,"unique_name","id")
dic_id_to_city = df_dic(cities_df,"id","unique_name")
dic_id_to_station = df_dic(stations_df,"id","unique_name")
dic_cityid_to_long = df_dic(cities_df,"id","longitude")
dic_cityid_to_lat = df_dic(cities_df,"id","latitude")
dic_id_to_compagnie = df_dic(providers_df,"id","fullname")
dic_id_to_transport_type = df_dic(providers_df,"id","transport_type")

In [12]:
#on crée un index inversé afin de raccourcir les temps de recherche dans les trajets

try:
    index_inverse_villes
    print("l'index est déjà généré")
except:
    index_inverse_villes ={}
    for index,trajet in ticket_data_df.iterrows():
        if trajet["o_city"] in index_inverse_villes.keys():
            index_inverse_villes[trajet["o_city"]].append(trajet)
        else:
            index_inverse_villes[trajet["o_city"]]=[(trajet)]
    print("index inversé généré")

l'index est déjà généré


In [14]:
def temps_trajet(o_ts,d_ts):
    return datetime.strptime(d_ts, '%Y-%m-%d %H:%M:%S+%f') - datetime.strptime(o_ts, '%Y-%m-%d %H:%M:%S+%f')

class Trajet:
    def __init__(self,row):
        self.id = row["id"]
        self.company = row["company"]
        self.o_station = row["o_station"]
        self.d_station = row["d_station"]
        self.departure_ts = row["departure_ts"]
        self.arrival_ts = row["arrival_ts"] 
        self.price_in_cents = row["price_in_cents"]
        self.search_ts = row["search_ts"]
        self.middle_stations = row["middle_stations"]
        self.other_companies = row["other_companies"]
        self.o_city = row["o_city"]
        self.d_city = row["d_city"] 
    def temps_trajet(self):
        return datetime.strptime(self.arrival_ts, '%Y-%m-%d %H:%M:%S+%f') - datetime.strptime(self.departure_ts, '%Y-%m-%d %H:%M:%S+%f')
    def distance(self):
      #distance euclidienne entre deux points
            R = 6373.0
            lat1 = radians(dic_cityid_to_lat[self.o_city])
            lon1 = radians(dic_cityid_to_long[self.o_city])
            lat2 = radians(dic_cityid_to_lat[self.d_city])
            lon2 = radians(dic_cityid_to_long[self.d_city])
            dlon = lon2 - lon1
            dlat = lat2 - lat1
            a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
            c = 2 * atan2(sqrt(a), sqrt(1 - a))
            return R * c
    def __str__(self):
        try:
            o_city_str = dic_id_to_city[self.o_city]
        except:
            print("[ERREUR] ville de départ absente de la BDD" )
            o_city_str = "---"
        try:
            d_city_str = dic_id_to_city[self.d_city]
        except:
            print("[ERREUR] ville de d'arrivée absente de la BDD")
            d_city_str = "---" 

        return o_city_str + " -> " + d_city_str + " temps: "+ str(temps_trajet(self.departure_ts,self.arrival_ts)) + " prix: "+str(self.price_in_cents/100)+"€"


In [30]:
#repertorie les trajets entre deux villes 
def trajets_original_to_destination(o_city,d_city):
    list=[]
    for trajet in index_inverse_villes[dic_city_to_id[o_city]]:
        if (dic_city_to_id[d_city] == int(trajet["d_city"])): 
            list.append(Trajet(trajet))
    return list

def str_to_date(str):
    date_time_obj = datetime.datetime.strptime(str, '%Y-%m-%d %H:%M:%S.%f')
    print(date_time_obj.date())

def info_temps_depart_arrivee(city1,city2):
    list = trajets_original_to_destination(city1,city2)
    trajet_min=list[0]
    print(city1 + " -> "+city2)
    #MIN
    for trajet in list:
        if trajet.temps_trajet()<trajet_min.temps_trajet():
            trajet_min = trajet
    print("trajet le plus court : "+str(trajet_min.id) + " - " +str(trajet_min.temps_trajet()))
    #MAX
    trajet_min=list[0]
    for trajet in list:
        if trajet.temps_trajet()>trajet_min.temps_trajet():
            trajet_min = trajet
    print("trajet le plus long : "+str(trajet_min.id) + " - " +str(trajet_min.temps_trajet()))
    #AVG
    sum=0
    for trajet in list:
        sum += trajet.temps_trajet().seconds
    print("moyenne des durées : "+str(timedelta(seconds=round(sum/len(list)))))
    print("\n")


def info_prix_depart_arrivee(city1,city2):
    list = trajets_original_to_destination(city1,city2)
    trajet_min=list[0]
    print(city1 + " -> "+city2)
    #MIN
    for trajet in list:
        if trajet.price_in_cents<trajet_min.price_in_cents:
            trajet_min = trajet
    print("trajet le moins cher : " + str(trajet_min.id) + " - " + str(trajet_min.price_in_cents/100) + "€")
    #MAX
    trajet_min=list[0]
    for trajet in list:
        if trajet.price_in_cents>trajet_min.price_in_cents:
            trajet_min = trajet
    print("trajet le plus cher : " + str(trajet_min.id) + " - " + str(trajet_min.price_in_cents/100) + "€")
    #AVG
    sum=0
    for trajet in list:
        sum += trajet.price_in_cents
    print("moyenne des prix : "+str(round(sum/(100*len(list)),2))+"€" )
    print("\n")

#on centre et on reduit le temps pour pouvoir comparer les temps et les prix. on additionne les deux afin d'obtenir un score de performance et on classe selon ce score. On pourrait rajouter des coeficients pour le prix ou le temps
def score_trajet(city1,city2):
    liste = trajets_original_to_destination(city1,city2)
    list_prix = list(map(lambda x: x.price_in_cents,liste))
    mean_price = statistics.mean(list_prix)
    pstdev_price = statistics.pstdev(list_prix)
    key_price = lambda x:((x.price_in_cents-mean_price)/pstdev_price)
    list_temps = list(map(lambda x: x.temps_trajet().total_seconds(),liste))
    mean_temps = statistics.mean(list_temps)
    pstdev_temps = statistics.pstdev(list_temps)
    key_temps = lambda x:round(((x.temps_trajet().total_seconds()-mean_temps)/pstdev_temps),0)
    return sorted(liste,key = lambda x :key_price(x)+key_temps(x))

def analyse_transport(city1,city2):
    print(city1 +" -> "+city2)
    sc = trajets_original_to_destination(city1,city2)
    dic_transport = {}
    dic_result = {}
    for trajet in sc:
        transport_type = dic_id_to_transport_type[trajet.company]
        if transport_type not in dic_transport.keys():
            dic_transport[transport_type] = []
        dic_transport[transport_type].append(trajet)
    for t_type in dic_transport.keys():
        prix_min = round(min(list(map(lambda x:x.price_in_cents,dic_transport[t_type])))/100,2)
        prix_moyen = round(statistics.mean(list(map(lambda x:x.price_in_cents,dic_transport[t_type])))/100,2)
        prix_max = round(max(list(map(lambda x:x.price_in_cents,dic_transport[t_type])))/100,2)
        temps_min = timedelta(seconds = round(min(list(map(lambda x:x.temps_trajet().total_seconds(),dic_transport[t_type])))))
        temps_moyen = timedelta(seconds = round(statistics.mean(list(map(lambda x:x.temps_trajet().total_seconds(),dic_transport[t_type])))))
        temps_max = timedelta(seconds = round(max(list(map(lambda x:x.temps_trajet().total_seconds(),dic_transport[t_type])))))
        dic_result[t_type]=[prix_min,prix_moyen,prix_max,temps_min,temps_moyen,temps_max]
    return dic_result   

def print_analyse_transport(dic):
    for t_type in dic.keys():
        print(t_type + " - prix minimum : " + str(dic[t_type][0])+"€")
        print(t_type + " - prix moyen : " + str(dic[t_type][1])+"€")
        print(t_type + " - prix maximum : " + str(dic[t_type][2])+"€")
        print(t_type + " - temps minimum : " +str(dic[t_type][3]))
        print(t_type + " - temps moyen : " +str(dic[t_type][4]))
        print(t_type + " - temps maximum : " +str(dic[t_type][5]))

  

In [21]:
info_temps_depart_arrivee("orleans","montpellier")
info_prix_depart_arrivee("orleans","montpellier")

orleans -> montpellier
trajet le plus court : 6795025 - 6:10:00
trajet le plus long : 6795027 - 1 day, 7:57:00
moyenne des durées : 15:03:36


orleans -> montpellier
trajet le moins cher : 6795026 - 14.5€
trajet le plus cher : 6795028 - 135.0€
moyenne des prix : 69.22€




In [22]:
info_prix_depart_arrivee("paris","lille")
info_temps_depart_arrivee("paris","lille")

paris -> lille
trajet le moins cher : 6800181 - 10.0€
trajet le plus cher : 6839070 - 134.5€
moyenne des prix : 20.31€


paris -> lille
trajet le plus court : 6814308 - 1:08:00
trajet le plus long : 6832420 - 1 day, 13:20:00
moyenne des durées : 3:07:22




In [31]:
print(print_analyse_transport(analyse_transport("paris","lille")))

paris -> lille
carpooling - prix minimum : 11.0€
carpooling - prix moyen : 16.89€
carpooling - prix maximum : 134.5€
carpooling - temps minimum : 1:50:00
carpooling - temps moyen : 2:41:56
carpooling - temps maximum : 14:20:00
bus - prix minimum : 10.0€
bus - prix moyen : 18.79€
bus - prix maximum : 26.8€
bus - temps minimum : 4:10:00
bus - temps moyen : 12:02:15
bus - temps maximum : 1 day, 13:20:00
train - prix minimum : 19.5€
train - prix moyen : 42.39€
train - prix maximum : 98.0€
train - temps minimum : 1:08:00
train - temps moyen : 3:29:04
train - temps maximum : 1 day, 9:29:00
None


In [32]:
trajet_par_distance=[[],[],[],[]]
for index,row in ticket_data_df.iterrows():
    trajet = Trajet(row)
    if trajet.distance()<=200:
        trajet_par_distance[0].append(trajet)
    elif trajet.distance()<=800:
        trajet_par_distance[1].append(trajet)
    elif trajet.distance()<=2000:
        trajet_par_distance[2].append(trajet)
    else:
        trajet_par_distance[3].append(trajet)

In [33]:
for sc in trajet_par_distance:
    print(len(sc))
    dic_transport = {}
    dic_result = {}
    for trajet in sc:
        transport_type = dic_id_to_transport_type[trajet.company]
        if transport_type not in dic_transport.keys():
            dic_transport[transport_type] = []
        dic_transport[transport_type].append(trajet)
    for t_type in dic_transport.keys():
        prix_min = round(min(list(map(lambda x:x.price_in_cents,dic_transport[t_type])))/100,2)
        prix_moyen = round(statistics.mean(list(map(lambda x:x.price_in_cents,dic_transport[t_type])))/100,2)
        prix_max = round(max(list(map(lambda x:x.price_in_cents,dic_transport[t_type])))/100,2)
        temps_min = timedelta(seconds = round(min(list(map(lambda x:x.temps_trajet().total_seconds(),dic_transport[t_type])))))
        temps_moyen = timedelta(seconds = round(statistics.mean(list(map(lambda x:x.temps_trajet().total_seconds(),dic_transport[t_type])))))
        temps_max = timedelta(seconds = round(max(list(map(lambda x:x.temps_trajet().total_seconds(),dic_transport[t_type])))))
        dic_result[t_type]=[prix_min,prix_moyen,prix_max,temps_min,temps_moyen,temps_max]
    for t_type in dic_result.keys():
        print(t_type + " - prix minimum : " + str(dic_result[t_type][0])+"€")
        print(t_type + " - prix moyen : " + str(dic_result[t_type][1])+"€")
        print(t_type + " - prix maximum : " + str(dic_result[t_type][2])+"€")
        print(t_type + " - temps minimum : " +str(dic_result[t_type][3]))
        print(t_type + " - temps moyen : " +str(dic_result[t_type][4]))
        print(t_type + " - temps maximum : " +str(dic_result[t_type][5]))
    print("\n")


13724
carpooling - prix minimum : 3.0€
carpooling - prix moyen : 11.77€
carpooling - prix maximum : 128.5€
carpooling - temps minimum : 0:20:00
carpooling - temps moyen : 1:57:07
carpooling - temps maximum : 17:20:00
bus - prix minimum : 8.5€
bus - prix moyen : 21.83€
bus - prix maximum : 229.0€
bus - temps minimum : 1:05:00
bus - temps moyen : 10:13:57
bus - temps maximum : 13 days, 5:45:00
train - prix minimum : 4.9€
train - prix moyen : 34.88€
train - prix maximum : 251.0€
train - temps minimum : 0:39:00
train - temps moyen : 4:25:10
train - temps maximum : 1 day, 10:27:00


58877
carpooling - prix minimum : 8.5€
carpooling - prix moyen : 32.18€
carpooling - prix maximum : 138.0€
carpooling - temps minimum : 1:20:00
carpooling - temps moyen : 4:45:47
carpooling - temps maximum : 19:50:00
bus - prix minimum : 10.0€
bus - prix moyen : 34.95€
bus - prix maximum : 224.8€
bus - temps minimum : 2:29:00
bus - temps moyen : 15:04:29
bus - temps maximum : 20 days, 12:51:00
train - prix minim