# SNCF - OPEN DATA - API TRANSILIEN - "PROCHAINS DEPARTS"

In [1]:
import json
import requests
import xmltodict
from collections import OrderedDict

In [2]:
class TransilienApi:
    """Minimal client for the SNCF Transilien "next departures" API.

    Typical usage: load the station dictionary with `load_stations_code`,
    select the stations to poll with `set_managed_stations`, optionally attach
    a converter with `set_converter`, then call `poll_trains_data`.
    """

    # Seconds to wait for the API before giving up on a request. Without a
    # timeout, `requests.get` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, login, password, managed_stations=None):
        """Store the API credentials and the optional list of station codes to poll."""
        self.login = login
        self.passwd = password
        self.managed_stations = managed_stations
        self.trains_data = None
        # Mapping station code -> station record; filled by `load_stations_code`.
        self.stations_by_code = None
        self.stations_by_name = None
        # Optional output converter; see `set_converter`.
        self.converter = None

    def set_converter(self, converter):
        """Attach the `Converter` instance applied to the result of `poll_trains_data`."""
        assert(isinstance(converter, Converter))
        self.converter = converter

    def set_managed_stations(self, managed_stations):
        """Select the stations to poll.

        Args:
            managed_stations: either the string '*' (every station known from
                `load_stations_code`) or a non-empty list of station codes.
        """
        if managed_stations == '*':
            # Use this instance's own dictionary rather than a notebook global.
            assert(self.stations_by_code is not None)
            self.managed_stations = list(self.stations_by_code.keys())
        else:
            assert(isinstance(managed_stations, list))
            assert(len(managed_stations))
            self.managed_stations = managed_stations

    def load_stations_code(self, fullpath):
        """Load the station dictionary from a UTF-8 JSON file, sorted by station code.

        Each value is expected to provide 'label', 'latitude' and 'longitude'
        (the keys read by `parse_xml_data`).
        """
        with open(fullpath, "r", encoding='utf-8') as f:
            raw_stations = json.load(f)
        self.stations_by_code = OrderedDict(sorted(raw_stations.items(), key=lambda x: x[0]))

    def get_stations_code(self):
        """Return the station dictionary (None until `load_stations_code` is called)."""
        return self.stations_by_code

    def poll_trains_data(self):
        """Query the API once per managed station and return the departures.

        Returns:
            A dict mapping station code to the structure built by
            `parse_xml_data`, or whatever the attached converter makes of
            that dict when a converter is set.

        Raises:
            requests.RequestException: on network failure or timeout.
        """
        assert(self.managed_stations is not None)
        assert(self.stations_by_code is not None)
        trains_data = {}
        for station in self.managed_stations:
            url = f"https://api.transilien.com/gare/{station}/depart"
            response = requests.get(
                url,
                auth=(self.login, self.passwd),
                timeout=self.REQUEST_TIMEOUT,
            )
            trains_data[station] = self.parse_xml_data(station, response.content)
        if self.converter is not None:
            trains_data = self.converter.convert(trains_data)
        return trains_data

    def parse_xml_data(self, station, xml_response):
        """Turn the XML answer for one station into a plain dictionary.

        Args:
            station: station code; must be a key of `stations_by_code`.
            xml_response: raw XML payload returned by the API.

        Returns:
            {'station': {...}, 'departures': [...]}. Parsing is best-effort:
            if the payload cannot be processed, 'departures' is left empty.
        """
        station_trains_data = {
            'station': {
                'code': station,
                'label': self.stations_by_code[station]['label'],
                'latitude': self.stations_by_code[station]['latitude'],
                'longitude': self.stations_by_code[station]['longitude']},
            'departures': []
        }
        try:
            xml2dict = xmltodict.parse(xml_response)
            xml2dict_trains = xml2dict['passages']['train']
            # xmltodict yields a single mapping (not a list) when the payload
            # holds exactly one <train> element; normalize so that lone
            # departure is not lost.
            if not isinstance(xml2dict_trains, list):
                xml2dict_trains = [xml2dict_trains]
            for entry in xml2dict_trains:
                # project constraint: real date/time only
                #if entry['date']['@mode'] != 'R':
                #    continue
                # project constraint: trains belonging to the 'managed Transilien line' only
                # here we translate this constraint by selecting trains with 'known terminus' only
                terminus_code = entry.get('term', None)
                if terminus_code is None or terminus_code not in self.stations_by_code:
                    continue
                # ok, the current 'entry' is related to a train of the managed line
                date_fields = entry['date']['#text'].split(' ')
                train = {}
                train['date'] = date_fields[0]
                train['time'] = date_fields[1]
                train['number'] = entry['num']
                train['mission'] = entry['miss']
                # NOTE(review): 'label' holds the whole station record here, whereas
                # the 'station' entry above stores only its 'label' field - confirm
                # which one downstream consumers expect before changing it.
                train['terminus'] = {'code': terminus_code, 'label': self.stations_by_code[terminus_code]}
                station_trains_data['departures'].append(train)
        except Exception as e:
            # Deliberate best-effort: an unusable payload yields no departures.
            #print(e)
            station_trains_data['departures'] = []
        return station_trains_data

Les infos de login relatives à l'API Transilien sont chargées depuis un fichier local nommé 'api_transilien_login.json' contenant le dictionnaire suivant (attention il est important de mettre login et  password entre "quotes" afin qu'ils soient interprétés comme des chaines de caractères):
```
{
    "login": "xxxxxxx",
    "password": "xxxxxxx"
}
```
Vous pouvez créer ce fichier à l'aide de la cellule suivante. Il suffit de la convertir de 'Markdown' à 'Code' (cf. barre de menu du notebook) et de l'exécuter (pensez à changer le login et le password :-) sinon, recommencez : l'option 'w+' écrase le fichier existant).

Chargement des credentials...

In [3]:
# Read the Transilien API credentials ('login' and 'password' entries) from
# the local JSON file described above.
with open('./api_transilien_login.json', mode='r', encoding='utf-8') as credentials_file:
    credentials = json.load(credentials_file)

Liste des gares d'intérêt (gares pour lesquelles on souhaite obtenir les horaires de trains)

In [4]:
# Codes of the stations of interest; uncomment an entry to enable it.
managed_stations = [
    # '87384008',  # PARIS SAINT-LAZARE (GARE SAINT-LAZARE)
    '87381111',  # PONT CARDINET
]

Instanciation de la classe TransilienApi

In [5]:
# API client built from the credentials loaded above.
t = TransilienApi(
    login=credentials['login'],
    password=credentials['password'],
)

Chargement des dictionnaires (code-gare <-> label-gare) 

In [6]:
# Station dictionary (station code -> station record) for the managed line.
t.load_stations_code(fullpath="./transilien_line_l_stations_by_code.json")

In [7]:
# '*' selects every station present in the loaded station dictionary.
t.set_managed_stations(managed_stations='*')

Requête de données temps réel

Y'a plus qu'à pousser ça dans un message Kafka...

On définit une classe 'Converter' dont les classes filles ont pour rôle de convertir les données vers différents formats...

In [8]:
class Converter:
    """Base class for objects converting polled train data to an output format.

    Subclasses must override `convert`.
    """

    def convert(self, trains_data):
        """Convert `trains_data`; the base implementation always raises.

        Raises:
            NotImplementedError: always, since the base class defines no
                output format (still an `Exception` for existing handlers).
        """
        raise NotImplementedError("Converter.convert: default impl. called!")

'JsonConverter' est un 'Converter' dédié au format JSON...

In [9]:
class JsonConverter(Converter):
    """Converter flattening polled train data into a list of plain records."""

    @staticmethod
    def _to_record(station_code, train):
        """Build one departure record from a station code and a train entry."""
        clock = f"{train['time']}"
        # The date parts are '/'-separated; reversing them gives the order
        # used in the timestamp (presumably dd/mm/yyyy -> yyyy-mm-dd).
        iso_date = '-'.join(reversed(train['date'].split('/')))
        # NOTE(review): the 'Z' suffix marks the value as UTC - confirm the API
        # does not return local (Paris) time.
        stamp = f"{iso_date}T{clock}:00.000Z"
        return {
            'station': int(station_code),
            'timestamp': stamp,
            'train': train['number'],
        }

    def convert(self, trains_data):
        """Return one {'station', 'timestamp', 'train'} record per departure.

        Args:
            trains_data: mapping station code -> station entry holding a
                'departures' list, each item providing 'date', 'time' and
                'number'.
        """
        assert(isinstance(trains_data, (dict, OrderedDict)))
        return [
            self._to_record(station_code, train)
            for station_code, station_entry in trains_data.items()
            for train in station_entry['departures']
        ]

Attachons un 'JsonConverter' à notre instance de 'TransilienApi'...

In [10]:
# From now on poll_trains_data() returns the converter's output.
t.set_converter(converter=JsonConverter())

So far, so good... y'a plus qu'à pousser les départs dans Kafka (sérialisés en JSON puis encodés en UTF-8 par le 'value_serializer' du producer).

In [11]:
from kafka import KafkaProducer

In [12]:
def _encode_json_utf8(value):
    """Serialize a message value to UTF-8 encoded JSON bytes."""
    return json.dumps(value).encode('utf-8')


# Kafka producer settings, kept together so they are easy to review and tune.
producer_settings = {
    'client_id': 'transilien-producer-01',
    'bootstrap_servers': ['sandbox-hdp.hortonworks.com:6667'],
    'value_serializer': _encode_json_utf8,
    'acks': 0,
    'api_version': (0, 10, 1),
}
producer = KafkaProducer(**producer_settings)

In [13]:
import time

In [None]:
# Name of the Kafka topic the departures are sent to.
topic = "transilien-02"

In [None]:
# Number of polling rounds and pause between two rounds (seconds).
POLL_COUNT = 1000
POLL_INTERVAL_SECONDS = 30

for _ in range(POLL_COUNT):
    try:
        departures = t.poll_trains_data()
    except requests.RequestException as e:
        # A transient network failure must not abort the whole run: report it,
        # skip this round and retry after the usual pause.
        print(e)
        departures = []
    for departure in departures:
        try:
            producer.send(topic, departure)
        except Exception as e:
            print(e)
    time.sleep(POLL_INTERVAL_SECONDS)