# PROJET HADOOP/SPARK - MS-SIO-2019 - API SNCF TRANSILIEN - PARTIE I & II
## SPARK STRUCTURED STREAMING - KAFKA PRODUCER
### P.Hamy, N.Leclercq, L.Poncet - MS-SIO-2019

In [None]:
import json
import time
import logging
import requests
import xmltodict
from collections import OrderedDict
from kafka import KafkaProducer 
from task import *
from api_transilien_tools import NotebookCellContent

In [None]:
# Configure the root logger: messages are prefixed with an HH:MM:SS timestamp
# and their level, and only records at ERROR level or above are emitted.
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.ERROR, datefmt='%H:%M:%S')

Les infos de login relatives à l'API Transilien sont chargées depuis un fichier local nommé 'api_transilien_login.json' contenant le dictionnaire suivant (attention : il est important de mettre login et password entre "quotes" afin qu'ils soient interprétés comme des chaînes de caractères) :
```
{
    "login": "xxxxxxx",
    "password": "xxxxxxx"
}
```
Vous pouvez créer ce fichier à l'aide de la cellule suivante. Il suffit de la convertir de 'Markdown' à 'Code' (cf. barre de menu du notebook) et de l'exécuter (pensez à changer le login et le password :-) sinon, recommencez : l'option 'w+' écrase le fichier existant).

**TransilienApi** : classe d'interface de l'API SNCF 

In [None]:
class TransilienApi(NotebookCellContent):
    """Client for the SNCF Transilien 'departures' API.

    The class polls ``https://api.transilien.com/gare/<station>/depart`` for
    the managed stations, parses the XML answer into a dictionary and, when a
    converter is installed, hands the result to it.

    Credentials are read from a JSON file holding either a single
    ``login``/``password`` pair or ``logins``/``passwords`` lists. Logins,
    passwords and stations are each consumed in round-robin order.
    """

    # Upper bound (in seconds) for one HTTP request, so that an unresponsive
    # server cannot block the polling thread forever.
    REQUEST_TIMEOUT = 10.

    def __init__(self, credentials_file, managed_stations=None, log_output=None):
        """Create the client and load the credentials.

        Args:
            credentials_file: path of the JSON credentials file.
            managed_stations: optional list of station codes to poll.
            log_output: forwarded to NotebookCellContent as ``output``.

        Raises:
            OSError: the credentials file cannot be opened.
            KeyError: the file provides neither ``logins`` nor ``login``
                (resp. ``passwords`` nor ``password``).
        """
        NotebookCellContent.__init__(self, output=log_output)
        self.managed_stations = managed_stations
        self.trains_data = None
        self.stations_by_code = None
        self.stations_by_name = None
        self.converter = None
        self.stations_iterator = None
        self.logins = None
        self.passwords = None
        self.logins_iterator = None
        self.passwords_iterator = None
        self.__load_credentials(credentials_file)

    def __load_credentials(self, credentials_file):
        """Read the credentials file and populate ``logins``/``passwords``."""
        with open(credentials_file, 'r', encoding='utf-8') as f:
            self.credentials = json.load(f)
        # The single-valued keys are only looked up when the list-valued ones
        # are absent: a file holding only 'logins'/'passwords' is valid.
        if 'logins' in self.credentials:
            self.logins = self.credentials['logins']
        else:
            self.logins = [self.credentials['login']]
        if 'passwords' in self.credentials:
            self.passwords = self.credentials['passwords']
        else:
            self.passwords = [self.credentials['password']]

    def set_converter(self, converter):
        """Install the Converter applied to the polled data before returning it."""
        assert(isinstance(converter, Converter))
        self.converter = converter

    def set_managed_stations(self, managed_stations):
        """Select the stations to poll.

        Args:
            managed_stations: a non-empty list of station codes, or ``'*'`` to
                select every station loaded by ``load_stations_code`` (which
                must then have been called beforehand).
        """
        if managed_stations == '*':
            self.managed_stations = list(self.get_stations_code().keys())
        else:
            assert(isinstance(managed_stations, list))
            assert(len(managed_stations))
            self.managed_stations = managed_stations

    def load_stations_code(self, fullpath):
        """Load the stations description (a JSON object keyed by station code).

        The entries are stored in an OrderedDict sorted by station code.
        """
        with open(fullpath, "r", encoding='utf-8') as f:
            tmp = json.load(f)
        self.stations_by_code = OrderedDict(sorted(tmp.items(), key=lambda x: x[0]))

    def get_stations_code(self):
        """Return the stations dictionary, or None when nothing has been loaded."""
        return self.stations_by_code

    def __next_login(self):
        """Return the next login, restarting from the first one when exhausted."""
        try:
            return next(self.logins_iterator)
        except (StopIteration, TypeError):
            # TypeError: the iterator has not been created yet (still None)
            self.logins_iterator = iter(self.logins)
            return next(self.logins_iterator)

    def __next_password(self):
        """Return the next password, restarting from the first one when exhausted."""
        try:
            return next(self.passwords_iterator)
        except (StopIteration, TypeError):
            # TypeError: the iterator has not been created yet (still None)
            self.passwords_iterator = iter(self.passwords)
            return next(self.passwords_iterator)

    def __next_station(self):
        """Return the next managed station, restarting from the first one when exhausted."""
        try:
            return next(self.stations_iterator)
        except (StopIteration, TypeError):
            # TypeError: the iterator has not been created yet (still None)
            self.stations_iterator = iter(self.managed_stations)
            return next(self.stations_iterator)

    def __fetch_station(self, station):
        """Request the departures of ``station`` and return the parsed data.

        Logins and passwords are rotated independently, so they stay paired
        only if both lists have the same length (or one of them has a single
        entry).
        """
        url = f"https://api.transilien.com/gare/{station}/depart"
        response = requests.get(
            url,
            auth=(self.__next_login(), self.__next_password()),
            timeout=self.REQUEST_TIMEOUT)
        self.debug(f"API: request response {response}")
        return self.__parse_xml_data(station, response.content)

    def poll_next_station_data(self):
        """Poll the next stations of the rotation, one request per login.

        Each call issues ``len(self.logins)`` requests; the station rotation
        carries on across calls.

        Returns:
            A dict keyed by station code, or the converter output when a
            converter is installed.
        """
        assert(self.managed_stations is not None)
        assert(self.stations_by_code is not None)
        trains_data = {}
        for _ in range(len(self.logins)):
            station = self.__next_station()
            self.debug(f"API: polling data from station {station}...")
            trains_data[station] = self.__fetch_station(station)
        if self.converter is not None:
            trains_data = self.converter.convert(trains_data)
        return trains_data

    def poll_trains_data_backup(self):
        """Poll every managed station in a single call.

        Returns:
            A dict keyed by station code, or the converter output when a
            converter is installed.
        """
        assert(self.managed_stations is not None)
        assert(self.stations_by_code is not None)
        trains_data = {}
        for station in self.managed_stations:
            # use the rotating credentials: the instance has no single
            # 'login'/'passwd' attribute
            trains_data[station] = self.__fetch_station(station)
        if self.converter is not None:
            trains_data = self.converter.convert(trains_data)
        return trains_data

    def __parse_xml_data(self, station, xml_response):
        """Convert the XML answer for ``station`` into a dictionary.

        Returns:
            ``{'station': {...}, 'departures': [...]}``. The departures list
            is empty whenever the answer cannot be parsed.
        """
        station_info = self.stations_by_code[station]
        station_trains_data = {
            'station': {
                'code': station,
                'label': station_info['label'],
                'latitude': station_info['latitude'],
                'longitude': station_info['longitude']},
            'departures': []
        }
        try:
            xml2dict = xmltodict.parse(xml_response)
            xml2dict_trains = xml2dict['passages']['train']
            # xmltodict yields a list only for repeated elements: a lone
            # <train> comes back as a single mapping and must be wrapped
            if not isinstance(xml2dict_trains, list):
                xml2dict_trains = [xml2dict_trains]
            for entry in xml2dict_trains:
                # project constraint: trains belonging to the 'managed Transilien line' only
                # here we translate this constraint by selecting trains with 'known terminus' only
                terminus_code = entry.get('term', None)
                if terminus_code is None or terminus_code not in self.stations_by_code:
                    continue
                # ok, the current 'entry' is related to a train of the managed line
                date_parts = entry['date']['#text'].split(' ')
                train = {}
                train['date'] = date_parts[0]
                train['time'] = date_parts[1]
                train['number'] = entry['num']
                train['mission'] = entry['miss']
                train['mode'] = entry['date']['@mode']
                # NOTE(review): 'label' receives the whole station record here, whereas
                # the 'station' entry above stores only its 'label' field - confirm
                train['terminus'] = {'code': terminus_code, 'label': self.stations_by_code[terminus_code]}
                station_trains_data['departures'].append(train)
        except Exception as e:
            # best effort: an unusable answer (HTTP error body, no train, malformed
            # XML...) yields no departure at all, but the reason is logged
            self.debug(f"API: unable to parse the departures of station {station}: {e!r}")
            station_trains_data['departures'] = []
        return station_trains_data

On définit une classe **Converter** dont les classes filles ont pour rôle de convertir les données vers différents formats.

In [None]:
class Converter:
    """Base class of the objects converting polled train data to another format.

    Child classes are expected to override ``convert``.
    """

    def convert(self, trains_data):
        """Convert ``trains_data``; the default implementation always fails.

        Args:
            trains_data: the data to convert.

        Raises:
            NotImplementedError: always, since a child class must provide the
                actual conversion.
        """
        raise NotImplementedError("Converter.convert: default impl. called!")

**JsonConverter** est un **Converter** dédié au format json...

In [None]:
class JsonConverter(Converter):
    """Converter flattening the polled data into a list of departure records."""

    @staticmethod
    def _as_record(station_code, train):
        """Build the flat record describing one departure from ``station_code``."""
        clock = f"{train['time']}"
        # the '/'-separated date parts are reversed and joined with '-'
        # (presumably DD/MM/YYYY -> YYYY-MM-DD; verify against the API)
        iso_day = '-'.join(reversed(train['date'].split('/')))
        return {
            # station identifier (number)
            'station': int(station_code),
            # train identifier (string)
            'train': train['number'],
            # departure time (string): seconds and milliseconds are zeroed
            'timestamp': f"{iso_day}T{clock}:00.000Z",
            # departure mode (string)
            'mode': train['mode'],
            # mission code (string)
            'mission': train['mission'],
            # terminus (i.e. station) identifier (number)
            'terminus': int(train['terminus']['code']),
        }

    def convert(self, trains_data):
        """Return one record per departure found in ``trains_data``.

        Args:
            trains_data: dict keyed by station code; each value holds a
                'departures' list of train dictionaries.

        Returns:
            A list of dictionaries with the keys 'station', 'train',
            'timestamp', 'mode', 'mission' and 'terminus'.
        """
        assert(isinstance(trains_data, (dict, OrderedDict)))
        return [
            self._as_record(station_code, train)
            for station_code, station_entry in trains_data.items()
            for train in station_entry['departures']
        ]

**KafkaProducerTask** : une task (i.e. thread avec message queue) qui gère le polling des données sur l'API SNCF et leur injection dans le stream Kafka.

In [None]:
class KafkaProducerTask(Task, NotebookCellContent):
    """Task polling the SNCF API and injecting the departures into a Kafka topic.

    Recognized ``config`` keys:
        'topic': name of the Kafka topic (required when sending data).
        'bootstrap_servers': Kafka brokers
            (default ['sandbox-hdp.hortonworks.com:6667']).
        'credentials': path of the API credentials file
            (default './api_transilien_login.json').
        'api_polling_period_in_seconds': polling period (default 2.).
        'clear_outputs_period': minimum delay in seconds between two output
            clearings performed by ``clearOutputs`` (default 4).
    """

    # -------------------------------------------------------------------------------
    def __init__(self, config):
    # -------------------------------------------------------------------------------
        """Store the configuration and set up the SNCF API client.

        The KafkaProducer itself is only created in ``on_init``.
        """
        Task.__init__(self, "KafkaProducerTask")
        NotebookCellContent.__init__(self, "KafkaProducerTask")
        self.config = config
        # setup logging
        self.last_clear_outputs_ts = time.time()
        self.set_logging_level(logging.DEBUG)
        self.debug("TSP:initializing...")
        # setup data polling & streaming
        self.producer = None
        self.__setup_api()
        self.debug("TSP:`-> done!")

    # -------------------------------------------------------------------------------
    def __setup_api(self):
    # -------------------------------------------------------------------------------
        """Create the TransilienApi client: JSON converter, every known station managed."""
        # setup the SNCF API
        credentials_file = self.config.get('credentials', './api_transilien_login.json')
        self.api = TransilienApi(credentials_file, log_output=self.output)
        self.api.set_converter(JsonConverter())
        self.api.load_stations_code("./transilien_line_l_stations_by_code.json")
        self.api.set_managed_stations('*')

    # -------------------------------------------------------------------------------
    def on_init(self):
    # -------------------------------------------------------------------------------
        """Create the KafkaProducer, run a first polling and enable the periodic one.

        Presumably invoked by Task when the task starts - confirm against the
        'task' module.
        """
        # instantiate the KafkaProducer
        self.debug("KafkaProducerTask: intializing KafkaProducer instance...")
        self.producer = KafkaProducer(
                            client_id='transilien-producer-01',
                            bootstrap_servers=self.config.get('bootstrap_servers', ['sandbox-hdp.hortonworks.com:6667']),
                            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                            acks=0,
                            api_version=(0, 10, 1)
                        )
        self.debug("`-> done!")
        # force call to handle_periodic_message (force data update)
        self.handle_periodic_message()
        # then setup ourself to poll data from the SNCF API at a given period (in seconds)
        p = self.config.get('api_polling_period_in_seconds', 2.)
        self.enable_periodic_message(p)

    # -------------------------------------------------------------------------------
    def on_exit(self):
    # -------------------------------------------------------------------------------
        """Close the KafkaProducer, if one has been created."""
        self.debug("KafkaProducerTask: closing the KafkaProducer instance...")
        # the producer is still None when on_init did not run or failed early
        if self.producer is not None:
            self.producer.close()
        self.debug("`-> done!")

    # -------------------------------------------------------------------------------
    def clearOutputs(self):
    # -------------------------------------------------------------------------------
        """Clear the notebook cell output, at most once per 'clear_outputs_period' seconds."""
        # clear outputs (i.e. clear our 'mother notebook-cell')
        clear_outputs_period = self.config.get('clear_outputs_period', 4)
        if (time.time() - self.last_clear_outputs_ts) > clear_outputs_period:
            self.clear_output()
            self.last_clear_outputs_ts = time.time()

    # -------------------------------------------------------------------------------
    def handle_periodic_message(self):
    # -------------------------------------------------------------------------------
        """Poll the SNCF API once and send every departure to the Kafka topic.

        Failures are reported through ``self.error`` and never propagated, so
        that one failing cycle does not prevent the next one.
        """
        # asynchronous periodic job...
        try:
            # clear cell content (avoid accumulating too much log into the notebook cell)
            self.clear_output()
            # do the job...
            self.debug("KafkaProducerTask: polling data from the SNCF API...")
            t = time.time()
            departures = self.api.poll_next_station_data()
            self.debug(f"`-> obtained {len(departures)} train entries in {round(time.time() - t, 2)} s")
            topic = self.config['topic']
            self.debug(f"KafkaProducerTask: injecting data into the Kafka topic '{topic}'")
            t = time.time()
            for departure in departures:
                try:
                    self.producer.send(topic, departure)
                except Exception as e:
                    # a failing message must not prevent the next ones from being
                    # sent; report it like any other error of this task
                    self.error(e)
            self.debug(f"`-> took {round(time.time() - t, 2)} s")
        except Exception as e:
            self.error(e)

In [None]:
# Settings handed to KafkaProducerTask: the Kafka brokers, the destination
# topic and the SNCF API polling period (in seconds).
producer_task_config = dict(
    bootstrap_servers=['sandbox-hdp.hortonworks.com:6667'],
    topic='transilien-02',
    api_polling_period_in_seconds=2.,
)

In [None]:
# Build the task from the configuration dictionary; the constructor also sets
# up the SNCF API client (credentials and stations files are read here).
producer_task = KafkaProducerTask(producer_task_config)

In [None]:
# Start the task (start_asynchronously comes from the 'task' module; presumably
# it returns immediately and runs the task in its own thread - confirm).
producer_task.start_asynchronously()

In [None]:
# Stop the task (exit comes from the 'task' module; presumably it triggers
# on_exit, which closes the KafkaProducer - confirm).
producer_task.exit()