# 1. Import Libraries and Data
First, all necessary libraries are imported along with the dataset.

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import time
import datetime
import sklearn
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.neural_network
import sklearn.ensemble
import tensorflow as tf

# Check SUMO_HOME is set
import os, sys
if 'SUMO_HOME' in os.environ:
    tools = os.path.join(os.environ['SUMO_HOME'], 'tools')
    sys.path.append(tools)
else:
    sys.exit("Please declare environment variable 'SUMO_HOME'")
    
import traci
import sumolib
import altair as alt
import folium
import csv

In [None]:
# Column name -> dtype string handed to pandas for each column read from the
# DfT count CSV. Identifier-like and date columns are read as strings.
col_types = {
    'count_point_id': 'string',
    'direction_of_travel': 'string',
    'count_date': 'string',
    'hour': 'string',
    'road_name': 'string',
    'road_type': 'string',
    'latitude': 'float',
    'longitude': 'float',
    'link_length_km': 'float',
    'pedal_cycles': 'int',
    'two_wheeled_motor_vehicles': 'int',
    'cars_and_taxis': 'int',
    'buses_and_coaches': 'int',
    'lgvs': 'int',
    'all_hgvs': 'int',
    'all_motor_vehicles': 'int' 
}

# Only the columns named in col_types are loaded from the file.
cols = list(col_types.keys())

dft_counts = pd.read_csv('dft_count_swansea.csv', sep=',', header=0,
                         index_col=None, dtype=col_types, usecols=cols, na_values='')
# count_date was read as a string; parse it as a YYYY-MM-DD date.
dft_counts['count_date'] = pd.to_datetime(dft_counts['count_date'], format= '%Y-%m-%d')
# Bare expression: shows the loaded frame as the notebook cell output.
dft_counts

# 2. Cleaning the Data
In this section:
- Additional features are imputed into the dataset.
- Date values are split into day, month and year. 
- The hourly count data is unstacked and reshaped.
- Categorical features are encoded. 

In [None]:
def find_edge_speed(coordinates, net, radius=150):
    '''
    Return the speed of the network edge closest to a coordinate.

    coordinates: (latitude, longitude) pair.
    net: network object exposing convertLonLat2XY and getNeighboringEdges
         (e.g. the result of sumolib.net.readNet).
    radius: search radius passed to net.getNeighboringEdges.

    Returns the `_speed` attribute of the nearest edge. If several edges are
    equally close, the one listed first by getNeighboringEdges is used.
    Raises ValueError if no edge is found within the radius.
    '''
    # coordinates must be latitude then longitude; the network wants lon, lat
    x, y = net.convertLonLat2XY(coordinates[1], coordinates[0])
    edges = net.getNeighboringEdges(x, y, radius)  # (edge, distance) pairs

    if len(edges) == 0:
        raise ValueError(f'There were no edges found within the {radius} radius of {coordinates}')

    # Only the distance is compared, so edges themselves never need to be
    # orderable and equidistant edges cannot raise.
    closest_edge, _ = min(edges, key=lambda pair: pair[1])
    return closest_edge._speed


def get_cp_speeds(dft_counts, net):
    '''
    Build a table with one row per count point and the speed of its nearest edge.

    dft_counts: count records; the first row of each count_point_id group
                supplies the count point's descriptive columns.
    net: network object passed through to find_edge_speed.

    Returns a DataFrame with the count point columns plus 'avg_speed', which
    holds the value returned by find_edge_speed with no unit conversion.
    '''
    start = time.time()

    # extract unique count points
    cols_to_keep = ['count_point_id', 'road_name', 'road_type',
                    'latitude', 'longitude', 'link_length_km']
    # copy so the column assignment below acts on an independent frame
    count_points = (dft_counts.groupby('count_point_id').first()
                    .reset_index()[cols_to_keep].copy())

    # look up the nearest edge's speed for every count point
    count_points['avg_speed'] = [
        find_edge_speed((lat, long), net)
        for lat, long in zip(count_points['latitude'], count_points['longitude'])
    ]

    print(f'Got count point speeds in {np.round(time.time()-start,2)} seconds')
    return count_points

net = sumolib.net.readNet('osm.net.xml') # read net file
count_points = get_cp_speeds(dft_counts, net)

In [None]:
def impute_speeds(dft_counts, count_points):
    '''
    Add each count point's avg_speed to the count records.

    Rows are inner-joined on count_point_id, so records whose count point is
    missing from count_points are dropped. Prints the elapsed time.
    '''
    t0 = time.time()
    speed_lookup = count_points[['count_point_id','avg_speed']]
    merged = dft_counts.merge(speed_lookup, on='count_point_id', how='inner')
    print(f'Imputed speeds in {np.round(time.time()-t0,2)} seconds')
    return merged

# Replace dft_counts with the version that carries an avg_speed column.
dft_counts = impute_speeds(dft_counts, count_points)

In [None]:
def clean_cols(df):
    '''
    Reshape hourly counts from long to wide form.

    Returns one row per (count_point_id, direction_of_travel, count_date)
    with the all_motor_vehicles count of each hour in its own column,
    named '7' through '18'.
    '''
    id_cols = ['count_point_id', 'direction_of_travel', 'count_date']

    # pivot the 'hour' level of the index into columns
    wide = (df.set_index(id_cols + ['hour'])[['all_motor_vehicles']]
              .unstack(level=-1)
              .reset_index())

    # The columns are now two-level: the id columns keep their top-level
    # name, the count columns take the hour label from the second level.
    wide.columns = [
        top if position <= 2 else hour
        for position, (top, hour) in enumerate(wide.columns.to_flat_index())
    ]

    hour_cols = ['7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']
    return wide[id_cols + hour_cols]

# Wide-form table: one row per count point / direction / date.
dft_counts_clean = clean_cols(dft_counts)

In [None]:
def get_cp_date_ranges(df):
    '''
    Return the earliest and latest count_date of every count point.

    The result has columns 'cp_id', 'min_date' and 'max_date', with one row
    per count point in order of first appearance in df.
    '''
    cps = list(pd.unique(df['count_point_id']))
    dates_by_cp = [df.loc[df['count_point_id'] == cp_id]['count_date'] for cp_id in cps]
    return pd.DataFrame(
        {
            'cp_id': cps,
            'min_date': [cp_dates.min() for cp_dates in dates_by_cp],
            'max_date': [cp_dates.max() for cp_dates in dates_by_cp]
        })


def get_cp_dates(df):
    '''
    Return every (count point, count date) pair in long form.

    The result has columns 'cp_id' and 'dates'. Rows are grouped by count
    point in order of first appearance in df; within a count point they keep
    the row order of df.
    '''
    cps_longform = []
    dates = []
    for cp_id in pd.unique(df['count_point_id']):
        # filter once per count point and reuse the result for both columns
        cp_rows = df.loc[df['count_point_id'] == cp_id]
        cps_longform.extend(cp_rows['count_point_id'])
        dates.extend(cp_rows['count_date'])
    return pd.DataFrame(
        {
            'cp_id': cps_longform,
            'dates': dates
        })

# Per-count-point date range (one row each) and every individual count date.
cp_date_ranges = get_cp_date_ranges(dft_counts_clean)
cp_dates = get_cp_dates(dft_counts_clean)

# Visualise
# Interval selection bound to the y encoding; attached to the points chart below.
selector = alt.selection_interval(encodings=['y'])

# Bar per count point spanning its first to last count date.
dates_range = alt.Chart(cp_date_ranges).mark_bar().encode(
    x=alt.X('cp_id:O', title='Count Point ID'),
    y=alt.Y('min_date', title='Minimum - Maximum Date'),
    y2=alt.Y2('max_date', title=None),
    tooltip=[ 'cp_id','min_date', 'max_date']
).properties(
    title = 'Count Point Active Dates',
    width=600,
    height=400
)

# Orange circle for every date on which a count point has a record.
dates = alt.Chart(cp_dates).mark_circle(color='orange').encode(
    x=alt.X('cp_id:O'),
    y=alt.Y('dates'),
    tooltip=[ 'cp_id','dates']
).properties(
    width=600,
    height=400
).add_selection(
    selector
)

# Layer the two charts; the bare expression renders them as the cell output.
dates_range + dates

# 3. Predictive Models
## 3.1 Random Forest
In this section, we will use an Sklearn Random Forest (RF) Regressor to predict the 18th hour traffic counts for each unique count point and direction of travel. This is similar to the approach in [this paper](https://ieeexplore.ieee.org/abstract/document/9230762?casa_token=LawxRQvGmnwAAAAA:rNcIU4KH2tkbI9x6f4ZIQfzioCoE7dgoAQCbcpUnCpYLl8h3p_md8eYN7FtIMtQe8Zbiz9brh3U); however, they used a CNN-LSTM. By using an RF, this method does not really capture the temporal aspect of the dataset, i.e. the year the counting took place. In further sections we will explore more sophisticated and up-to-date predictive methods; ones that can be used to predict the next year's traffic counts based on the previous 5 years. However, this will require various data manipulations.

In [None]:
# group counts by cp id, year, and direction of travel and compute the mean for each group
# numeric_only=True keeps the datetime 'count_date' column out of the result; otherwise
# pandas >= 2.0 averages it too and reset_index() collides with the year key of the same name
grouped_counts = (
    dft_counts_clean
    .groupby(['count_point_id', dft_counts_clean['count_date'].dt.year, 'direction_of_travel'])
    .mean(numeric_only=True)
    .reset_index()
)
# add speed and road type to the grouped data according to cp id
grouped_counts = pd.merge(grouped_counts,
                          count_points[['count_point_id','avg_speed','road_type']], # avg speed may not be useful
                          on ='count_point_id', 
                          how ='inner')
# one hot encode direction of travel and road type
counts_for_model = pd.get_dummies(grouped_counts, prefix=['direction_of_travel', 'road_type'], columns=['direction_of_travel', 'road_type'])
counts_for_model

The following function can be used to count how many years each unique count point is active, and record which years.

```python
def how_many_years(df):
    '''
    Map each count point to the distinct count_date values it has.

    Returns a dict keyed by count_point_id whose values are
    (number_of_distinct_values, list_of_distinct_values).
    '''
    active_years = {}
    for cp in pd.unique(df['count_point_id']):
        cp_rows = df.loc[df['count_point_id']==cp]
        years = list(pd.unique(cp_rows['count_date']))
        active_years[cp] = (len(years), years)
    return active_years

how_many_years(grouped_counts)
```

In [None]:
# fix numpy seed
SEED = 202
np.random.seed(SEED) 

# split data into train and test sets
# target: the hour-18 count column; features: every other column of counts_for_model
x = counts_for_model.drop('18', axis=1)
y = counts_for_model['18']
# 80/20 split; the split uses its own fixed random_state (22), not SEED
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.2, random_state=22)

```python
# models whose hyperparameters are tuned
rf = sklearn.ensemble.RandomForestRegressor()

# candidate values for each hyperparameter, one dict per model
rf_params = {
    'n_estimators': [100, 200, 500, 1000],
    'criterion': ['mse','mae'],
    'max_depth': [None, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_list = [rf]
model_names = ['RF']
model_params = [rf_params]

def optimise_hyperparameters(model_list, model_params, x_train=x_train, y_train=y_train):
    '''
    Grid-search every model over its parameter grid with 5-fold CV.

    model_list and model_params are parallel lists. For each model the best
    estimator and its score are printed. Returns the fitted GridSearchCV
    objects in the same order as model_list.
    '''
    fitted_searches = []
    for position, model in enumerate(model_list):
        started = time.time()
        params = model_params[position]
        search = sklearn.model_selection.GridSearchCV(model, params, n_jobs=5, cv=5)
        search.fit(x_train, y_train)
        fitted_searches.append(search)
        print(f'Found optimal hyperparameter combination for {model} in {np.round(time.time()-started,4)} seconds')
        print(search.best_estimator_)
        print(search.best_score_)
    return fitted_searches

grid_search_list = optimise_hyperparameters(model_list, model_params)

Found optimal hyperparameter combination for RandomForestRegressor() in 308.3771 seconds
RandomForestRegressor(max_depth=10, n_estimators=200)
0.9809518061349201
```

In [None]:
# model definitions with their chosen hyperparameters
rf = sklearn.ensemble.RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=10, 
                                            max_features='sqrt', random_state=22)

# parallel lists handed to train_models
model_list = [rf]
model_names = ['RF']

def train_models(model_list, model_names, 
                 x_train=x_train, x_test=x_test,
                 y_train=y_train, y_test=y_test):
    '''
    Fit every model and tabulate its accuracy measures.

    model_list and model_names are parallel lists. Each model is fitted on
    the training data, then scored with accuracy() on the training set and
    on the test set, so the returned DataFrame has two rows per model
    (train first, then test). The fit time is printed and stored in both rows.
    '''
    rows = []
    for position, model in enumerate(model_list):
        model_name = model_names[position]
        fit_started = time.time()
        model.fit(x_train, y_train)
        lap = time.time() - fit_started
        print(f'Trained {model_name} in {np.round(lap,4)} seconds')
        # train measures first, test measures second
        for features, target in ((x_train, y_train), (x_test, y_test)):
            rows.append(accuracy(model, model_name, features, target, lap))
    return pd.DataFrame(rows)
        
        
def accuracy(model, model_name, x, y, train_time):
    '''
    Score a fitted model on (x, y) and return the measures as a dict.

    Keys: 'Model', 'Train Time', 'MSE', 'RMSE', 'MAE', 'MAPE', 'R2'.
    The 'MAPE' entry is 100 minus the mean absolute percentage error, and it
    divides by y, so a zero in y produces a non-finite value.
    'R2' is whatever model.score(x, y) returns.
    '''
    predictions = list(map(np.float64, model.predict(x)))
    errors = y - predictions
    mse = np.mean(errors**2)
    return {
        'Model': model_name,
        'Train Time': train_time,
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': np.mean(abs(errors)),
        'MAPE': 100 - np.mean(100*(abs(errors)/y)),
        'R2': model.score(x, y)}

# Fit the models and show the train/test measures table as the cell output.
measures = train_models(model_list, model_names)
measures

The measures DataFrame returns an infinity because `y_test.iloc[189] == 0`. When calculating the MAPE you divide by the actual value, which in this case is zero; thus, inf is returned. A great explanation of the shortcomings of MAPE can be found [here](https://stats.stackexchange.com/a/299713).

Also, **please be cautious of which accuracy measures you are reporting!** If you were to report the prediction accuracy of the model when predicting the traffic counts of the entire dataset, these accuracy values would be abnormally high because the model was trained on, and has therefore already seen, 80% of the data.

The following function can be used to un-dummify data passed to the model.\
However, since the separation of grouped_counts and counts_for_model, this has become obsolete.

```python
def undummify(df, prefix_sep="_"):
    '''
    Function collapses a "dummified" dataframe while keeping the order of columns.
    Adapted from: https://stackoverflow.com/a/62085741/13937247

    df: DataFrame of dummy columns named <prefix><prefix_sep><value>.
    prefix_sep: separator between prefix and value. The prefix is everything
                before the last separator and must itself contain at least
                one separator (e.g. 'road_type_Major' -> 'road_type');
                other columns are ignored.

    Returns a DataFrame with one column per prefix holding, for each row, the
    value whose dummy column is largest.
    '''
    # distinct prefixes in order of first appearance
    prefixes = []
    for column in df.columns:
        parts = column.split(prefix_sep)[:-1]
        if len(parts) > 1:
            prefix = prefix_sep.join(parts)
            if prefix not in prefixes:
                prefixes.append(prefix)

    series_list = [
        df.filter(like=prefix)                       # dummy columns of this prefix
          .idxmax(axis=1)                            # name of the winning column per row
          .apply(lambda name: name.split(prefix_sep)[-1])
          .rename(prefix)
        for prefix in prefixes
    ]
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

# extract and convert dummied columns
dummied_cols = ['direction_of_travel_E','direction_of_travel_N',
                'direction_of_travel_S','direction_of_travel_W',
                'road_type_Major','road_type_Minor']
dummied_df = counts_for_model[dummied_cols]
undummied_df = undummify(dummied_df)

# replace dummied columns with undummied
# NOTE(review): the dummy columns are created on counts_for_model, so this drop
# presumably only worked before grouped_counts was kept separate — confirm.
grouped_counts.drop(columns=dummied_cols, inplace=True)
grouped_counts = grouped_counts.join(undummied_df)
grouped_counts
```

In [None]:
# predict counts for each count point/direction of travel, for each year they were active. 
# x is the full feature table, so these predictions cover training rows as well as test rows.
y_pred = rf.predict(x)
results = grouped_counts.copy()
results['18_pred'] = np.round(y_pred).astype(int) # round to nearest int, and convert from float -> int
# drop the observed hour-18 count so only the prediction remains
results.drop(columns='18', inplace=True)

def generate_data_for_sumo(grouped_df):
    '''
    Aggregate predictions into the per-detector table used to build SUMO demand.

    Averages avg_speed and 18_pred over all rows of each
    (count_point_id, direction_of_travel) pair, then rounds the averaged
    18_pred to an integer.
    '''
    group_keys = ['count_point_id', 'direction_of_travel']
    averaged = (grouped_df[group_keys + ['avg_speed', '18_pred']]
                .groupby(group_keys)
                .mean()
                .reset_index())
    averaged['18_pred'] = np.round(averaged['18_pred']).astype(int)
    return averaged

# One row per count point and direction with averaged speed and predicted count.
sumo_counts = generate_data_for_sumo(results)
sumo_counts

## 3.2 CNN
In this section we use a CNN to predict the 18th hour traffic counts after being exposed to the *m* \* *n* matrix of data as used above; where *m* is the directional count points and *n* is the hourly counts.

In [None]:
# tensorflow model here ...

# 4. Traffic Generation
In this section we generate traffic demand using SUMO's [routes from observation points](https://sumo.dlr.de/docs/Demand/Routes_from_Observation_Points.html) tool.

## 4.1 Flowrouter.py
SUMO's [flowrouter.py](https://sumo.dlr.de/docs/Tools/Detector.html#flowrouterpy) serves as a drop in replacement for dfrouter which calculates a set of routes **-o** and traffic flows **-e** from given detectors **-d** and their measurements **-f** on a given network (option **-n**). Flowrouter works by solving a maximum flow problem in the given network assuming the measured flows as capacity. The input data is by default aggregated over the whole file but can be split into intervals by setting **-i**. Example call for hourly aggregation:
`<SUMO_HOME>/tools/detector/flowrouter.py -n input_net.net.xml -d detectors.xml -f flows20140520.csv -o routes.xml -e flows.xml -i 60`
Detectors which have no data (in the specified interval) or are permanently zero are ignored by default. To include them into the calculations use **--respect-zero**.

The flowrouter requires the following input files:
The traffic flow file is described in the corresponding [dfrouter documentation](https://sumo.dlr.de/docs/Demand/Routes_from_Observation_Points.html#computing_flows).

As detector file input you can either use:
- The detector file as described in the [dfrouter documentation](https://sumo.dlr.de/docs/Demand/Routes_from_Observation_Points.html#computing_detector_types)
- A detector file with types as [generated by dfrouter](https://sumo.dlr.de/docs/Demand/Routes_from_Observation_Points.html#computing_detector_types)
- A file with [induction loop definitions](https://sumo.dlr.de/docs/Simulation/Output/Induction_Loops_Detectors_%28E1%29.html)

When loading a detector file without `type` information or setting the option **--revalidate-detectors**, all network edges will be re-classified as sources, sinks or in-between.
- any edge without incoming edges will be marked as a source
- any edge without outgoing edges will be marked as a sink
- any edge that is neither source or sink is in-between

In [None]:
def find_closest_edges(coordinates, net, radius=150, edges_to_return=2):
    '''
    Function to find closest n edges to specified coordinates.

    coordinates: (latitude, longitude) pair.
    net: network object exposing convertLonLat2XY and getNeighboringEdges.
    radius: search radius passed to net.getNeighboringEdges.
    edges_to_return: maximum number of edges to return.

    Returns a list of edges ordered nearest first. Equidistant edges keep the
    order in which getNeighboringEdges listed them.
    Raises ValueError if no edge is found within the radius.
    '''
    # coordinates must be tuple of latitude then longitude; the network wants lon, lat
    x, y = net.convertLonLat2XY(coordinates[1], coordinates[0])
    edges = net.getNeighboringEdges(x, y, radius)  # (edge, distance) pairs

    if len(edges) == 0:
        raise ValueError(f'There were no edges found within the {radius} radius of {coordinates}')

    # sorted() is stable and compares distances only, so the edge objects are
    # never coerced into an array or compared with each other
    nearest_first = sorted(edges, key=lambda pair: pair[1])
    return [edge for edge, _ in nearest_first[:edges_to_return]]

# Store the two nearest edges of every count point as a list in a new column.
count_points['closest_edges'] = [find_closest_edges((lat, long), net, edges_to_return=2)
                                 for lat, long in zip(count_points['latitude'], count_points['longitude'])]

In [None]:
def get_edge_direction(edge):
    '''
    Function to determine which direction a specific edge is going.

    The direction is taken from the straight line between the edge's start
    and end node coordinates (only x and y are used).

    Returns 'E' or 'W' when the x displacement dominates, otherwise 'N' or
    'S'. An exactly diagonal edge (|dx| == |dy|) is classed by its y
    displacement instead of being left undefined.
    '''
    # slice so both (x, y) and (x, y, z) coordinate tuples are accepted
    from_x, from_y = edge._from._coord[:2]
    to_x, to_y = edge._to._coord[:2]
    dx = to_x - from_x
    dy = to_y - from_y
    if abs(dx) > abs(dy): # then edge direction is east or west
        return 'E' if to_x > from_x else 'W'
    # otherwise edge direction is north or south (ties included)
    return 'N' if to_y > from_y else 'S'
    

def match_edges_directions(sumo_counts, count_points):
    '''
    Function to match edges with count point directions.

    For every row of sumo_counts, the edge stored in that count point's
    'closest_edges' whose direction (see get_edge_direction) equals the row's
    direction_of_travel is written to a new 'edge' column. If two candidate
    edges share a direction, the first one listed is used.

    In some cases, count points have more than two directions when they are
    located near junctions/roundabouts; here, the edge is set to NaN, and is
    dropped for simplicity.
    Note: only two edges are retrieved in the above cell

    sumo_counts is modified in place (rows containing NaN are dropped and the
    index is reset) and also returned.
    '''
    start = time.time()

    # count point id -> {direction: edge}. Built once so the result does not
    # depend on the two frames listing count points in the same order.
    edges_by_cp = {}
    for cp_id, edges in zip(count_points['count_point_id'], count_points['closest_edges']):
        if cp_id in edges_by_cp:
            continue  # keep the first entry of a repeated id
        by_direction = {}
        for edge in edges:
            by_direction.setdefault(get_edge_direction(edge), edge)
        edges_by_cp[cp_id] = by_direction

    sumo_counts['edge'] = [
        edges_by_cp.get(cp_id, {}).get(direction, np.nan)
        for cp_id, direction in zip(sumo_counts['count_point_id'],
                                    sumo_counts['direction_of_travel'])
    ]
    sumo_counts.dropna(axis=0, inplace=True)
    sumo_counts.reset_index(drop=True, inplace=True)

    print(f'Matched edges and directions in {np.round(time.time()-start, 2)} seconds')
    return sumo_counts


sumo_counts = match_edges_directions(sumo_counts, count_points)

In [None]:
def impute_detector_type(df, cp_df, how='polygon'):
    '''
    Function to determine whether a count point will be classed as a source, sink or inbetween.

    df: per-detector table with a count_point_id column.
    cp_df: count point table; gains a 'detector_type' column (modified in place).
    how: 'polygon' or 'random'.
      'polygon' -> count points outside the polygon read from 'odpolygon.json'
                   are randomly made a source or a sink; count points inside
                   it are 'between'.
      'random'  -> every count point is randomly assigned source, sink, or between.

    Returns (df inner-joined with the detector types, cp_df).
    Raises ValueError for any other value of how.
    '''
    if how == 'polygon':
        polygon_df = gpd.read_file('odpolygon.json')
        polygon = polygon_df.geometry.unary_union # return a geometry containing the union of all geometries in the GeoSeries.
        # transform longitude and latitude of the supplied count points into shapely.Point objects
        points_df = gpd.GeoDataFrame(cp_df, geometry=gpd.points_from_xy(cp_df.longitude, cp_df.latitude))
        in_polygon = points_df.geometry.within(polygon).to_list()
        detector_types = [
            'between' if inside else np.random.choice(['source','sink'])
            for inside in in_polygon
        ]
    elif how == 'random':
        detector_types = [
            np.random.choice(['source','sink', 'between'])
            for _ in range(cp_df.shape[0])
        ]
    else:
        raise ValueError(f"Unknown how={how!r}; expected 'polygon' or 'random'")

    cp_df['detector_type'] = detector_types
    merged_df = pd.merge(df, cp_df[['count_point_id','detector_type']], on='count_point_id', how='inner')
    return merged_df, cp_df

# Assign detector types at random; count_points also gains the detector_type column.
flowrouter_data, count_points = impute_detector_type(sumo_counts, count_points, how='random')
flowrouter_data

In [None]:
# Map of the polygon and every count point, coloured by detector type.
polygon = gpd.read_file('odpolygon.json')
swansea_coords = [51.6195955, -3.9459248]
m = folium.Map(location=swansea_coords, zoom_start=11)
folium.GeoJson(polygon, name='poly').add_to(m)
for index, row in count_points.iterrows():
    # black = between, green = source, red = sink
    # NOTE(review): any other detector_type leaves `colour` unset or stale from the previous row.
    if row.detector_type == 'between':
        colour = 'black'
    elif row.detector_type == 'source':
        colour = 'green'
    elif row.detector_type == 'sink':
        colour = 'red'
    folium.Marker((row.latitude, row.longitude), popup = row.count_point_id, icon=folium.Icon(color=colour)).add_to(m)
# Bare expression: renders the map as the cell output.
m

In [None]:
def generate_detectors_file(df):
    '''
    Write "detectors.xml" with one <detectorDefinition> per row of df.

    Each detector is named <count_point_id>_<direction_of_travel> and placed
    halfway along the first lane of the row's edge, with the row's
    detector_type as its type.
    '''
    with open("detectors.xml", 'w') as outf:
        outf.write("<detectors>\n")
        for _, row in df.iterrows():
            detector_id = row.count_point_id+'_'+row.direction_of_travel
            lane = row.edge._lanes[0]
            line = f"    <detectorDefinition id='{detector_id}' lane='{lane.getID()}' pos='{lane._length/2}' type='{row.detector_type}'/>\n"
            # attributes are written with single quotes above and switched to double quotes here
            outf.write(line.replace("'",'"'))
        outf.write("</detectors>")
        

def generate_detector_flows_file(df, speed_factor=3.6):
    '''
    Function to generate the SUMO detector flow file from the sumo_count df.

    Writes "detector_flows.csv" with the header Detector;Time;qPKW;vPKW and
    one line per row of df, all at time 0.

    df: needs count_point_id, direction_of_travel, 18_pred and avg_speed.
    speed_factor: multiplier applied to avg_speed before it is written.
                  Speeds need to be in kph in this file; the default 3.6
                  converts metres per second to kph. Pass 1.0 if avg_speed
                  is already in kph.
    '''
    begin_time = 0
    with open("detector_flows.csv", 'w') as outf:
        outf.write("Detector;Time;qPKW;vPKW\n")
        for _, row in df.iterrows():
            detector_id = row.count_point_id+'_'+row.direction_of_travel
            speed_kph = np.round(row.avg_speed * speed_factor, 4)
            outf.write(f"{detector_id};{begin_time};{row['18_pred']};{speed_kph}\n")


# Write detectors.xml and detector_flows.csv into the current working directory.
generate_detectors_file(flowrouter_data)
generate_detector_flows_file(flowrouter_data)

Apparently you can run CMD commands [straight from a Jupyter Notebook cell](https://anaconda.zendesk.com/hc/en-us/articles/360023858254-Executing-Terminal-Commands-in-Jupyter-Notebooks). The following commands use SUMO's flowrouter.py script to generate traffic from the detector files we have just made.

```
!cd C:\Users\ollir\OneDrive\Documents\University\Data Science MSc\Dissertation\To-Fly-or-Not-to-Fly\simulation
!flowrouter.py -n osm.net.xml -d detectors.xml -f detector_flows.csv -o traffic_routes.xml -e traffic_flows.xml -v
```

Output from using 'polygon' to classify sources and sinks.
```
Reading net
35388 edges read
Reading detectors
Loaded 33 sources and 39 sinks from detector file. Added 0 sources and 0 sinks from the network
Reading flows
Calculating routes
33 sources, 0 unlimited
39 sinks, 0 unlimited
Writing 4886 vehicles from 25 sources between time 0 and 60 (minutes)
  unused sources: -24540305#0 -60187252#5 -60861135#10 25384719#1 25744946#0 60861135#6 65331896#2 7817485#0
```

Output from using 'random' to classify sources and sinks.
```
Reading net
35388 edges read
Reading detectors
Loaded 63 sources and 68 sinks from detector file. Added 0 sources and 0 sinks from the network
Reading flows
Calculating routes
63 sources, 0 unlimited
68 sinks, 0 unlimited
Writing 11524 vehicles from 39 sources between time 0 and 60 (minutes)
  unused sources: -137918290#1 -137918296#2 -139239682#3 -15522127#2 -22564929 -22963191 -23494024#2 -40301262#0 -50249435 -69906441#0 -939464283 -96897376#4 10160480#0 10754466#1 10754555#0 13449572#0 13451018#0 137918296#1 15522127#2 25384719#1 26584579#0 40301262#0 50249435 96897366
```

## 4.2 RandomTrips.py & Calibrators
Another solution to translating traffic counts to traffic demand in SUMO is by using calibrators. First, a set of random routes needs to be generated; then calibrators are used to adjust the vehicle flow at the counter positions, so that the overall number is approximately equal to the real readings.

**Note: When using --validate, trip files are generated which results in the simulation running very slowly.** Just use route files. 

```
!cd C:\Users\ollir\OneDrive\Documents\University\Data Science MSc\Dissertation\To-Fly-or-Not-to-Fly\simulation
!randomTrips.py -n osm.net.xml -o traffic_trips.xml --route-file traffic_routes.xml -e 3600 -p 1
```

In [None]:
def generate_calibrator_file(df):
    '''
    Write "calibrator.xml", the SUMO additional file built from the sumo_count df.

    For every row of df the file holds a <routeProbe> on the row's edge and a
    <calibrator> that refers to it, with a single-edge fallback route and a
    one-hour flow of 18_pred vehicles per hour at avg_speed. An edge-based
    <edgeData> dump is appended at the end.
    '''
    FREQ = 1 # the aggregation interval in which to calibrate the flows. default is step-length

    def double_quoted(text):
        # the templates below use single quotes; the file gets double quotes
        return text.replace("'",'"')

    with open("calibrator.xml", 'w') as outf:
        outf.write("<additional>\n")

        # route probes first, one per detector
        for _, row in df.iterrows():
            probe_id = row.count_point_id+'_'+row.direction_of_travel+'_probe'
            outf.write(double_quoted(
                f"    <routeProbe id='{probe_id}' edge='{row.edge._id}' freq='{FREQ}' file='route_probe_output.xml'/>\n"))

        # then the calibrators, each with its fallback route and flow
        for _, row in df.iterrows():
            detector_id = row.count_point_id+'_'+row.direction_of_travel
            pos = row.edge._lanes[0]._length/2
            probe_id = row.count_point_id+'_'+row.direction_of_travel+'_probe'
            outf.write(double_quoted(
                f"    <calibrator id='{detector_id}' edge='{row.edge._id}' pos='{pos}' freq='{FREQ}' routeProbe='{probe_id}' output='calibrator_output.xml'>\n"))
            fallback_id = 'fallback_'+row.count_point_id+'_'+row.direction_of_travel
            outf.write(double_quoted(
                f"        <route id='{fallback_id}' edges='{row.edge._id}'/>\n"))
            outf.write(double_quoted(
                f"        <flow  begin='0' end='3600' route='{fallback_id}' vehsPerHour='{row['18_pred']}' speed='{row.avg_speed}'/>\n"))
            outf.write("    </calibrator>\n")

        # generate edge-based dump
        outf.write(double_quoted("    <edgeData id='measure_1' file='edge_output.xml'/>\n"))

        outf.write("</additional>")

'''
note:  position of calibrator is actually ignored at the moment. https://github.com/eclipse/sumo/issues/1331.
'freq' attribute of the calibrator may need to be increased as there is a trade of between small and large time
intervals in which to calibrate the flow.
"While this can be done with dfrouter as well, the method described here is more robust for highly meshed networks as found in cities"
"For the calibrator to be able to function before the first vehicle, it needs a fall back route which just needs to consist of a single edge (i.e. the edge on which the calibrator is placed)."
"However, the realism of traffic flow behind (or between) calibrators depends on the fit between random routes and real-world routes."
'''

# Write calibrator.xml from the matched count point edges.
generate_calibrator_file(sumo_counts)

# 5. Extracting Simulation Data
In this section, data from the simulation will be extracted and visualised. First, the simulation must be run; then TraCI and sumolib can be used to interact with live and summary data.

In [None]:
'''Change config file to include <seed value="202"/> such as to fix the stochasticity for reproducable results'''

In [None]:
# Run the SUMO simulation with the osm.sumocfg configuration.
# NOTE(review): each `!` line presumably runs in its own subshell, so this `!cd` may not
# change the directory seen by `!sumo` below (`%cd` would persist) — confirm.
# The path also contains unquoted spaces.
!cd C:\Users\ollir\OneDrive\Documents\University\Data Science MSc\Dissertation\To-Fly-or-Not-to-Fly\simulation
!sumo -c osm.sumocfg

In [38]:
def xml_2_csv(files):
    '''
    Function to convert a list of XML files to CSV.

    files: iterable of (xml_path, csv_path) pairs. Each pair is passed to
    the xml2csv.py script through an IPython shell escape, so this only
    runs inside IPython/Jupyter.
    '''
    for file, out_name in files:
        # {file} and {out_name} are substituted by IPython before the command runs
        !xml2csv.py {file} --output {out_name}
    
# (input XML, output CSV) pairs for the simulation outputs
files = [('tripinfo_output.xml','tripinfo_output.csv'),
         ('statistic_output.xml','statistic_output.csv'),
         ('edge_output.xml','edge_output.csv')]
xml_2_csv(files)

In [39]:
# Load the converted simulation outputs; the files are read as semicolon-separated.
trip_output = pd.read_csv('tripinfo_output.csv', sep=';', header=0, index_col=None) #, dtype=col_types, usecols=cols, na_values='')
stats_output = pd.read_csv('statistic_output.csv', sep=';', header=0, index_col=None) #, dtype=col_types, usecols=cols, na_values='')
edge_output = pd.read_csv('edge_output.csv', sep=';', header=0, index_col=None) #, dtype=col_types, usecols=cols, na_values='')

### TO DO:
- ~~Change above function to return edge id~~
- ~~Also need to work out position along edge for detector file (going for length/2)~~
- ~~How do you choose the value for a certain count point?~~
- ~~Compute detector information and generate detector file. **Note: add type**~~
- ~~Compute flow information and generate flow file~~
- ~~Use flowrouter.py~~

**Try using [dynamic calibrators](https://sumo.dlr.de/docs/Simulation/Calibrator.html#Calibrators) as per [this recommendation](https://www.researchgate.net/post/How-cab-I-build-a-realistic-road-traffic-scenario-using-real-traffic-data-and-SUMO).**
- ~~Generate traffic using randomTrips.py~~
- ~~Could validate these routes if necessary.~~
- ~~Create calibrators, according to traffic counter positions in the network.~~
- ~~Use routeprobes too~~

See [this page of the wiki](https://sumo.dlr.de/docs/Simulation/Calibrator.html#building_a_scenario_without_knowledge_of_routes_based_on_flow_measurements).
Running the simulation with the random demand as well as these `<calibrator>` and `<routeProbe>` definitions will achieve a simulation in which traffic matches the specified flows at each calibrator edge. However, the realism of traffic flow behind (or between) calibrators depends on the fit between random routes and real-world routes. **The importance of this fit increases with the size and complexity of the network between calibrator edges.**

- Trial sublane lane changing model for EV
- Plan visualisations from information extracted from simulation