In [None]:
# Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.animation
import time
import pylab as pl
from IPython import display
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import time as t
import datetime
import dask.dataframe as dd
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))
%matplotlib inline 


# Visualizing the Training Data
In the following notebook, I try to create a visualization resembling a very simplified version of a real roadway network from the training data. I'm not sure how useful this is going to be with respect to modelling, but maybe we could draw some inferences from it. In any case, I thought it was a cool way to look at the training data (shown below).

<img src="https://drive.google.com/uc?export=view&id=1Qi_HEod-gRDvRDBY920ZE4ti49dfyywH">

# Data
The training data consists of the x and y coordinates of a roadway and its traffic direction. We are supplied with the level of congestion in the form of a time series for each of the roadways.

In [None]:
# Load the training set, preview it, then list the distinct values of the
# coordinate and direction columns.
data = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
print(data.head())
for column in ('x', 'y', 'direction'):
    print(data[column].unique())

The data contains only a small number of unique x, y coordinates and directions, so we can try to visualize it as something resembling the actual road network.

In [None]:
# One row per distinct (x, y, direction) combination found in the data.
# The per-group count is only a by-product of the grouping, so it is dropped.
data_dir = (
    data.groupby(['x', 'y', 'direction'])['congestion']
    .count()
    .reset_index()
    .drop(columns=['congestion'])
)
data_dir


# Defining offset coordinates and colors
Here we define the x-y offsets that we are going to use to plot the road network. The road network is drawn from a number of line segments, and the tuple defined for each direction below gives the offsets of the segment's end points from the midpoint of the road, in the order (x-start, x-end, y-start, y-end). For example, the tuple defined for the direction 'EB' is (-0.5,0.5,0,0). So suppose we have an East Bound (EB) road with its midpoint at (0,0); that road is then represented by a line segment which goes from point (0-0.5,0+0) to (0+0.5,0+0), i.e. from (-0.5,0) to (0.5,0). The other directions are defined similarly.

Since we also have roads going in opposite directions, for example 'NB' and 'SB', those are drawn as parallel lines, with one of them offset from the midpoint given in the training data, for better visualization.
If the explanation above isn't totally clear, then the plot below probably will be.

In [None]:
# Per-direction end-point offsets relative to a road's midpoint, stored as
# (x_start, x_end, y_start, y_end).
xy_offset = dict(
    EB=(-0.5, 0.5, 0, 0),
    NB=(0, 0, 0.5, -0.5),
    SB=(0.2, 0.2, 0.5, -0.5),
    WB=(0.5, -0.5, 0.2, 0.2),
    NE=(-0.5, 0.5, 0.5, -0.5),
    NW=(0.5, -0.5, 0.5, -0.5),
    SW=(-0.3, 0.7, 0.5, -0.5),
    SE=(-0.3, 0.7, -0.5, 0.5),
)

# Line colour per direction: a light shade for EB/NB/NE/NW and a dark shade
# for SB/WB/SW/SE.
colors = dict(
    EB="#89b0ae",
    NB="#89b0ae",
    SB="#2d4051",
    WB="#2d4051",
    NE="#89b0ae",
    NW="#89b0ae",
    SW="#2d4051",
    SE="#2d4051",
)

# Plotting the Road Network
The plot below looks like an idealized version of a city block. The network is pretty simplified, with the obvious assumption that the coordinates are equidistant, but it could still be a helpful way to visualize the data.
The light-colored roads represent roads going in the NB, EB, NE and NW directions. The remaining directions are represented by the dark color. As stated earlier, the opposite roadways are drawn by offsetting them parallel to the center line. For example, NB roads pass through the midpoint (x, y) as given in the data, but the SB roads have been charted by offsetting them in parallel for better visualization.

In [None]:
# Single-axes figure that the static road network is drawn on.
fig = plt.figure(figsize=(10, 13))
gs = fig.add_gridspec(nrows=1, ncols=1)
ax = fig.add_subplot(gs[0:, 0:])

# Title and axis captions.
ax.set_title("Road Network", fontsize=20, fontweight="bold", fontfamily='serif')
ax.set_xlabel("<- East   (Coordinates)     West ->", fontsize=16, fontfamily='serif')
ax.set_ylabel("<-South    (Coordinates)   North->", fontsize=16, fontfamily='serif')

# Hide the frame around the plot.
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

def show_congestion(data_dir):
    """Draw every road listed in ``data_dir`` on the notebook-level axes ``ax``.

    ``data_dir`` must provide ``x``, ``y`` and ``direction`` columns. Each row
    becomes one line segment whose end points are the row's midpoint shifted by
    the ``xy_offset`` entry for its direction, coloured via ``colors``. All
    segments use the same fixed width and full opacity.
    """
    for _, road in data_dir.iterrows():
        heading = road['direction']
        x_start, x_end, y_start, y_end = xy_offset[heading]
        ax.plot(
            [road['x'] + x_start, road['x'] + x_end],
            [road['y'] + y_start, road['y'] + y_end],
            linewidth=10,
            color=colors[heading],
            alpha=1,
        )
# Draw the static network: one segment per (x, y, direction) row of data_dir.
show_congestion(data_dir=data_dir)
        

# Animating the Network
Using matplotlib, we can spin up a very rudimentary animation showing how congestion varies over time for each of the roads given in the training data.

In [None]:
# Sorted array of every distinct timestamp present in the training data.
unique_times = np.sort(data['time'].unique())

# First and last entries of the sorted array bound the period the data covers.
min_time, max_time = unique_times[0], unique_times[-1]

print(min_time, max_time)

It is computationally very expensive, and not very reasonable, to animate the network for the whole of the training data, since it covers a whole year in 20 min intervals. So we define a time interval below for which we would like to visualize the data.

In [None]:
# Visualization window (start, end), written as "YYYY-MM-DD HH:MM:SS" strings.
start_time, end_time = '1991-04-01 00:00:00', '1991-04-02 19:00:00'

def get_selected_time(start_time,end_time,unique_times):
    """Return the timestamps of ``unique_times`` that fall inside a time window.

    The requested bounds are first snapped inward onto the 20-minute grid:
    the start is rounded up and the end is rounded down. A bound that already
    lies on a 20-minute mark is left untouched.

    Parameters
    ----------
    start_time, end_time : str
        Window bounds formatted as ``"%Y-%m-%d %H:%M:%S"``.
    unique_times : sequence of str
        Available timestamps in the same format (NumPy array or list).

    Returns
    -------
    list of str
        Every entry of ``unique_times`` between the snapped bounds (both
        inclusive), in the order it appears in ``unique_times``. An empty list
        is returned, after printing a message, when ``unique_times`` is empty
        or the requested window is not covered by it.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    available = list(unique_times)
    if not available:
        print("Invalid time interval!!")
        return []

    start = datetime.datetime.strptime(start_time, fmt)
    end = datetime.datetime.strptime(end_time, fmt)
    # The valid range is derived from the argument itself rather than from
    # notebook-level variables, so the function does not depend on hidden state.
    max_t = datetime.datetime.strptime(max(available), fmt)
    min_t = datetime.datetime.strptime(min(available), fmt)
    print(start,end,max_t,min_t)

    # Validating that the requested window falls within the available range
    if start < min_t or end > max_t:
        print("Invalid time interval!!")
        return []

    def _snap_down(moment):
        # Drop whatever lies past the previous 20-minute mark.
        return moment - datetime.timedelta(minutes=moment.minute % 20,
                                           seconds=moment.second,
                                           microseconds=moment.microsecond)

    # Round the start up only when it is not already on the grid; adding
    # "20 - minute % 20" unconditionally would push an aligned start a full
    # 20 minutes forward.
    snapped_start = _snap_down(start)
    if snapped_start != start:
        snapped_start += datetime.timedelta(minutes=20)
    start = snapped_start
    end = _snap_down(end)
    print(start,end)

    start_time = start.strftime(fmt)
    end_time = end.strftime(fmt)

    # The zero-padded format sorts lexicographically in chronological order,
    # so plain string comparison selects the window. Unlike list.index(), this
    # does not fail when a snapped bound is missing from the data.
    return [stamp for stamp in available if start_time <= stamp <= end_time]

# Timestamps inside the chosen window; their count is the number of frames.
times_filtered = get_selected_time(
    start_time=start_time,
    end_time=end_time,
    unique_times=unique_times,
)
print(len(times_filtered))

# Keep only the rows recorded at one of those timestamps.
data_filtered = data.loc[data['time'].isin(times_filtered)]
data_filtered.shape

In [None]:
# Build, ahead of time, one DataFrame per timestamp. Filtering the large
# DataFrame inside the plotting loop would make the animation slow.
# A comprehension is used so that no loop variable named `time` is left behind:
# a plain `for time in ...` at cell level rebinds the global name `time`, which
# the imports cell bound to the `time` module.
d_a = [data_filtered[data_filtered['time'] == stamp] for stamp in times_filtered]

# Animation
The animation is simple. For each of the roads in the network, we just vary its line width according to the congestion level: the lines are thinner at lower congestion levels and thicker when the congestion is high.


In [None]:
# Single-axes figure that the animation frames are drawn on.
fig = plt.figure(figsize=(14, 10))
gs = fig.add_gridspec(1, 1)
ax1 = fig.add_subplot(gs[:, :])

def animation(fig,ax1,d_a,u_f):
    """Redraw the road network once per frame so congestion changes are visible.

    Parameters
    ----------
    fig
        Figure passed to ``display.display`` after each frame is drawn.
    ax1
        Axes that is cleared and redrawn for every frame.
    d_a
        Iterable of DataFrames, one per frame, each providing ``x``, ``y``,
        ``direction`` and ``congestion`` columns.
    u_f
        Iterable of labels, paired with ``d_a`` by position; each label is
        written onto its frame.
    """
    # Hide the frame around the plot.
    for side in ('top', 'right', 'left', 'bottom'):
        ax1.spines[side].set_visible(False)

    caption_style = dict(fontsize=16, fontfamily='serif', color="grey")

    # Each element of d_a holds the rows for one frame; u_f supplies its label.
    for frame, stamp in zip(d_a, u_f):
        ax1.clear()

        for _, road in frame.iterrows():
            heading = road['direction']
            x_start, x_end, y_start, y_end = xy_offset[heading]
            # Line width encodes the congestion level of the road.
            ax1.plot(
                [road['x'] + x_start, road['x'] + x_end],
                [road['y'] + y_start, road['y'] + y_end],
                linewidth=road['congestion'] / 5,
                color=colors[heading],
            )

        # clear() drops limits, text and captions, so they are set on every frame.
        ax1.set_xlim(-1, 3)
        ax1.set_ylim(-1, 4)
        ax1.text(1.6, 3.8, stamp, fontsize=16, fontweight='bold', color='grey')
        ax1.set_title("Road Network", fontsize=20, fontweight="bold",
                      fontfamily='serif', color="grey")
        ax1.set_xlabel("<- East   (Coordinates)     West ->", **caption_style)
        ax1.set_ylabel("<-South    (Coordinates)   North->", **caption_style)

        # Show the frame, schedule its replacement by the next one, then wait.
        display.display(fig)
        display.clear_output(wait=True)
        plt.pause(0.1)

        
# Play the animation: one frame per entry of d_a, labelled with its timestamp.
animation(fig=fig, ax1=ax1, d_a=d_a, u_f=times_filtered)


# Isolating roads and directions
It can be difficult to infer anything when viewing the whole network at once. So it would probably be more useful to isolate a few roads and look only at them, which is simple to do.

In [None]:
# Fresh single-axes figure for the isolated-roads animation.
fig = plt.figure(figsize = (14,10))
gs = fig.add_gridspec(nrows=1, ncols=1)
ax1 = fig.add_subplot(gs[0:, 0:])

# Roads to isolate: only rows whose x, y and direction all appear in these lists.
filter_coordinates_x = [0]
filter_coordinates_y = [0,1,2,3]

filter_direction = ["NE","NB","SB"]

# NOTE(review): this statement also rebinds `data_filtered`, which an earlier
# cell uses for the time-window subset. The rebinding is kept so the notebook's
# variables end up exactly as before; consider dropping it.
da_f = data_filtered = data[(data['direction'].isin(filter_direction)) & 
                           (data['x'].isin(filter_coordinates_x)) &
                           (data['y'].isin(filter_coordinates_y))]

# Time window to animate.
start_time = '1991-04-01 00:00:00'
end_time = '1991-04-02 19:00:00'

filter_t =  get_selected_time(start_time,end_time,unique_times)

# One DataFrame per timestamp. A comprehension is used so that no loop variable
# named `time` is left behind: a plain `for time in ...` at cell level rebinds
# the global name `time`, which the imports cell bound to the `time` module.
df_array = [da_f[da_f['time'] == stamp] for stamp in filter_t]

animation(fig,ax1,df_array,filter_t)

So, we could probably find a correlation between the congestion levels along the same pathways, which we can already see from the limited animation. However, I'm not sure how useful it would be in informing modelling choices. Let me know if anything is wrong here, or if it could be made better in some way (I'm sure it could).