In [1]:
import pandas as pd
import numpy as np
import datetime
import holidays
import seaborn as sns


from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60


In [3]:
def prepo(data):
    # dropping row_id
    data.drop('row_id', axis=1, inplace=True)

    ## Creating individual columns for time column(1991-04-01 03:20:00 --- year-month-day  Hours:minutes:seconds)
    data['year'] = data['time'].apply(lambda row: row[:4])
    data['month'] = data['time'].apply(lambda row: row.split('-')[1][:2] )
    data['date'] = data['time'].apply(lambda row: row.split('-')[2][:2])
    data['hour'] = data['time'].apply(lambda row: row.split(':')[0][-2:])
    data['minute'] = data['time'].apply(lambda row: row.split(':')[1][-2:])
    data['c_date'] = data['time'].apply(lambda row: row.split(' ')[0])

    ## Chainging dataframe dtypes
    data = data.astype({'year':'int64','month':'int64', 'date':'int64','minute':'int64',
                        'x':'str','y':'str', 'direction':'str', 'hour':'int64' })

    ## Creating new _column by combining x,y,& direction.
    data['road_coord']= data['x']+data['y']+data['direction']

    ### Creating a new column of holiday.
    hol_list = []
    us_holidays = holidays.US()
    for i in data['c_date']:
        if us_holidays.get(f'{i}') == None:
            hol_list.append(0)
        else:
            hol_list.append(1)

    data['is_holiday'] = hol_list


    ## Creating new column is_weekday
    dt_list = []
    for i,j in enumerate(data['c_date']):
        day_type = datetime.date(data['year'].iloc[i],data['month'].iloc[i],data['date'].iloc[i])
        dy = day_type.weekday()
        dt_list.append(dy)
    data['week_num'] = dt_list

    ## Categorizing month column --- 1-10--0(start-month), 11-20--1(mid_month), (21-30)-- end month.
    data['month_cat'] = pd.cut(data['date'], bins=[0,10,20,31], labels=[0,1,2])

    ## Categorizing is_weekday column 0-Weekday, 1-Weekend
    data['is_weekday'] = pd.cut(data['week_num'], bins=[-1,4,6], labels = [0, 1])
    # Season 0- spring and 1-summer.
    data['season'] = pd.cut(data['month'], bins=[3,6,9], labels = [0, 1])
    ## Categorizing new column based on day timings
    data['m_a_e_n'] = pd.cut(data['hour'], bins=[4,11,16,21], labels = [0, 1,2])
    data = data.astype({'m_a_e_n':'float64'})
    data['m_a_e_n'].fillna(3, inplace=True)
    data = data.astype({'m_a_e_n':'int64'})

    ## Encoding the highway code
    le = LabelEncoder()
    data['highway_code']= le.fit_transform(data['road_coord'])
    return data

In [4]:
data = prepo(data)
data.head()

Unnamed: 0,time,x,y,direction,congestion,year,month,date,hour,minute,c_date,road_coord,is_holiday,week_num,month_cat,is_weekday,season,m_a_e_n,highway_code
0,1991-04-01 00:00:00,0,0,EB,70,1991,4,1,0,0,1991-04-01,00EB,0,0,0,0,0,3,0
1,1991-04-01 00:00:00,0,0,NB,49,1991,4,1,0,0,1991-04-01,00NB,0,0,0,0,0,3,1
2,1991-04-01 00:00:00,0,0,SB,24,1991,4,1,0,0,1991-04-01,00SB,0,0,0,0,0,3,2
3,1991-04-01 00:00:00,0,1,EB,18,1991,4,1,0,0,1991-04-01,01EB,0,0,0,0,0,3,3
4,1991-04-01 00:00:00,0,1,NB,60,1991,4,1,0,0,1991-04-01,01NB,0,0,0,0,0,3,4


In [5]:
data = data.drop(['time','x','y','direction','year','c_date','road_coord','hour','minute','month','date'], axis=1)
print(data.shape)
data.dtypes

(848835, 8)


congestion         int64
is_holiday         int64
week_num           int64
month_cat       category
is_weekday      category
season          category
m_a_e_n            int64
highway_code       int32
dtype: object

In [6]:
data = data.astype({'month_cat':'int64','is_weekday':'int64', 'season':'int64'})
data.dtypes

congestion      int64
is_holiday      int64
week_num        int64
month_cat       int64
is_weekday      int64
season          int64
m_a_e_n         int64
highway_code    int32
dtype: object

In [7]:
def pairing(data, seq_len=195):
    x=[]
    y =[]
    for i in range(0, ((data.shape[0])-(seq_len+1)), seq_len+1):
        shp = (seq_len, data.shape[1] )
        seq = np.zeros(shp)

        for j in range(seq_len):
            seq[j] = data.values[i+j]

        x.append(seq.flatten())  # flattening the matrix and appending it to the x list
        y.append( data["congestion"][i+seq_len] )  # appending the target to the y list 
        
    return np.array(x), np.array(y)

In [8]:
x,y = pairing(data)

KeyboardInterrupt: 

In [None]:
x.shape

In [None]:
# df = pd.DataFrame(x)
# df.to_csv('x.csv')
# df_y = pd.DataFrame(y)
# df_y.to_csv('y.csv')