In [3]:
#Import package pandas for data analysis
import pandas as pd
# Import package numpy for numeric computing
import numpy as np
from numpy import int64
from numpy import float64
from numpy import datetime64
# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
# Allows plots to appear directly in the notebook.
%matplotlib inline
# For dealing with some Accented characters (in Irish Place names)
import unidecode
# Date/time functionality
import datetime
import time
# Check if files exist
from os.path import exists
from os import makedirs
# System specific parameters and functions
import sys
# look at some z-scores for inspecting outliers.
from scipy import stats
import seaborn as sb
# lookup lat/long and convert lat/long to national grid references.
import geopy
import pyproj

from patsy import dmatrices
from sklearn import metrics

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
#from sklearn.tree import export_text

from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split

import graphviz
from graphviz import Source
#to read all CSV files in a folder
import os
import glob

Reading data in chunks and adding dataframes to a list
        
        *Note: You don't need to run the cell below if the chunk files have already been created; call the get_chunks function instead to build the chunks list.*

In [3]:
# Split the raw semicolon-separated vehicles export into pieces of at most
# `chunk_size` rows. Every piece is kept in memory (chunk_vehicles_list) and
# also saved as its own CSV file so later sessions can reload it.
chunk_size = 10000000
batch_no=1
chunk_vehicles_list=[]
# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before the first chunk is written.
if not os.path.isdir('Chunks/vehicles'):
    os.makedirs('Chunks/vehicles')
for chunk in pd.read_csv('~/tmp/data/rt_vehicles_DB_2018.txt',sep=';',chunksize=chunk_size):
    chunk_vehicles_list.append(chunk)
    # Files are named "<batch_no>chunk_vehicles.csv", numbered from 1.
    chunk.to_csv('Chunks/vehicles/'+str(batch_no)+'chunk_vehicles'+'.csv',index=False)
    batch_no+=1
    print(chunk.shape)

(272622, 7)


Reading Data from multiple files in a folder to a list of data frames

In [4]:
def get_chunks(location):
    """Load every CSV file of a folder into a list of DataFrames.

    Parameters
    ----------
    location : str
        Folder path that is appended verbatim to the current working
        directory, so it needs a leading separator
        (e.g. '/Chunks/vehicles').

    Returns
    -------
    list of pandas.DataFrame
        One frame per '*.csv' file found. Files are ordered by the number at
        the start of their name (1..., 2..., 10...); files without a leading
        number come last, in alphabetical order. The list is empty when the
        folder holds no CSV files.
    """
    def _batch_order(csv_path):
        # Sort key: numeric batch prefix first, so '10chunk' follows '2chunk'.
        file_name = os.path.basename(csv_path)
        prefix_len = 0
        while prefix_len < len(file_name) and file_name[prefix_len] in '0123456789':
            prefix_len += 1
        if prefix_len:
            return (0, int(file_name[:prefix_len]), file_name)
        return (1, 0, file_name)

    cwd=os.getcwd()
    path = cwd+location
    # glob yields files in arbitrary, OS-dependent order; sort them so the
    # chunk list (and anything concatenated from it) is reproducible.
    chunk_folder = sorted(glob.glob(os.path.join(path, "*.csv")), key=_batch_order)
    chunk_list=[]
    for filename in chunk_folder:
        # read the csv file
        df_chunk = pd.read_csv(filename)
        chunk_list.append(df_chunk)
        print('Location:', filename)
    return chunk_list

In [5]:
# Reload the saved vehicle chunks from the Chunks/vehicles folder under the
# current working directory.
chunk_vehicles_list=get_chunks('/Chunks/vehicles')

('Location:', '/home/team8/notebook/Pelin/Chunks/vehicles/1chunk_vehicles.csv')


Combining the chunks into one big dataframe

In [6]:
# Stack all chunk frames row-wise (axis=0). Each chunk keeps its own index
# labels, because ignore_index is left at its default.
df_vehicles=pd.concat(chunk_vehicles_list, axis=0)

In [8]:
# (rows, columns) of the combined frame.
df_vehicles.shape

(272622, 7)

In [10]:
# Preview the first five rows of the combined frame.
df_vehicles.head(5)

Unnamed: 0,DATASOURCE,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES,LASTUPDATE,NOTE
0,DB,23-NOV-18 00:00:00,3303848,286166,58849,04-DEC-18 08:03:09,
1,DB,23-NOV-18 00:00:00,3303847,259545,56828,04-DEC-18 08:03:09,
2,DB,28-FEB-18 00:00:00,2868329,103096,40967,08-MAR-18 10:35:59,
3,DB,28-FEB-18 00:00:00,2868330,147277,43599,08-MAR-18 10:35:59,
4,DB,28-FEB-18 00:00:00,2868331,224682,40447,08-MAR-18 10:35:59,


In [11]:
# Preview the last five rows of the combined frame.
df_vehicles.tail(5)

Unnamed: 0,DATASOURCE,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES,LASTUPDATE,NOTE
272617,DB,29-DEC-18 00:00:00,3393878,264237,62320,16-JAN-19 18:00:42,
272618,DB,29-DEC-18 00:00:00,3394105,250335,52254,16-JAN-19 18:00:42,
272619,DB,29-DEC-18 00:00:00,3394109,172539,44349,16-JAN-19 18:00:42,
272620,DB,29-DEC-18 00:00:00,3394130,188057,38948,16-JAN-19 18:00:42,
272621,DB,29-DEC-18 00:00:00,3394131,291697,63677,16-JAN-19 18:00:42,


In [12]:
# For every chunk, report how many distinct values each column holds,
# followed by a ruler line that separates one chunk's report from the next.
for df in chunk_vehicles_list:
    distinct_counts = df.nunique()
    print(distinct_counts)
    print("=" * 66)

DATASOURCE           1
DAYOFSERVICE       360
VEHICLEID         1152
DISTANCE        170498
MINUTES          57523
LASTUPDATE         360
NOTE                 0
dtype: int64


There are 1152 vehicles in this dataset; data for 1019 of them is available in the leavetimes dataset.

In [14]:
# List the column labels of the combined frame.
df_vehicles.columns

Index([u'DATASOURCE', u'DAYOFSERVICE', u'VEHICLEID', u'DISTANCE', u'MINUTES',
       u'LASTUPDATE', u'NOTE'],
      dtype='object')

Dropping empty and constant columns from all chunks in dataset.

In [6]:
# Remove the constant DATASOURCE column and the empty NOTE column from every
# chunk, in place (the list keeps referring to the same frames).
# errors='ignore' keeps this cell re-runnable: chunks reloaded from files that
# were already rewritten without these columns no longer raise a KeyError.
for df in chunk_vehicles_list:
    df.drop(['DATASOURCE', 'NOTE'], axis=1, inplace=True, errors='ignore')

In [9]:
# Strip every space character out of the column labels of each chunk, so
# names with trailing or embedded blanks become plain identifiers.
for df in chunk_vehicles_list:
    df.rename(columns=lambda label: label.replace(' ', ''), inplace=True)

In [10]:
# Rebuild the combined dataframe from the chunks so that it reflects the
# columns that were just dropped and renamed.
df_vehicles=pd.concat(chunk_vehicles_list, axis=0)

In [11]:
# Preview the first five rows after the column clean-up.
df_vehicles.head(5)

Unnamed: 0,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES,LASTUPDATE
0,23-NOV-18 00:00:00,3303848,286166,58849,04-DEC-18 08:03:09
1,23-NOV-18 00:00:00,3303847,259545,56828,04-DEC-18 08:03:09
2,28-FEB-18 00:00:00,2868329,103096,40967,08-MAR-18 10:35:59
3,28-FEB-18 00:00:00,2868330,147277,43599,08-MAR-18 10:35:59
4,28-FEB-18 00:00:00,2868331,224682,40447,08-MAR-18 10:35:59


In [12]:
def update_csv_chunks(adress,name, c_list):
    """Write every chunk of a list to its own numbered CSV file.

    Files are written to
    ``<current working directory> + adress + <batch number> + name + '.csv'``
    with batch numbers starting at 1, without the index column. Existing
    files with the same name are overwritten.

    Parameters
    ----------
    adress : str
        Path prefix appended verbatim to the current working directory;
        pass it with leading and trailing separators
        (e.g. '/Chunks/vehicles/').
    name : str
        Text placed between the batch number and the '.csv' extension.
    c_list : iterable of pandas.DataFrame
        Chunks to save, in batch order.

    Returns
    -------
    None
        The shape of each chunk is printed after it has been written.
    """
    cwd=os.getcwd()
    path = cwd+adress
    for batch_no, chunk in enumerate(c_list, 1):
        target_file = path+str(batch_no)+name+'.csv'
        # to_csv does not create missing folders, so create the target folder
        # on demand instead of failing on a fresh checkout.
        target_folder = os.path.dirname(target_file)
        if target_folder and not os.path.isdir(target_folder):
            os.makedirs(target_folder)
        chunk.to_csv(target_file,index=False)
        print(chunk.shape)

In [13]:
# Overwrite the saved chunk files with the cleaned chunks (same
# "<n>chunk_vehicles.csv" naming as when they were first written).
update_csv_chunks('/Chunks/vehicles/','chunk_vehicles', chunk_vehicles_list)

(272622, 5)


Creating PK feature columns to merge with other DB datasets

In [14]:
# Build the merge key of every chunk: the service day text immediately
# followed by the vehicle id text, with no separator in between.
for df in chunk_vehicles_list:
    service_day_text = df['DAYOFSERVICE'].astype('str')
    vehicle_id_text = df['VEHICLEID'].astype('str')
    df['PK_4_VehICLES'] = service_day_text + vehicle_id_text

In [15]:
# Rebuild the combined dataframe so it includes the newly derived
# PK_4_VehICLES column of every chunk.
df_vehicles=pd.concat(chunk_vehicles_list, axis=0)

In [16]:
# Preview the first five rows, now including the derived key column.
df_vehicles.head(5)

Unnamed: 0,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES,LASTUPDATE,PK_4_VehICLES
0,23-NOV-18 00:00:00,3303848,286166,58849,04-DEC-18 08:03:09,23-NOV-18 00:00:003303848
1,23-NOV-18 00:00:00,3303847,259545,56828,04-DEC-18 08:03:09,23-NOV-18 00:00:003303847
2,28-FEB-18 00:00:00,2868329,103096,40967,08-MAR-18 10:35:59,28-FEB-18 00:00:002868329
3,28-FEB-18 00:00:00,2868330,147277,43599,08-MAR-18 10:35:59,28-FEB-18 00:00:002868330
4,28-FEB-18 00:00:00,2868331,224682,40447,08-MAR-18 10:35:59,28-FEB-18 00:00:002868331


In [17]:
# Save the chunks again so the files on disk also contain the key column.
update_csv_chunks('/Chunks/vehicles/','chunk_vehicles', chunk_vehicles_list)

(272622, 6)


Converting features into appropriate data types

In [33]:
# Cast both timestamp columns of every chunk to plain strings before they are
# parsed into datetimes further down.
for df in chunk_vehicles_list:
    for date_column in ('DAYOFSERVICE', 'LASTUPDATE'):
        df[date_column] = df[date_column].astype('str')

In [34]:
# Rebuild the combined dataframe after the string conversion of the chunks.
df_vehicles=pd.concat(chunk_vehicles_list, axis=0)

In [35]:
# Inspect the dtype of every column in the combined frame.
df_vehicles.dtypes

DAYOFSERVICE       object
VEHICLEID        category
DISTANCE            int64
MINUTES             int64
LASTUPDATE         object
PK_4_VehICLES      object
dtype: object

In [36]:
# Preview the first five rows of the combined frame.
df_vehicles.head(5)

Unnamed: 0,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES,LASTUPDATE,PK_4_VehICLES
0,2018-11-23,3303848,286166,58849,2018-12-04 08:03:09,23-NOV-18 00:00:003303848
1,2018-11-23,3303847,259545,56828,2018-12-04 08:03:09,23-NOV-18 00:00:003303847
2,2018-02-28,2868329,103096,40967,2018-03-08 10:35:59,28-FEB-18 00:00:002868329
3,2018-02-28,2868330,147277,43599,2018-03-08 10:35:59,28-FEB-18 00:00:002868330
4,2018-02-28,2868331,224682,40447,2018-03-08 10:35:59,28-FEB-18 00:00:002868331


In [37]:
def vehicles_list_dtype_converter(df):
    """Convert the columns of one vehicles chunk to their working dtypes.

    The frame is modified in place and also returned, so the function can be
    mapped over a list of chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk with the columns DAYOFSERVICE, VEHICLEID, DISTANCE, MINUTES,
        LASTUPDATE and PK_4_VehICLES.

    Returns
    -------
    pandas.DataFrame
        The same frame, with DAYOFSERVICE and LASTUPDATE parsed to datetimes,
        VEHICLEID as category, DISTANCE and MINUTES as int64, and
        PK_4_VehICLES cast to str.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a timestamp column matches none of the supported layouts.
    """
    def _to_timestamp(column):
        # The raw export stores timestamps as '23-NOV-18 00:00:00', while
        # chunks converted in an earlier run hold ISO text. Try each known
        # layout in turn so the cell works on a fresh kernel as well as on
        # already converted data.
        last_error = None
        for layout in ("%Y-%m-%d %H:%M:%S", "%d-%b-%y %H:%M:%S", "%Y-%m-%d"):
            try:
                return pd.to_datetime(column, format=layout)
            except (ValueError, TypeError) as error:
                last_error = error
        raise last_error

    df['DAYOFSERVICE'] = _to_timestamp(df['DAYOFSERVICE'])
    df['VEHICLEID']=df['VEHICLEID'].astype('category')
    df['DISTANCE']=df['DISTANCE'].astype('int64')
    df['MINUTES']=df['MINUTES'].astype('int64')
    df['LASTUPDATE'] = _to_timestamp(df['LASTUPDATE'])
    df['PK_4_VehICLES']=df['PK_4_VehICLES'].astype('str')
    print(df.dtypes)
    return df

In [38]:
# Convert every chunk and keep the results as a real list. A bare map() is
# lazy on Python 3: nothing would be converted until it is iterated, and it
# could only be iterated once, although later cells use the name as a list.
chunk_vehicles_list=list(map(vehicles_list_dtype_converter,chunk_vehicles_list))

DAYOFSERVICE     datetime64[ns]
VEHICLEID              category
DISTANCE                  int64
MINUTES                   int64
LASTUPDATE       datetime64[ns]
PK_4_VehICLES            object
dtype: object
