# Statistics Meets Logistics
This notebook holds the DataFrames and analysis for the project.

Requirements for the project environment can be found in https://github.com/luiul/statistics-meets-logistics/blob/main/requirements.txt. Disclaimer: the project has not been tested in other environment. 

# 📋 Outline of Project (WIP)
- Import
- Apply Pandas knowledge to DataFrame
    - Conditional filtering
    - Useful Methods
    - Check for missing data

# 📚 Import Libraries & Set Options

In [1]:
import numpy as np
import pandas as pd

# calling np.version.version should return 1.18.1
# calling pd.__version__ should return 1.1.2

In [41]:
pd.set_option('display.max_columns', None)
# avoid truncate view of DataFrame (scroll to view all columns)
# set to 0 for pandas to auto-detect the with of the terminal and print truncated object
# that fits the screen width

# pd.set_option('float_format', '{:.2f}'.format)
# prints floats with two decimal points
# do not comment out in this project since the features lat and lon
# have to many significant decimal points

# 🖼 Prepare DataFrames & Explore Features

In [2]:
dl = pd.read_csv('raw_data_dl.csv', delim_whitespace=True)
ul = pd.read_csv('raw_data_ul.csv', delim_whitespace=True)
# dl is the DataFrame for the download raw data
# ul is the DataFrame for the upload raw data

In [40]:
dl

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,rsrp,rsrq,sinr,cqi,ss,ta,ci,pci,id,payload,throughput,rtt,measurement,location
0,10.33,1544432937,99.42,51.490553,7.413966,157.63,11.83,0.00,79.35,1,-85,-5,22,10,50,7,26385408,95,0,0.1,6.83763,41,1544432927,campus
1,21.87,1544432949,237.43,51.490715,7.416002,152.41,10.76,-0.52,89.45,1,-84,-6,11,13,52,4,29391105,167,1,2.0,9.71463,58,1544432927,campus
2,32.46,1544432959,325.26,51.490668,7.417176,154.64,6.19,-0.62,27.05,1,-82,-6,21,15,54,4,29391105,167,2,2.0,7.30594,57,1544432927,campus
3,46.40,1544432973,448.27,51.491839,7.416804,155.87,9.77,-0.98,342.45,1,-94,-9,9,9,45,4,29391105,167,3,3.0,3.94997,163,1544432927,campus
4,54.95,1544432982,540.48,51.492531,7.416222,154.41,12.33,0.01,336.68,1,-90,-6,16,9,45,4,29391105,167,4,5.0,8.54884,59,1544432927,campus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2136,421.52,1547803330,2908.77,51.510228,7.461691,159.23,5.05,0.00,137.18,1,-89,-7,7,7,49,1,26378755,441,41,0.5,4.61894,48,1547802908,urban
2137,434.63,1547803343,3010.99,51.509847,7.463047,155.79,6.32,0.00,101.37,1,-85,-7,10,8,53,5,27299332,326,42,10.0,19.87580,35,1547802908,urban
2138,441.33,1547803350,3044.81,51.509787,7.463589,150.10,4.93,0.00,109.23,1,-82,-5,20,10,55,5,27299332,326,43,0.5,5.87372,43,1547802908,urban
2139,453.15,1547803361,3146.50,51.509798,7.465158,151.15,11.74,0.00,95.13,1,-90,-7,10,6,46,5,27299332,326,44,5.0,15.72950,69,1547802908,urban


In [4]:
ul

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ta,ci,pci,id,payload,throughput,rtt,txPower,measurement,location
0,11.53,1544432938,113.75,51.490592,7.414306,156.39,12.21,0.50,77.59,1,...,7,26385408,95,0,4.0,24.52110,35,12.30470,1544432927,campus
1,21.33,1544432948,231.40,51.490717,7.415835,152.53,11.17,0.00,87.24,1,...,4,29391105,167,1,2.0,14.85610,51,10.01560,1544432927,campus
2,32.22,1544432959,323.73,51.490634,7.417138,154.23,6.44,0.00,48.55,1,...,4,29391105,167,2,4.0,16.26840,57,4.34375,1544432927,campus
3,54.69,1544432982,537.34,51.492531,7.416222,154.41,12.33,0.02,336.68,1,...,4,29391105,167,4,8.0,14.58860,60,17.31250,1544432927,campus
4,63.98,1544432991,650.07,51.493484,7.415836,153.05,11.69,-0.18,348.09,1,...,4,29391105,167,5,6.0,13.12550,57,19.93750,1544432927,campus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2137,422.52,1547803331,2914.37,51.510185,7.461754,159.96,6.28,0.00,135.25,1,...,1,26378755,441,41,5.0,21.36750,48,17.82810,1547802908,urban
2138,434.11,1547803342,3007.72,51.509847,7.463047,155.79,6.32,0.00,101.37,1,...,5,27299332,326,42,8.0,18.77380,46,17.11720,1547802908,urban
2139,442.91,1547803351,3053.18,51.509741,7.463746,147.95,6.77,1.34,118.12,1,...,5,27299332,326,43,9.0,31.81620,42,11.07810,1547802908,urban
2140,451.53,1547803360,3128.44,51.509786,7.464816,150.03,10.24,0.00,93.22,1,...,5,27299332,326,44,2.0,17.44820,36,17.28130,1547802908,urban


In [5]:
dl.info()
# download raw data
# almost 2,200 entries, unlabeled index, 24 data columns, all values are non-null
# dtypes: ints (13), floats (10), string / object (1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2141 entries, 0 to 2140
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   timestamp    2141 non-null   float64
 1   rawTimesamp  2141 non-null   int64  
 2   distance     2141 non-null   float64
 3   lat          2141 non-null   float64
 4   lon          2141 non-null   float64
 5   alt          2141 non-null   float64
 6   speed        2141 non-null   float64
 7   acc          2141 non-null   float64
 8   dir          2141 non-null   float64
 9   connected    2141 non-null   int64  
 10  rsrp         2141 non-null   int64  
 11  rsrq         2141 non-null   int64  
 12  sinr         2141 non-null   int64  
 13  cqi          2141 non-null   int64  
 14  ss           2141 non-null   int64  
 15  ta           2141 non-null   int64  
 16  ci           2141 non-null   int64  
 17  pci          2141 non-null   int64  
 18  id           2141 non-null   int64  
 19  payloa

In [6]:
ul.info()
# upload raw data
# almost 2,200 entries, unlabeled index, 25 data columns, all values are non-null
# dtypes: ints (13), floats (11), string / object (1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2142 entries, 0 to 2141
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   timestamp    2142 non-null   float64
 1   rawTimesamp  2142 non-null   int64  
 2   distance     2142 non-null   float64
 3   lat          2142 non-null   float64
 4   lon          2142 non-null   float64
 5   alt          2142 non-null   float64
 6   speed        2142 non-null   float64
 7   acc          2142 non-null   float64
 8   dir          2142 non-null   float64
 9   connected    2142 non-null   int64  
 10  rsrp         2142 non-null   int64  
 11  rsrq         2142 non-null   int64  
 12  sinr         2142 non-null   int64  
 13  cqi          2142 non-null   int64  
 14  ss           2142 non-null   int64  
 15  ta           2142 non-null   int64  
 16  ci           2142 non-null   int64  
 17  pci          2142 non-null   int64  
 18  id           2142 non-null   int64  
 19  payloa

In [7]:
dl.columns
# timestamp: seems like an 'arbitrary' variable (drop this column?)
# rawTimesamp: unix timestamp; visit unixtimestamp.com/ to convert (correction of column name!)
# distance: distance from cell tower
# lat: latitude
# lon: longitude
# alt: altitude relative to NHN
# speed: [speed] = m/s
# acc: [acc] = m/s**2; contains negative values!
# dir: [dir] = grad; north === 0; dir: S -> [0,359.99]
# connect: LTE connection; here always one (drop this column!)

# signal performance indicator; visit cablefree.net/wirelesstechnology/4glte/rsrp-rsrq-measurement-lte/ for more detail
    # rsrp: performance indicator based on path loss; all values are negative (possibly irrelevant!)
    # rsrq: performance indicator; can contain negative values (possible irrelevant!)
    # sinr: performance indicator; can contain negative values (possible irrelevant!)
    # cqi: ? 
    # ss === Arbitrary Strength Unit (ASU); rsrp = ASU - 140 (redudant! drop this column!) 
    # ta: ? 
    # ci: ? 

# pci: physical cell ID; internal information (irrelevant!)
# id: vehicle ID
# payload: [payload] = MB
# throughput: [throughput] = mbits (key value!)
# rtt: [rtt] = ms; round trip time for the signal (vehicle -> tower -> vehicle)
# measuremt: experiment number 
# location: name of the location; location: S -> {'campus', 'highway', 'suburban', 'urban'}

# there is no column reflecting the frequency of the signal (see Variablenbeschreibung in moodle) (?)

# drop: timestamp, connect, ss, pci (?)
# rename: rawTimesamp -> timestamp

Index(['timestamp', 'rawTimesamp', 'distance', 'lat', 'lon', 'alt', 'speed',
       'acc', 'dir', 'connected', 'rsrp', 'rsrq', 'sinr', 'cqi', 'ss', 'ta',
       'ci', 'pci', 'id', 'payload', 'throughput', 'rtt', 'measurement',
       'location'],
      dtype='object')

In [8]:
ul.columns
# txPower: [txPower] = dBm; measurement of signal strength 

# drop: timestamp, connect, ss, pci (?)
# rename: rawTimesamp -> timestamp

Index(['timestamp', 'rawTimesamp', 'distance', 'lat', 'lon', 'alt', 'speed',
       'acc', 'dir', 'connected', 'rsrp', 'rsrq', 'sinr', 'cqi', 'ss', 'ta',
       'ci', 'pci', 'id', 'payload', 'throughput', 'rtt', 'txPower',
       'measurement', 'location'],
      dtype='object')

In [9]:
# Variablenbeschreibung

# - timestamp_ms ist ein Unix Timestamp, welcher die Umrechnung in eine konkrete Uhrzeit ermöglicht
# (siehe https://www.unixtimestamp.com/)
# - altitude_m die Höhe über Normalnull
# - veclocity ist in meter/s und acceleration in meter/s^2: Für die Beschleunigung sind negative Werte durch Bremsen 
# möglich (Änderung der Geschwindigkeit)
# - direction ist die Richtung in Grad, Norden entspricht 0 - isRegistered gibt an, ob das Gerät eine aktive 
# Verbindung zu einer LTE Zelle hat
# - rsrp ist ein Indikator für die Empfangsleistung und durch den Pfadverlust immer negativ – das heißt dann 
# einfach, dass nur sehr geringe Leistungsmengen beim Endgerät ankommen - rsrq und rssinr sind Verhältnisse von 
# Leistungen, auch hier sind negative werte möglich 
# (siehe auch https://www.cablefree.net/wirelesstechnology/4glte/rsrp-rsrq-measurement-lte/)
# - ss entspricht der Arbitrary Strength Unit (ASU) und ist redundant zum RSRP, da RSRP = ASU – 140
# - pci ist die Physical Cell Id, welche intern vom Endgerät verwendet wird, um Codierungsaufgaben zu machen 
# -> Sollte für euch nicht wichtig sein
# - payload entspricht der übertragenden Datenmenge in Megabyte
# - througput_mbits entspricht der Datenrate und somit der Zielgröße
# - connected 1 wenn eine Verbindung besteht 0 sonst
# - rtt_ms ist die Round Trip Time: Also die Signallaufzeit vom Sender zum Empfänger und wieder zurück
# - txPower_dBm entspricht der Sendeleistung des Endgerätes (somit auch nur im Uplink verfügbar, weil das 
# Endgerät im Downlink nur empfängt)
# - f_mhz entspricht der Trägerfrequenz der Basisstation, daher ist dies auch in wichtiges Feature, da die 
# Frequenz einen großen Einfluss auf die Funkausbreitungseigenschaften hat 
# - measurement ist die jeweilige Messfahrt

In [37]:
dl.isnull().sum()
# no missing data

timestamp      0
rawTimesamp    0
distance       0
lat            0
lon            0
alt            0
speed          0
acc            0
dir            0
connected      0
rsrp           0
rsrq           0
sinr           0
cqi            0
ss             0
ta             0
ci             0
pci            0
id             0
payload        0
throughput     0
rtt            0
measurement    0
location       0
dtype: int64

In [38]:
ul.isnull().sum()
# no missing data

timestamp      0
rawTimesamp    0
distance       0
lat            0
lon            0
alt            0
speed          0
acc            0
dir            0
connected      0
rsrp           0
rsrq           0
sinr           0
cqi            0
ss             0
ta             0
ci             0
pci            0
id             0
payload        0
throughput     0
rtt            0
txPower        0
measurement    0
location       0
dtype: int64

In [10]:
dl.describe().transpose()
# some statistical values have no meaningful interpretation!

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
timestamp,2141.0,355.0805,245.9953,9.93,152.54,301.88,532.81,1040.95
rawTimesamp,2141.0,1545881000.0,1395781.0,1544433000.0,1544680000.0,1545135000.0,1547616000.0,1547803000.0
distance,2141.0,4273.933,3774.43,99.42,1365.83,2825.78,6435.79,14124.02
lat,2141.0,51.47864,0.02252776,51.43714,51.46308,51.48165,51.49067,51.51791
lon,2141.0,7.445595,0.03079581,7.393019,7.416637,7.450149,7.473018,7.491755
alt,2141.0,177.2354,41.97569,125.06,152.0,163.7,181.92,297.82
speed,2141.0,12.1547,8.870322,0.0,6.86,12.45,14.99,39.34
acc,2141.0,0.00444652,1.123975,-11.05,0.0,0.0,0.0,17.26
dir,2141.0,164.8287,93.1786,0.03,92.3,150.27,250.64,359.98
connected,2141.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [11]:
ul.describe().transpose()
# some statistical values have no meaningful interpretation!

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
timestamp,2142.0,354.4113,244.4693,9.94,153.8925,301.36,530.9725,1040.54
rawTimesamp,2142.0,1545889000.0,1395255.0,1544433000.0,1544680000.0,1545135000.0,1547616000.0,1547803000.0
distance,2142.0,4282.102,3766.634,79.87,1378.81,2836.01,6444.035,14125.34
lat,2142.0,51.47852,0.02267911,51.43718,51.46261,51.48165,51.49066,51.52182
lon,2142.0,7.445505,0.03073957,7.392997,7.41664,7.450149,7.472981,7.491727
alt,2142.0,177.2895,41.98303,118.17,151.9175,163.77,182.185,297.16
speed,2142.0,12.33604,8.950635,0.0,7.1525,12.54,15.095,39.34
acc,2142.0,0.02055556,0.9230749,-6.03,0.0,0.0,0.0,13.04
dir,2142.0,165.6892,93.09316,0.04,91.34,151.375,251.1425,359.95
connected,2142.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [12]:
dl['id'].unique()
# array of vehicle IDs?

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  30,  31,  32,  29,  33,  34,  35,  36,  37,  38,
        39,  41,  42,  43,  44,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  57,  58,  59,  60,  62,  64,  65,  66,  67,  68,  69,
        70,  71,  40,  45,  56,  61,  63,  72,  73,  74,  75,  76,  78,
        79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  77,  85,  99, 100, 101, 102, 103])

In [13]:
ul['id'].unique()
# array of vehicle IDs?

array([  0,   1,   2,   4,   5,   6,   7,   8,  10,  11,  12,  13,  14,
        15,  16,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
        30,  31,  32,   3,   9,  17,  18,  34,  35,  36,  37,  40,  39,
        41,  42,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  57,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
        70,  71,  38,  43,  73,  56,  58,  33,  72,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  84,  85,  87,  88,  90,  91,  93,  94,
        95,  96,  97,  98,  99,  83,  86,  89,  92, 100, 101, 102, 103])

In [14]:
dl['id'].unique() == ul['id'].unique()
# note that the vehicles are not identical 

array([ True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True, False, False, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,  True,  True,  True])

In [15]:
dl['location'].unique()

array(['campus', 'highway', 'suburban', 'urban'], dtype=object)

In [16]:
dl['location'].unique() == ul['location'].unique()

array([ True,  True,  True,  True])

In [17]:
len(dl['timestamp'].unique())

2095

In [18]:
len(dl['rawTimesamp'].unique())
# note that 'timestamp' and 'rawTimesamp' are differing amount of unique values

2138

In [19]:
len(ul['timestamp'].unique())

2084

In [20]:
len(ul['rawTimesamp'].unique())
# note that 'timestamp' and 'rawTimesamp' are differing amount of unique values

2140

In [21]:
dl.equals(dl.drop_duplicates())
# no duplicates in download DataFrame

True

In [22]:
ul.equals(ul.drop_duplicates())
# no duplicates in upload DataFrame

True

In [23]:
dl.sort_values('rawTimesamp').head()

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ss,ta,ci,pci,id,payload,throughput,rtt,measurement,location
0,10.33,1544432937,99.42,51.490553,7.413966,157.63,11.83,0.0,79.35,1,...,50,7,26385408,95,0,0.1,6.83763,41,1544432927,campus
1,21.87,1544432949,237.43,51.490715,7.416002,152.41,10.76,-0.52,89.45,1,...,52,4,29391105,167,1,2.0,9.71463,58,1544432927,campus
2,32.46,1544432959,325.26,51.490668,7.417176,154.64,6.19,-0.62,27.05,1,...,54,4,29391105,167,2,2.0,7.30594,57,1544432927,campus
3,46.4,1544432973,448.27,51.491839,7.416804,155.87,9.77,-0.98,342.45,1,...,45,4,29391105,167,3,3.0,3.94997,163,1544432927,campus
4,54.95,1544432982,540.48,51.492531,7.416222,154.41,12.33,0.01,336.68,1,...,45,4,29391105,167,4,5.0,8.54884,59,1544432927,campus


In [24]:
# pd.to_datetime(dl['rawTimesamp'],unit='s')
# pd.to_datetime(ul['rawTimesamp'],unit='s')
# change the timestamp format if neccessary

In [25]:
# apply filters to DataFrame? 

# sub-DataFrame per location with a filter or with isin() method
    # dl[dl['location']=='campus']
    # dl[dl['location']=='highway']
    # dl[dl['location']=='suburban']
    # dl[dl['location']=='urban']
    
    # ul[ul['location']=='campus']
    # ul[ul['location']=='highway']
    # ul[ul['location']=='suburban']
    # ul[ul['location']=='urban']

## Part 4

In [26]:
dl.corr()

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,cqi,ss,ta,ci,pci,id,payload,throughput,rtt,measurement
timestamp,1.0,-0.095189,0.783976,-0.258318,0.137001,-0.014361,0.091416,-0.001915,-0.077123,,...,-0.069575,0.068113,0.052477,0.127474,0.165228,0.999898,0.030017,0.080304,-0.037556,-0.095363
rawTimesamp,-0.095189,1.0,-0.072429,0.31631,0.28845,-0.1462,-0.041262,-0.035921,-0.063609,,...,-0.078858,-0.056486,-0.002465,-0.147401,0.20024,-0.094833,0.043842,0.044219,-0.059295,1.0
distance,0.783976,-0.072429,1.0,-0.244318,-0.243813,0.067715,0.329632,-0.003321,0.067191,,...,0.047723,0.139942,0.119168,0.082589,-0.12929,0.783489,0.036347,0.132209,0.000996,-0.072565
lat,-0.258318,0.31631,-0.244318,1.0,-0.325227,-0.79586,-0.452051,-0.065115,-0.132508,,...,-0.048432,0.110843,-0.092231,-0.346866,-0.076895,-0.256608,-0.005548,0.037768,-0.093787,0.31635
lon,0.137001,0.28845,-0.243813,-0.325227,1.0,0.409877,-0.053622,0.026486,-0.135801,,...,-0.092208,-0.206719,-0.051666,0.083267,0.59764,0.136366,0.024384,-0.041582,0.002664,0.288421
alt,-0.014361,-0.1462,0.067715,-0.79586,0.409877,1.0,0.296487,0.0558,0.071459,,...,0.16629,-0.013427,-0.134802,0.045965,0.019848,-0.015282,0.002225,0.020332,0.078667,-0.146195
speed,0.091416,-0.041262,0.329632,-0.452051,-0.053622,0.296487,1.0,0.074545,0.393086,,...,0.069456,0.005707,0.265169,0.309203,-0.080954,0.090504,-0.004443,0.028435,0.050264,-0.041278
acc,-0.001915,-0.035921,-0.003321,-0.065115,0.026486,0.0558,0.074545,1.0,0.032983,,...,-0.017115,0.029493,-0.015142,0.028241,0.038521,-0.002199,-0.016029,-0.020694,0.067178,-0.03592
dir,-0.077123,-0.063609,0.067191,-0.132508,-0.135801,0.071459,0.393086,0.032983,1.0,,...,0.013737,-0.045645,0.176845,0.262824,-0.067199,-0.078117,-0.007059,-0.045619,0.048768,-0.063594
connected,,,,,,,,,,,...,,,,,,,,,,


In [27]:
ul.corr()

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ss,ta,ci,pci,id,payload,throughput,rtt,txPower,measurement
timestamp,1.0,-0.098971,0.784438,-0.252098,0.131718,-0.016756,0.081001,-0.033556,-0.080902,,...,0.06992,0.044423,0.123807,0.16518,0.999947,0.003628,0.082215,-0.027811,-0.014971,-0.099144
rawTimesamp,-0.098971,1.0,-0.075157,0.317175,0.288499,-0.156773,-0.044742,-0.032168,-0.059578,,...,-0.061574,0.008418,-0.127935,0.200293,-0.098598,-0.011906,0.018246,-0.042838,0.077858,1.0
distance,0.784438,-0.075157,1.0,-0.246007,-0.244973,0.06696,0.322627,-0.020394,0.063113,,...,0.147037,0.104631,0.086078,-0.126801,0.784505,-0.003637,0.093922,0.005488,-0.145458,-0.075293
lat,-0.252098,0.317175,-0.246007,1.0,-0.318605,-0.795056,-0.45802,-0.076073,-0.138082,,...,0.105169,-0.096354,-0.370375,-0.090659,-0.250089,-0.036697,0.059516,-0.084135,0.030732,0.317213
lon,0.131718,0.288499,-0.244973,-0.318605,1.0,0.405465,-0.054661,0.030919,-0.114084,,...,-0.210855,-0.039842,0.089816,0.597597,0.130409,0.004622,-0.053865,0.014156,0.158138,0.288471
alt,-0.016756,-0.156773,0.06696,-0.795056,0.405465,1.0,0.294326,0.104308,0.067212,,...,0.004861,-0.135756,0.065282,0.02377,-0.018365,0.019788,-0.009411,0.059062,-0.165039,-0.156768
speed,0.081001,-0.044742,0.322627,-0.45802,-0.054661,0.294326,1.0,0.044231,0.397196,,...,-0.002702,0.260665,0.335793,-0.07448,0.081129,-0.029272,-0.01917,0.058423,-0.052709,-0.044756
acc,-0.033556,-0.032168,-0.020394,-0.076073,0.030919,0.104308,0.044231,1.0,0.017558,,...,0.018937,-0.032659,-0.021572,-0.000357,-0.033679,-0.001753,0.025546,0.033462,-0.035285,-0.032162
dir,-0.080902,-0.059578,0.063113,-0.138082,-0.114084,0.067212,0.397196,0.017558,1.0,,...,-0.045176,0.18569,0.280383,-0.04907,-0.081057,-0.018687,-0.055719,0.052331,0.011563,-0.059563
connected,,,,,,,,,,,...,,,,,,,,,,


In [28]:
dl['location'].value_counts()
# good categorical data to group by

suburban    858
highway     640
urban       363
campus      280
Name: location, dtype: int64

In [29]:
ul['location'].value_counts()
# good categorical data to group by

# is there more categorical data to group by? 

suburban    848
highway     650
urban       366
campus      278
Name: location, dtype: int64

In [30]:
dl.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2136    False
2137    False
2138    False
2139    False
2140    False
Length: 2141, dtype: bool

In [31]:
dl.nlargest(5,'throughput')

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ss,ta,ci,pci,id,payload,throughput,rtt,measurement,location
569,351.39,1545111116,6575.87,51.443189,7.440798,225.68,34.92,0.0,287.42,1,...,63,18,33527552,376,34,9.0,43.0108,34,1545110765,highway
1737,531.63,1547647008,4644.25,51.471942,7.456441,161.8,12.67,0.0,113.7,1,...,48,7,29568768,415,52,10.0,41.7755,38,1547646477,suburban
1220,522.06,1544795439,4223.41,51.473138,7.450812,157.52,12.32,1.48,123.14,1,...,54,7,29568768,415,51,10.0,39.0434,35,1544794917,suburban
2021,252.6,1547802025,2419.58,51.51274,7.456824,143.03,14.9,0.0,129.58,1,...,52,1,26796290,25,24,10.0,38.5356,39,1547801772,urban
612,131.81,1545197312,1347.12,51.440304,7.485386,282.17,14.68,-0.37,233.53,1,...,61,1,27771908,270,12,8.0,38.3923,36,1545197180,highway


In [32]:
dl.nsmallest(5,'throughput')

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ss,ta,ci,pci,id,payload,throughput,rtt,measurement,location
642,502.67,1545197683,10570.49,51.468872,7.403781,160.82,28.28,-1.1,313.05,1,...,50,4,27770885,33,48,0.1,0.070194,83,1545197180,highway
639,477.31,1545197658,9746.63,51.464091,7.413126,152.47,34.49,0.0,310.4,1,...,43,47,27771140,49,43,1.0,0.219262,99,1545197180,highway
641,502.28,1545197683,10559.41,51.468698,7.40409,159.97,28.79,0.0,312.63,1,...,41,31,28368642,26,42,3.0,0.33306,97,1545197180,highway
510,423.25,1545025788,8483.03,51.454449,7.42288,160.98,35.76,0.0,343.96,1,...,44,47,27771140,49,41,0.1,0.355241,357,1545025365,highway
643,505.13,1545197685,10640.45,51.469219,7.40317,162.97,28.61,0.19,313.15,1,...,45,4,27770883,35,47,1.0,0.472283,38,1545197180,highway


In [33]:
ul.nlargest(5,'throughput')

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ta,ci,pci,id,payload,throughput,rtt,txPower,measurement,location
682,232.08,1545283773,3589.23,51.443682,7.483773,264.66,23.56,0.0,268.47,1,...,1,27771909,271,22,10.0,40.0601,35,3.125,1545283541,highway
873,121.61,1547702877,1470.85,51.439657,7.484012,284.98,15.56,0.93,232.66,1,...,1,27771908,270,11,9.0,38.1356,39,-1.38281,1547702755,highway
915,551.88,1547703307,12465.05,51.481754,7.397372,184.83,15.7,-0.86,56.48,1,...,4,27771139,48,54,10.0,37.0542,37,13.125,1547702755,highway
869,81.68,1547702837,854.07,51.44343,7.490552,275.55,16.2,0.72,216.73,1,...,6,27771907,272,7,9.0,36.4557,40,-0.304688,1547702755,highway
701,421.74,1545283963,9350.65,51.461367,7.41742,150.83,30.86,0.0,322.48,1,...,8,27770883,35,41,8.0,36.343,37,14.0781,1545283541,highway


In [34]:
ul.nsmallest(5,'throughput')

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ta,ci,pci,id,payload,throughput,rtt,txPower,measurement,location
281,64.76,1544593376,719.82,51.444633,7.491625,278.6,0.0,0.0,127.23,1,...,7,27771907,272,5,0.1,0.277201,358,21.5625,1544593312,highway
312,413.43,1544593725,8485.02,51.454994,7.422637,159.56,35.48,-0.35,343.36,1,...,44,28365063,274,40,0.1,0.42017,488,19.4531,1544593312,highway
804,101.71,1547615940,739.44,51.444386,7.491662,271.31,5.64,0.0,215.72,1,...,6,27771907,272,8,0.5,0.432994,736,21.7813,1547615839,highway
377,413.79,1544679848,8453.38,51.454167,7.422987,160.98,27.69,-0.18,343.81,1,...,45,28365063,274,40,0.1,0.454031,433,18.9531,1544679435,highway
636,476.96,1545197657,9734.56,51.464091,7.413126,152.47,34.49,0.37,310.4,1,...,6,33527554,375,46,0.1,0.534761,5322,10.3281,1545197180,highway


## Part 5

In [35]:
# df.groupby(['model_year','cylinders']).describe().transpose()

In [36]:
dl['rawTimesamp']
# convert the rawTimesamp from an integer into a datetime object

0       1544432937
1       1544432949
2       1544432959
3       1544432973
4       1544432982
           ...    
2136    1547803330
2137    1547803343
2138    1547803350
2139    1547803361
2140    1547803369
Name: rawTimesamp, Length: 2141, dtype: int64