# Statistics Meets Logistics
This notebook holds the DataFrames and analysis for the project.

Requirements for the project environment can be found in https://github.com/luiul/statistics-meets-logistics/blob/main/requirements.txt. The project has not been tested in any other environment. 

## 📋 Outline of Project (WIP)
- Import
- Apply Pandas knowledge to DataFrame
    - Conditional filtering
    - Useful Methods
    - Check for missing data

## 📚 Import Libraries

In [1]:
import numpy as np
import pandas as pd

pd.set_option('float_format', '{:.2f}'.format)
# prints floats with two decimal point

# calling np.version.version should return 1.18.1
# calling pd.__version__ should return 1.1.2

## 🖼 Prepare DataFrames & Describe Raw Features

In [2]:
dl = pd.read_csv('raw_data_dl.csv')
ul = pd.read_csv('raw_data_ul.csv')

# dl is the DataFrame for the download raw data
# ul is the DataFrame for the upload raw data

In [3]:
dl

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ss,ta,ci,pci,id,payload,throughput,rtt,measurement,location
0,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.00,79.35,1,...,50,7,26385408,95,0,0.10,6.84,41,1544432927,campus
1,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,52,4,29391105,167,1,2.00,9.71,58,1544432927,campus
2,32.46,1544432959,325.26,51.49,7.42,154.64,6.19,-0.62,27.05,1,...,54,4,29391105,167,2,2.00,7.31,57,1544432927,campus
3,46.40,1544432973,448.27,51.49,7.42,155.87,9.77,-0.98,342.45,1,...,45,4,29391105,167,3,3.00,3.95,163,1544432927,campus
4,54.95,1544432982,540.48,51.49,7.42,154.41,12.33,0.01,336.68,1,...,45,4,29391105,167,4,5.00,8.55,59,1544432927,campus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9349,421.52,1547803330,2908.77,51.51,7.46,159.23,5.05,0.00,137.18,1,...,49,1,26378755,441,41,0.50,4.62,48,1547802908,urban
9350,434.63,1547803343,3010.99,51.51,7.46,155.79,6.32,0.00,101.37,1,...,53,5,27299332,326,42,10.00,19.88,35,1547802908,urban
9351,441.33,1547803350,3044.81,51.51,7.46,150.10,4.93,0.00,109.23,1,...,55,5,27299332,326,43,0.50,5.87,43,1547802908,urban
9352,453.15,1547803361,3146.50,51.51,7.47,151.15,11.74,0.00,95.13,1,...,46,5,27299332,326,44,5.00,15.73,69,1547802908,urban


In [4]:
ul

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ta,ci,pci,id,payload,throughput,rtt,txPower,measurement,location
0,11.53,1544432938,113.75,51.49,7.41,156.39,12.21,0.50,77.59,1,...,7,26385408,95,0,4.00,24.52,35,12.30,1544432927,campus
1,21.33,1544432948,231.40,51.49,7.42,152.53,11.17,0.00,87.24,1,...,4,29391105,167,1,2.00,14.86,51,10.02,1544432927,campus
2,32.22,1544432959,323.73,51.49,7.42,154.23,6.44,0.00,48.55,1,...,4,29391105,167,2,4.00,16.27,57,4.34,1544432927,campus
3,45.99,1544432973,444.10,51.49,7.42,155.89,10.11,0.06,344.34,1,...,4,29391105,167,3,9.00,12.68,54,17.11,1544432927,campus
4,54.69,1544432982,537.34,51.49,7.42,154.41,12.33,0.02,336.68,1,...,4,29391105,167,4,8.00,14.59,60,17.31,1544432927,campus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9353,422.52,1547803331,2914.37,51.51,7.46,159.96,6.28,0.00,135.25,1,...,1,26378755,441,41,5.00,21.37,48,17.83,1547802908,urban
9354,434.11,1547803342,3007.72,51.51,7.46,155.79,6.32,0.00,101.37,1,...,5,27299332,326,42,8.00,18.77,46,17.12,1547802908,urban
9355,442.91,1547803351,3053.18,51.51,7.46,147.95,6.77,1.34,118.12,1,...,5,27299332,326,43,9.00,31.82,42,11.08,1547802908,urban
9356,451.53,1547803360,3128.44,51.51,7.46,150.03,10.24,0.00,93.22,1,...,5,27299332,326,44,2.00,17.45,36,17.28,1547802908,urban


In [5]:
dl.info()
# download raw data
# almost 9,400 entries, unlabeled index, 24 data columns, all values are non-null
# dtypes: ints (13), floats (10), string / object (1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9354 entries, 0 to 9353
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   timestamp    9354 non-null   float64
 1   rawTimesamp  9354 non-null   int64  
 2   distance     9354 non-null   float64
 3   lat          9354 non-null   float64
 4   lon          9354 non-null   float64
 5   alt          9354 non-null   float64
 6   speed        9354 non-null   float64
 7   acc          9354 non-null   float64
 8   dir          9354 non-null   float64
 9   connected    9354 non-null   int64  
 10  rsrp         9354 non-null   int64  
 11  rsrq         9354 non-null   int64  
 12  sinr         9354 non-null   int64  
 13  cqi          9354 non-null   int64  
 14  ss           9354 non-null   int64  
 15  ta           9354 non-null   int64  
 16  ci           9354 non-null   int64  
 17  pci          9354 non-null   int64  
 18  id           9354 non-null   int64  
 19  payloa

In [6]:
ul.info()
# upload raw data
# almost 9,400 entries, unlabeled index, 25 data columns, all values are non-null
# dtypes: ints (13), floats (11), string / object (1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9358 entries, 0 to 9357
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   timestamp    9358 non-null   float64
 1   rawTimesamp  9358 non-null   int64  
 2   distance     9358 non-null   float64
 3   lat          9358 non-null   float64
 4   lon          9358 non-null   float64
 5   alt          9358 non-null   float64
 6   speed        9358 non-null   float64
 7   acc          9358 non-null   float64
 8   dir          9358 non-null   float64
 9   connected    9358 non-null   int64  
 10  rsrp         9358 non-null   int64  
 11  rsrq         9358 non-null   int64  
 12  sinr         9358 non-null   int64  
 13  cqi          9358 non-null   int64  
 14  ss           9358 non-null   int64  
 15  ta           9358 non-null   int64  
 16  ci           9358 non-null   int64  
 17  pci          9358 non-null   int64  
 18  id           9358 non-null   int64  
 19  payloa

In [7]:
dl.columns
# timestamp: ? (drop this column?)
# rawTimesamp: unix timestamp; visit unixtimestamp.com/ to convert (correction of column name!)
# distance: distance from cell tower
# lat: latitude
# lon: longitude
# alt: altitude relative to NHN
# speed: [speed] = m/s
# acc: [acc] = m/s**2; contains negative values!
# dir: [dir] = grad; north === 0; dir: S -> [0,359.99]
# connect: LTE connection; here always one (drop this column!)

# signal performance indicator; visit cablefree.net/wirelesstechnology/4glte/rsrp-rsrq-measurement-lte/ for more detail
    # rsrp: performance indicator based on path loss; all values are negative (possibly irrelevant!)
    # rsrq: performance indicator; can contain negative values (possible irrelevant!)
    # sinr: performance indicator; can contain negative values (possible irrelevant!)
    # cqi: ? 
    # ss === Arbitrary Strength Unit (ASU); rsrp = ASU - 140 (redudant! drop this column!) 
    # ta: ? 
    # ci: ? 

# pci: physical cell ID; internal information (irrelevant!)
# id: vehicle ID
# payload: [payload] = MB
# throughput: [throughput] = mbits (key value!)
# rtt: [rtt] = ms; round trip time for the signal (vehicle -> tower -> vehicle)
# measuremt: experiment number 
# location: name of the location; location: S -> {'campus', 'highway', 'suburban', 'urban'}

# there is no column reflecting the frequency of the signal (see Variablenbeschreibung in moodle) (?)

# drop: timestamp, connect, ss, pci
# rename: rawTimesamp -> timestamp_raw

Index(['timestamp', 'rawTimesamp', 'distance', 'lat', 'lon', 'alt', 'speed',
       'acc', 'dir', 'connected', 'rsrp', 'rsrq', 'sinr', 'cqi', 'ss', 'ta',
       'ci', 'pci', 'id', 'payload', 'throughput', 'rtt', 'measurement',
       'location'],
      dtype='object')

In [8]:
ul.columns
# txPower: [txPower] = dBm; measurement of signal strength 

# drop: timestamp, connect, ss, pci
# rename: rawTimesamp -> timestamp_raw

Index(['timestamp', 'rawTimesamp', 'distance', 'lat', 'lon', 'alt', 'speed',
       'acc', 'dir', 'connected', 'rsrp', 'rsrq', 'sinr', 'cqi', 'ss', 'ta',
       'ci', 'pci', 'id', 'payload', 'throughput', 'rtt', 'txPower',
       'measurement', 'location'],
      dtype='object')

In [9]:
# Variablenbeschreibung

# - timestamp_ms ist ein Unix Timestamp, welcher die Umrechnung in eine konkrete Uhrzeit ermöglicht
# (siehe https://www.unixtimestamp.com/)
# - altitude_m die Höhe über Normalnull
# - veclocity ist in meter/s und acceleration in meter/s^2: Für die Beschleunigung sind negative Werte durch Bremsen 
# möglich (Änderung der Geschwindigkeit)
# - direction ist die Richtung in Grad, Norden entspricht 0 - isRegistered gibt an, ob das Gerät eine aktive 
# Verbindung zu einer LTE Zelle hat
# - rsrp ist ein Indikator für die Empfangsleistung und durch den Pfadverlust immer negativ – das heißt dann 
# einfach, dass nur sehr geringe Leistungsmengen beim Endgerät ankommen - rsrq und rssinr sind Verhältnisse von 
# Leistungen, auch hier sind negative werte möglich 
# (siehe auch https://www.cablefree.net/wirelesstechnology/4glte/rsrp-rsrq-measurement-lte/)
# - ss entspricht der Arbitrary Strength Unit (ASU) und ist redundant zum RSRP, da RSRP = ASU – 140
# - pci ist die Physical Cell Id, welche intern vom Endgerät verwendet wird, um Codierungsaufgaben zu machen 
# -> Sollte für euch nicht wichtig sein
# - payload entspricht der übertragenden Datenmenge in Megabyte
# - througput_mbits entspricht der Datenrate und somit der Zielgröße
# - connected 1 wenn eine Verbindung besteht 0 sonst
# - rtt_ms ist die Round Trip Time: Also die Signallaufzeit vom Sender zum Empfänger und wieder zurück
# - txPower_dBm entspricht der Sendeleistung des Endgerätes (somit auch nur im Uplink verfügbar, weil das 
# Endgerät im Downlink nur empfängt)
# - f_mhz entspricht der Trägerfrequenz der Basisstation, daher ist dies auch in wichtiges Feature, da die 
# Frequenz einen großen Einfluss auf die Funkausbreitungseigenschaften hat 
# - measurement ist die jeweilige Messfahrt

In [10]:
dl.describe().transpose()
# some statistical values have no meaningful interpretation!

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
timestamp,9354.0,355.79,245.0,9.93,153.16,302.74,533.48,1040.95
rawTimesamp,9354.0,1545877552.83,1394528.27,1544432937.0,1544679623.0,1545134772.0,1547616352.0,1547803369.0
distance,9354.0,4312.5,3794.56,99.42,1381.51,2851.75,6464.33,14124.02
lat,9354.0,51.48,0.02,51.44,51.46,51.48,51.49,51.52
lon,9354.0,7.45,0.03,7.39,7.42,7.45,7.47,7.49
alt,9354.0,177.26,41.81,118.5,151.96,163.84,182.28,297.82
speed,9354.0,12.23,8.95,0.0,6.91,12.45,15.0,39.34
acc,9354.0,0.01,1.15,-11.05,0.0,0.0,0.0,17.26
dir,9354.0,165.62,93.23,0.03,92.3,151.14,251.46,359.98
connected,9354.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [11]:
ul.describe().transpose()
# some statistical values have no meaningful interpretation!

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
timestamp,9358.0,355.21,244.92,9.94,153.09,301.4,532.41,1040.54
rawTimesamp,9358.0,1545878041.95,1394599.51,1544432938.0,1544679629.0,1545134782.0,1547616347.0,1547803369.0
distance,9358.0,4301.63,3786.89,79.87,1378.81,2841.4,6473.86,14125.34
lat,9358.0,51.48,0.02,51.44,51.46,51.48,51.49,51.52
lon,9358.0,7.45,0.03,7.39,7.42,7.45,7.47,7.49
alt,9358.0,177.24,41.83,118.17,151.92,163.78,182.09,297.16
speed,9358.0,12.24,8.97,0.0,6.93,12.47,15.01,39.34
acc,9358.0,0.02,0.94,-6.03,0.0,0.0,0.0,13.04
dir,9358.0,165.33,93.26,0.04,91.84,151.19,250.56,359.95
connected,9358.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [12]:
dl['id'].unique()
# array of vehicle IDs?

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  30,  31,  32,   9,  20,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103])

In [13]:
ul['id'].unique()
# array of vehicle IDs?

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103])

In [14]:
dl['id'].unique() == ul['id'].unique()
# different vehicles for upload and download experiments

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [15]:
dl['location'].unique()

array(['campus', 'highway', 'suburban', 'urban'], dtype=object)

In [16]:
dl['location'].unique() == ul['location'].unique()

array([ True,  True,  True,  True])

In [17]:
# continue with section 3

In [18]:
len(dl['timestamp'].unique())

2286

In [19]:
len(dl['rawTimesamp'].unique())

2338

In [20]:
dl.sort_values('rawTimesamp').head(10)

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,ss,ta,ci,pci,id,payload,throughput,rtt,measurement,location
0,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,1,...,50,7,26385408,95,0,0.1,6.84,41,1544432927,campus
7015,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,1,...,50,7,26385408,95,0,0.1,6.84,41,1544432927,urban
2338,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,1,...,50,7,26385408,95,0,0.1,6.84,41,1544432927,highway
4676,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,1,...,50,7,26385408,95,0,0.1,6.84,41,1544432927,suburban
7016,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,52,4,29391105,167,1,2.0,9.71,58,1544432927,urban
1,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,52,4,29391105,167,1,2.0,9.71,58,1544432927,campus
4677,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,52,4,29391105,167,1,2.0,9.71,58,1544432927,suburban
2339,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,52,4,29391105,167,1,2.0,9.71,58,1544432927,highway
4678,32.46,1544432959,325.26,51.49,7.42,154.64,6.19,-0.62,27.05,1,...,54,4,29391105,167,2,2.0,7.31,57,1544432927,suburban
7017,32.46,1544432959,325.26,51.49,7.42,154.64,6.19,-0.62,27.05,1,...,54,4,29391105,167,2,2.0,7.31,57,1544432927,urban


In [21]:
dl_no_loc = dl.drop('location',axis=1)

In [22]:
dl_no_loc.sort_values('rawTimesamp').head(10)

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,cqi,ss,ta,ci,pci,id,payload,throughput,rtt,measurement
0,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,1,...,10,50,7,26385408,95,0,0.1,6.84,41,1544432927
7015,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,1,...,10,50,7,26385408,95,0,0.1,6.84,41,1544432927
2338,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,1,...,10,50,7,26385408,95,0,0.1,6.84,41,1544432927
4676,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,1,...,10,50,7,26385408,95,0,0.1,6.84,41,1544432927
7016,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,13,52,4,29391105,167,1,2.0,9.71,58,1544432927
1,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,13,52,4,29391105,167,1,2.0,9.71,58,1544432927
4677,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,13,52,4,29391105,167,1,2.0,9.71,58,1544432927
2339,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,13,52,4,29391105,167,1,2.0,9.71,58,1544432927
4678,32.46,1544432959,325.26,51.49,7.42,154.64,6.19,-0.62,27.05,1,...,15,54,4,29391105,167,2,2.0,7.31,57,1544432927
7017,32.46,1544432959,325.26,51.49,7.42,154.64,6.19,-0.62,27.05,1,...,15,54,4,29391105,167,2,2.0,7.31,57,1544432927


In [23]:
dl_no_loc.drop_duplicates()

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,...,cqi,ss,ta,ci,pci,id,payload,throughput,rtt,measurement
0,10.33,1544432937,99.42,51.49,7.41,157.63,11.83,0.00,79.35,1,...,10,50,7,26385408,95,0,0.10,6.84,41,1544432927
1,21.87,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,1,...,13,52,4,29391105,167,1,2.00,9.71,58,1544432927
2,32.46,1544432959,325.26,51.49,7.42,154.64,6.19,-0.62,27.05,1,...,15,54,4,29391105,167,2,2.00,7.31,57,1544432927
3,46.40,1544432973,448.27,51.49,7.42,155.87,9.77,-0.98,342.45,1,...,9,45,4,29391105,167,3,3.00,3.95,163,1544432927
4,54.95,1544432982,540.48,51.49,7.42,154.41,12.33,0.01,336.68,1,...,9,45,4,29391105,167,4,5.00,8.55,59,1544432927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2336,453.15,1547803361,3146.50,51.51,7.47,151.15,11.74,0.00,95.13,1,...,6,46,5,27299332,326,44,5.00,15.73,69,1547802908
2337,460.73,1547803369,3239.49,51.51,7.47,154.61,11.71,0.00,96.02,1,...,12,44,5,27299332,326,45,0.10,5.13,38,1547802908
2347,104.50,1544433031,756.20,51.49,7.42,155.86,1.03,0.86,355.63,1,...,7,48,4,29391105,167,9,9.00,16.86,47,1544432927
2358,213.14,1544433140,1690.01,51.49,7.40,163.39,0.00,0.00,268.19,1,...,7,45,7,26385410,94,20,5.00,13.59,38,1544432927


In [24]:
# pd.to_datetime(dl['rawTimesamp'],unit='s')
# pd.to_datetime(ul['rawTimesamp'],unit='s')
# change the timestamp format if neccessary

In [25]:
# apply filters to DataFrame? 

# sub-DataFrame per location with a filter or with isin() method
    # dl[dl['location']=='campus']
    # dl[dl['location']=='highway']
    # dl[dl['location']=='suburban']
    # dl[dl['location']=='urban']
    
    # ul[ul['location']=='campus']
    # ul[ul['location']=='highway']
    # ul[ul['location']=='suburban']
    # ul[ul['location']=='urban']