# Statistics Meets Logistics
This notebook holds the DataFrames and analysis for the project.

Requirements for the project environment can be found in https://github.com/luiul/statistics-meets-logistics/blob/main/requirements.txt. Disclaimer: the project has not been tested in other environments. 

# 📋 Outline of Project (WIP)
- Import
- Apply Pandas knowledge to DataFrame
    - Conditional filtering
    - Useful Methods
    - Check for missing data

# 📚 Import Libraries & Set Library Options

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns',None)
# avoid truncate view of DataFrame (scroll to view all columns); set to 0 for pandas to auto-detect the with of the terminal and print truncated object that fits the screen width

# pd.set_option('float_format', '{:.2f}'.format)
# prints floats with two decimal points; do not comment out in this project since the features lat and lon have sigficant figures after two decimal points

# calling np.version.version should return 1.18.1
# calling pd.__version__ should return 1.1.2

# 🖼 Prepare DataFrames & Explore Features

In [2]:
dl = pd.read_csv('raw_data_dl.csv', delim_whitespace=True)
# dl is the DataFrame for the download raw data

ul = pd.read_csv('raw_data_ul.csv', delim_whitespace=True)
# ul is the DataFrame for the upload raw data

In [38]:
dl.head(3)

Unnamed: 0,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,rsrp,rsrq,sinr,cqi,ss,ta,ci,pci,id,payload,throughput,rtt,measurement,location
0,1544432937,99.42,51.49,7.41,157.63,11.83,0.0,79.35,-85,-5,22,10,50,7,26385408,95,0,0.1,6.84,41,1544432927,campus
1,1544432949,237.43,51.49,7.42,152.41,10.76,-0.52,89.45,-84,-6,11,13,52,4,29391105,167,1,2.0,9.71,58,1544432927,campus
2,1544432959,325.26,51.49,7.42,154.64,6.19,-0.62,27.05,-82,-6,21,15,54,4,29391105,167,2,2.0,7.31,57,1544432927,campus


In [4]:
ul.head(3)

Unnamed: 0,timestamp,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,connected,rsrp,rsrq,sinr,cqi,ss,ta,ci,pci,id,payload,throughput,rtt,txPower,measurement,location
0,11.53,1544432938,113.75,51.490592,7.414306,156.39,12.21,0.5,77.59,1,-85,-5,22,10,50,7,26385408,95,0,4.0,24.5211,35,12.3047,1544432927,campus
1,21.33,1544432948,231.4,51.490717,7.415835,152.53,11.17,0.0,87.24,1,-84,-6,11,13,52,4,29391105,167,1,2.0,14.8561,51,10.0156,1544432927,campus
2,32.22,1544432959,323.73,51.490634,7.417138,154.23,6.44,0.0,48.55,1,-82,-6,21,15,54,4,29391105,167,2,4.0,16.2684,57,4.34375,1544432927,campus


In [5]:
dl.isnull().sum().sum()
# no missing data in dl

0

In [6]:
ul.isnull().sum().sum()
# no missing data in ul

0

In [7]:
dl.info()
# dl has almost 2,200 data points, unlabeled index, 24 features, no missing data; dtypes: 13 ints, 10 floats, and 1 object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2141 entries, 0 to 2140
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   timestamp    2141 non-null   float64
 1   rawTimesamp  2141 non-null   int64  
 2   distance     2141 non-null   float64
 3   lat          2141 non-null   float64
 4   lon          2141 non-null   float64
 5   alt          2141 non-null   float64
 6   speed        2141 non-null   float64
 7   acc          2141 non-null   float64
 8   dir          2141 non-null   float64
 9   connected    2141 non-null   int64  
 10  rsrp         2141 non-null   int64  
 11  rsrq         2141 non-null   int64  
 12  sinr         2141 non-null   int64  
 13  cqi          2141 non-null   int64  
 14  ss           2141 non-null   int64  
 15  ta           2141 non-null   int64  
 16  ci           2141 non-null   int64  
 17  pci          2141 non-null   int64  
 18  id           2141 non-null   int64  
 19  payloa

In [8]:
ul.info()
# ul has almost 2,200 data points, unlabeled index, 25 features (versus 24), no missing data; dtypes: 13 ints, 11 floats and 1 object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2142 entries, 0 to 2141
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   timestamp    2142 non-null   float64
 1   rawTimesamp  2142 non-null   int64  
 2   distance     2142 non-null   float64
 3   lat          2142 non-null   float64
 4   lon          2142 non-null   float64
 5   alt          2142 non-null   float64
 6   speed        2142 non-null   float64
 7   acc          2142 non-null   float64
 8   dir          2142 non-null   float64
 9   connected    2142 non-null   int64  
 10  rsrp         2142 non-null   int64  
 11  rsrq         2142 non-null   int64  
 12  sinr         2142 non-null   int64  
 13  cqi          2142 non-null   int64  
 14  ss           2142 non-null   int64  
 15  ta           2142 non-null   int64  
 16  ci           2142 non-null   int64  
 17  pci          2142 non-null   int64  
 18  id           2142 non-null   int64  
 19  payloa

In [9]:
# timestamp: seems like an 'arbitrary' variable (drop this column?)
# rawTimesamp: unix timestamp; visit unixtimestamp.com/ to convert (correction of column name!)
# distance: distance from cell tower
# lat: latitude
# lon: longitude
# alt: altitude relative to NHN
# speed: [speed] = m/s
# acc: [acc] = m/s**2; contains negative values!
# dir: [dir] = grad; north === 0; dir: S -> [0,359.99]
# connect: LTE connection; here always one (drop this column!)

# signal performance indicator; visit cablefree.net/wirelesstechnology/4glte/rsrp-rsrq-measurement-lte/ for more detail
    # rsrp: performance indicator based on path loss; all values are negative (possibly irrelevant!)
    # rsrq: performance indicator; can contain negative values (possible irrelevant!)
    # sinr: performance indicator; can contain negative values (possible irrelevant!)
    # cqi: ? 
    # ss === Arbitrary Strength Unit (ASU); rsrp = ASU - 140 (redudant! drop this column!) 
    # ta: ? 
    # ci: ? 

# pci: physical cell ID; internal information (irrelevant!)
# id: vehicle ID
# payload: [payload] = MB
# throughput: [throughput] = mbits (key value!)
# rtt: [rtt] = ms; round trip time for the signal (vehicle -> tower -> vehicle)
# measuremt: experiment number 
# location: name of the location; location: S -> {'campus', 'highway', 'suburban', 'urban'}

# there is no column reflecting the frequency of the signal (see Variablenbeschreibung in moodle) (?)

# drop: timestamp, connect, ss, pci (?)
# rename: rawTimesamp -> timestamp
dl.columns

Index(['timestamp', 'rawTimesamp', 'distance', 'lat', 'lon', 'alt', 'speed',
       'acc', 'dir', 'connected', 'rsrp', 'rsrq', 'sinr', 'cqi', 'ss', 'ta',
       'ci', 'pci', 'id', 'payload', 'throughput', 'rtt', 'measurement',
       'location'],
      dtype='object')

In [10]:
# txPower: [txPower] = dBm; measurement of signal strength 

# drop: timestamp, connect, ss, pci (?)
# rename: rawTimesamp -> timestamp
ul.columns

Index(['timestamp', 'rawTimesamp', 'distance', 'lat', 'lon', 'alt', 'speed',
       'acc', 'dir', 'connected', 'rsrp', 'rsrq', 'sinr', 'cqi', 'ss', 'ta',
       'ci', 'pci', 'id', 'payload', 'throughput', 'rtt', 'txPower',
       'measurement', 'location'],
      dtype='object')

In [11]:
# Variablenbeschreibung

# - timestamp_ms ist ein Unix Timestamp, welcher die Umrechnung in eine konkrete Uhrzeit ermöglicht
# (siehe https://www.unixtimestamp.com/)
# - altitude_m die Höhe über Normalnull
# - veclocity ist in meter/s und acceleration in meter/s^2: Für die Beschleunigung sind negative Werte durch Bremsen 
# möglich (Änderung der Geschwindigkeit)
# - direction ist die Richtung in Grad, Norden entspricht 0 - isRegistered gibt an, ob das Gerät eine aktive 
# Verbindung zu einer LTE Zelle hat
# - rsrp ist ein Indikator für die Empfangsleistung und durch den Pfadverlust immer negativ – das heißt dann 
# einfach, dass nur sehr geringe Leistungsmengen beim Endgerät ankommen - rsrq und rssinr sind Verhältnisse von 
# Leistungen, auch hier sind negative werte möglich 
# (siehe auch https://www.cablefree.net/wirelesstechnology/4glte/rsrp-rsrq-measurement-lte/)
# - ss entspricht der Arbitrary Strength Unit (ASU) und ist redundant zum RSRP, da RSRP = ASU – 140
# - pci ist die Physical Cell Id, welche intern vom Endgerät verwendet wird, um Codierungsaufgaben zu machen 
# -> Sollte für euch nicht wichtig sein
# - payload entspricht der übertragenden Datenmenge in Megabyte
# - througput_mbits entspricht der Datenrate und somit der Zielgröße
# - connected 1 wenn eine Verbindung besteht 0 sonst
# - rtt_ms ist die Round Trip Time: Also die Signallaufzeit vom Sender zum Empfänger und wieder zurück
# - txPower_dBm entspricht der Sendeleistung des Endgerätes (somit auch nur im Uplink verfügbar, weil das 
# Endgerät im Downlink nur empfängt)
# - f_mhz entspricht der Trägerfrequenz der Basisstation, daher ist dies auch in wichtiges Feature, da die 
# Frequenz einen großen Einfluss auf die Funkausbreitungseigenschaften hat 
# - measurement ist die jeweilige Messfahrt

In [12]:
dl = dl.drop(['connected','timestamp'],axis=1)

In [13]:
ul = ul.drop(['connected','timestamp'],axis=1)

In [22]:
dl.head(3)

Unnamed: 0,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,rsrp,rsrq,sinr,cqi,ss,ta,ci,pci,id,payload,throughput,rtt,measurement,location
0,1544432937,99.42,51.490553,7.413966,157.63,11.83,0.0,79.35,-85,-5,22,10,50,7,26385408,95,0,0.1,6.83763,41,1544432927,campus
1,1544432949,237.43,51.490715,7.416002,152.41,10.76,-0.52,89.45,-84,-6,11,13,52,4,29391105,167,1,2.0,9.71463,58,1544432927,campus
2,1544432959,325.26,51.490668,7.417176,154.64,6.19,-0.62,27.05,-82,-6,21,15,54,4,29391105,167,2,2.0,7.30594,57,1544432927,campus


In [23]:
ul.head(3)

Unnamed: 0,rawTimesamp,distance,lat,lon,alt,speed,acc,dir,rsrp,rsrq,sinr,cqi,ss,ta,ci,pci,id,payload,throughput,rtt,txPower,measurement,location
0,1544432938,113.75,51.490592,7.414306,156.39,12.21,0.5,77.59,-85,-5,22,10,50,7,26385408,95,0,4.0,24.5211,35,12.3047,1544432927,campus
1,1544432948,231.4,51.490717,7.415835,152.53,11.17,0.0,87.24,-84,-6,11,13,52,4,29391105,167,1,2.0,14.8561,51,10.0156,1544432927,campus
2,1544432959,323.73,51.490634,7.417138,154.23,6.44,0.0,48.55,-82,-6,21,15,54,4,29391105,167,2,4.0,16.2684,57,4.34375,1544432927,campus


In [14]:
dl.describe().transpose()
# some statistical values have no meaningful interpretation!

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rawTimesamp,2141.0,1545881000.0,1395781.0,1544433000.0,1544680000.0,1545135000.0,1547616000.0,1547803000.0
distance,2141.0,4273.933,3774.43,99.42,1365.83,2825.78,6435.79,14124.02
lat,2141.0,51.47864,0.02252776,51.43714,51.46308,51.48165,51.49067,51.51791
lon,2141.0,7.445595,0.03079581,7.393019,7.416637,7.450149,7.473018,7.491755
alt,2141.0,177.2354,41.97569,125.06,152.0,163.7,181.92,297.82
speed,2141.0,12.1547,8.870322,0.0,6.86,12.45,14.99,39.34
acc,2141.0,0.00444652,1.123975,-11.05,0.0,0.0,0.0,17.26
dir,2141.0,164.8287,93.1786,0.03,92.3,150.27,250.64,359.98
rsrp,2141.0,-88.51985,9.314856,-113.0,-95.0,-89.0,-83.0,-56.0
rsrq,2141.0,-7.463802,2.091197,-19.0,-9.0,-7.0,-6.0,-4.0


In [15]:
ul.describe().transpose()
# some statistical values have no meaningful interpretation!

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rawTimesamp,2142.0,1545889000.0,1395255.0,1544433000.0,1544680000.0,1545135000.0,1547616000.0,1547803000.0
distance,2142.0,4282.102,3766.634,79.87,1378.81,2836.01,6444.035,14125.34
lat,2142.0,51.47852,0.02267911,51.43718,51.46261,51.48165,51.49066,51.52182
lon,2142.0,7.445505,0.03073957,7.392997,7.41664,7.450149,7.472981,7.491727
alt,2142.0,177.2895,41.98303,118.17,151.9175,163.77,182.185,297.16
speed,2142.0,12.33604,8.950635,0.0,7.1525,12.54,15.095,39.34
acc,2142.0,0.02055556,0.9230749,-6.03,0.0,0.0,0.0,13.04
dir,2142.0,165.6892,93.09316,0.04,91.34,151.375,251.1425,359.95
rsrp,2142.0,-88.36134,9.350568,-113.0,-95.0,-89.0,-83.0,-56.0
rsrq,2142.0,-7.44071,2.106458,-19.0,-9.0,-7.0,-6.0,-4.0


In [16]:
dl['id'].unique()
# array of vehicle IDs?

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  30,  31,  32,  29,  33,  34,  35,  36,  37,  38,
        39,  41,  42,  43,  44,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  57,  58,  59,  60,  62,  64,  65,  66,  67,  68,  69,
        70,  71,  40,  45,  56,  61,  63,  72,  73,  74,  75,  76,  78,
        79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  77,  85,  99, 100, 101, 102, 103])

In [17]:
ul['id'].unique()
# array of vehicle IDs?

array([  0,   1,   2,   4,   5,   6,   7,   8,  10,  11,  12,  13,  14,
        15,  16,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
        30,  31,  32,   3,   9,  17,  18,  34,  35,  36,  37,  40,  39,
        41,  42,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  57,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
        70,  71,  38,  43,  73,  56,  58,  33,  72,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  84,  85,  87,  88,  90,  91,  93,  94,
        95,  96,  97,  98,  99,  83,  86,  89,  92, 100, 101, 102, 103])

In [18]:
dl['id'].unique() == ul['id'].unique()
# note that the vehicles are not identical 

array([ True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True, False, False, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,  True,  True,  True])

In [19]:
dl['location'].unique()

array(['campus', 'highway', 'suburban', 'urban'], dtype=object)

In [20]:
dl['location'].unique() == ul['location'].unique()

array([ True,  True,  True,  True])

In [24]:
# len(dl['timestamp'].unique())
# we dropped this feature

In [25]:
len(dl['rawTimesamp'].unique())
# note that 'timestamp' and 'rawTimesamp' are differing amount of unique values

2138

In [29]:
len(dl)
# rawTimesamp cannot be index because it's not unique

2141

In [27]:
# len(ul['timestamp'].unique())
# we dropped this feature

In [28]:
len(ul['rawTimesamp'].unique())
# note that 'timestamp' and 'rawTimesamp' are differing amount of unique values

2140

In [30]:
len(ul)

2142

In [None]:
dl.equals(dl.drop_duplicates())
# no duplicates in download DataFrame

In [None]:
ul.equals(ul.drop_duplicates())
# no duplicates in upload DataFrame

In [None]:
dl.sort_values('rawTimesamp').head()

In [None]:
# pd.to_datetime(dl['rawTimesamp'],unit='s')
# pd.to_datetime(ul['rawTimesamp'],unit='s')
# change the timestamp format if neccessary

In [None]:
# apply filters to DataFrame? 

# sub-DataFrame per location with a filter or with isin() method
    # dl[dl['location']=='campus']
    # dl[dl['location']=='highway']
    # dl[dl['location']=='suburban']
    # dl[dl['location']=='urban']
    
    # ul[ul['location']=='campus']
    # ul[ul['location']=='highway']
    # ul[ul['location']=='suburban']
    # ul[ul['location']=='urban']

## Part 4

In [None]:
dl.corr()

In [None]:
ul.corr()

In [None]:
dl['location'].value_counts()
# good categorical data to group by

In [None]:
ul['location'].value_counts()
# good categorical data to group by

# is there more categorical data to group by? 

In [None]:
dl.duplicated()

In [None]:
dl.nlargest(5,'throughput')

In [None]:
dl.nsmallest(5,'throughput')

In [None]:
ul.nlargest(5,'throughput')

In [None]:
ul.nsmallest(5,'throughput')

## Part 5

In [None]:
# df.groupby(['model_year','cylinders']).describe().transpose()

In [None]:
dl['rawTimesamp']
# convert the rawTimesamp from an integer into a datetime object