In [None]:
# This notebook aims to extract general features and to check the quality of the dataset.
# As can be seen, the quality of the data is really good and all the values seem to be within a normal range

In [None]:
#LIBRARIES
import pyspark
import pyspark.sql.functions as f

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType
from pyspark.sql.functions import mean, min, max

import pandas as pd
import numpy as np

# Report the Spark version of the active context.
# `sc` is not created in this notebook; presumably it is the SparkContext
# injected by the PySpark kernel/shell -- verify for your environment.
spark_version = sc.version
print(spark_version)

In [None]:
#READ THE DATASET
#  The original file can be found in
#  https://www.kaggle.com/doit-intl/autotel-shared-car-locations
#
#  The first line of the file is used as the header. No schema is supplied and
#  schema inference is not enabled, so every column is loaded as a string.
df = spark.read.csv("sample_table.csv", header=True)

#Preview of the dataset
df.show()


In [None]:
#Code from https://stackoverflow.com/questions/35243744/get-specific-row-from-spark-dataframe

# Function to get rows at `rownums`
def getrows(df, rownums=None):
    """Return an RDD holding the rows of `df` located at the given positions.

    Positions are the indices assigned by `df.rdd.zipWithIndex()`
    (zero-based, following the RDD's partition order).

    Parameters
    ----------
    df : object exposing an `.rdd` attribute (e.g. a Spark DataFrame)
    rownums : iterable of int, optional
        Positions to keep. `None` (the default) is treated as an empty
        selection; previously it raised `TypeError` once the job ran.

    Returns
    -------
    The RDD produced by the zipWithIndex -> filter -> map pipeline. Nothing is
    evaluated until an action such as `.collect()` is called on it.
    """
    # Build the lookup once: O(1) membership per row instead of scanning a list,
    # and a safe empty selection when no positions were supplied.
    wanted = frozenset(rownums) if rownums is not None else frozenset()
    return (
        df.rdd.zipWithIndex()
        .filter(lambda pair: pair[1] in wanted)
        .map(lambda pair: pair[0])
    )

In [None]:
# Fetch and print the single row at position 2532087
# (positions are the indices assigned by zipWithIndex inside getrows).
row_positions = [2532087]
selected_rows = getrows(df, rownums=row_positions).collect()
print(selected_rows)

In [None]:
#INFORMATION ABOUT THE DATA SET

#Number of rows
total_rows = df.count()
print(f'The number of rows is: {total_rows}')

#Number of different time stamps: one output row per distinct timestamp value
timestamp_occurrences = f.count('timestamp').alias('count')
timestamps = df.groupBy('timestamp').agg(timestamp_occurrences)
distinct_timestamps = timestamps.count()
print(f'The number of different timestamps is: {distinct_timestamps}')

In [None]:
#Number of different cars
# Each `carsList` cell is parsed as a comma-separated list of car IDs that may
# be wrapped in square brackets and padded with spaces (presumably "[id1, id2]";
# verify against the dataset preview).
#
# NOTE: this notebook imports `min`/`max` from pyspark.sql.functions, which
# shadows the Python built-ins, so a plain comparison is used below.

#Bring the raw carsList strings to the driver, one entry per dataframe row
cars_list = df.select("carsList").rdd.flatMap(lambda x: x).collect()

max_size = 0          #Largest number of cars listed in a single row
car_IDs = []          #Final array of car IDs (distinct, in first-seen order)
seen_car_IDs = set()  #Same IDs as a set, for O(1) membership tests

for row in cars_list: #For each row of the spark dataframe
    if row is None:
        #A null cell holds no cars; it used to crash on .split()
        continue

    ids_in_row = []
    for element in row.split(','):
        #remove [] and spaces
        element = element.replace('[', '').replace(']', '').replace(' ', '')
        #An empty list such as "[]" leaves an empty token: that is not a car
        if element:
            ids_in_row.append(element)

    if len(ids_in_row) > max_size:
        max_size = len(ids_in_row)

    for element in ids_in_row:
        if element not in seen_car_IDs:
            seen_car_IDs.add(element)
            car_IDs.append(element)

print("The number of cars is: " + str(len(car_IDs)))
print("The maximum number of cars simultaneosuly in the same location is: "+str(max_size))

In [None]:
# Number of different locations
# A location is a distinct (latitude, longitude) pair. Selecting the two columns
# and de-duplicating gives the same number as the former
# groupby(...).count().distinct().count(), without the extra aggregation and the
# redundant distinct() over rows that were already unique per group.
number_different_locations = df.select('latitude', 'longitude').distinct().count()
print("The number of different locations (i.e., pairs of coordinates) is: "+str(number_different_locations))

In [None]:
#Statistics about the number of cars in each location
# Only the mean of `total_cars` is computed here.
mean_total_cars = mean("total_cars")
result = df.select(mean_total_cars)
result.show()

In [None]:
#Statistics about the locations
# The CSV is loaded without a schema, so latitude/longitude are string columns.
# min/max on strings compare lexicographically, which is only accidentally
# right for coordinates; cast to double so all three statistics are numeric.
# The aliases keep the column headers the un-cast version displayed.
# (`mean`, `min`, `max` here are the pyspark.sql.functions imports.)
result_lat = df.select(
    mean(f.col("latitude").cast("double")).alias("avg(latitude)"),
    min(f.col("latitude").cast("double")).alias("min(latitude)"),
    max(f.col("latitude").cast("double")).alias("max(latitude)"),
)
result_lat.show()

result_long = df.select(
    mean(f.col("longitude").cast("double")).alias("avg(longitude)"),
    min(f.col("longitude").cast("double")).alias("min(longitude)"),
    max(f.col("longitude").cast("double")).alias("max(longitude)"),
)
result_long.show()

In [None]:
#Check for empty cells
# The previous predicate "'latitude' == ''" compared the string literal
# 'latitude' with '' (single quotes denote literals in Spark SQL), so it was
# always false and every count was 0 regardless of the data.
# A cell is counted as empty when it is null (how the CSV reader normally
# represents an empty field -- confirm for your Spark version) or when it
# contains only whitespace.
for column_name in ("latitude", "longitude", "total_cars"):
    cell = f.col(column_name)
    empty_cells = df.filter(cell.isNull() | (f.trim(cell) == "")).count()
    print("The number of empty cells in " + column_name + " columns is: " + str(empty_cells))