In [None]:
# This notebook prepares the data for the subsequent model training. New features that could be useful
# for training are added. For example, in the dataframe that records the number of cars at a particular
# location, the timestamps are split into year, season, month, day, hour and minute so that the machine
# learning algorithm can take advantage of them.


In [None]:
# LIBRARIES

import pyspark
import pyspark.sql.functions as f

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType

import pandas as pd

import plotly.offline as py
import plotly.graph_objs as go

import numpy as np
from numpy import savetxt

import matplotlib.pyplot as plt

import csv

from scipy.stats import kde

import seaborn as sns

import time
from datetime import datetime

from sklearn.preprocessing import OneHotEncoder

import os 
# Current working directory of the notebook process; used further down to build
# the path under which the VectorAssembler is saved.
dirpath = os.getcwd()

# `sc` is not defined anywhere in this notebook -- presumably it is the
# SparkContext injected by the PySpark kernel (TODO confirm). If it is missing,
# this line raises NameError before any data is touched.
print(sc.version)

In [None]:

# FEATURE ENGINEERING FOR THE MACHINE LEARNING ALGORITHM
#
# USE CASE 2: predict the number of cars at a particular location.
# Each timestamp is broken down into minute, hour, day, month, season and year
# columns so that the learning algorithm can exploit calendar patterns.

df_location = spark.read.load("df_location.parquet")
df_location_pandas = df_location.toPandas()

# Parse every timestamp string exactly once; all calendar features below are
# derived from these datetime objects.
timestamp_format = '%Y-%m-%d %H:%M:%S.%f %Z'
parsed_timestamps = [
    datetime.strptime(raw_timestamp, timestamp_format)
    for raw_timestamp in df_location_pandas['timestamp'].tolist()
]

# (column name, values) pairs, listed in the order the columns must be appended.
# "season" is the quarter of the year: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
calendar_features = [
    ("minute", [stamp.minute for stamp in parsed_timestamps]),
    ("hour", [stamp.hour for stamp in parsed_timestamps]),
    ("day", [stamp.day for stamp in parsed_timestamps]),
    ("month", [stamp.month for stamp in parsed_timestamps]),
    ("season", [(stamp.month - 1) // 3 + 1 for stamp in parsed_timestamps]),
    ("year", [stamp.year for stamp in parsed_timestamps]),
]
for column_name, column_values in calendar_features:
    df_location_pandas[column_name] = column_values

# Integer copy of the car count, kept next to the original `total_cars` column.
df_location_pandas["total_cars_int"] = df_location_pandas["total_cars"].astype(int)

# Back to Spark so the expanded frame can be used by Spark ML and saved as parquet.
df_location_expanded = spark.createDataFrame(df_location_pandas)
df_location_expanded.show()


In [None]:
# Statistics: pairwise Pearson correlations, just to get a first idea of how the
# calendar features relate to the car count.
# - The result is printed explicitly: it is not the last expression of the cell,
#   so a bare expression here would be evaluated and silently discarded.
# - Only numeric columns are kept: `timestamp` holds strings, and recent pandas
#   versions raise on non-numeric columns instead of skipping them.
print(df_location_pandas.select_dtypes(include="number").corr(method='pearson'))

# Preparation of the dataframe for training: the six calendar columns are packed
# into a single vector column named "features".
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols=['minute','hour','day','month','season','year'],
                                  outputCol="features")

df_location_ml = vectorAssembler.transform(df_location_expanded)

df_location_ml.show()

In [None]:
# Saving the assembler and the dataframes for later use.
# All three writers are put in overwrite mode: by default Spark raises when the
# target path already exists, which made this cell fail on every run of the
# notebook after the first one.
# NOTE: a later cell rebinds `df_location_expanded` to a read of
# "df_location_expanded.parquet"; do not re-run this cell after that one, or the
# dataframe would be overwriting the very files it is lazily read from.
vectorAssembler.write().overwrite().save(dirpath + "/vector_assembler")

df_location_expanded.write.format("parquet").mode("overwrite").save("df_location_expanded.parquet")
df_location_ml.write.format("parquet").mode("overwrite").save("df_location_ml.parquet")

In [None]:
# FEATURE ENGINEERING FOR THE DEEP LEARNING ALGORITHM
#
# Builds NumPy train/test matrices from the expanded location dataframe:
# one-hot encodes the season, splits into training and test sets, standardises
# the features and writes every array to a CSV file.
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df_location_expanded = spark.read.load("df_location_expanded.parquet")

df_pandas = df_location_expanded.toPandas()
df_pandas = df_pandas.sort_values(by='timestamp', ascending=True)
df_pandas = df_pandas.reset_index(drop=True)
df = df_pandas.drop(['timestamp', 'total_cars'], axis=1)

# Select features and target by name rather than by position, so the result does
# not depend on how many columns the source dataframe happens to carry.
feature_columns = ['minute', 'hour', 'day', 'month', 'season', 'year']
target_column = 'total_cars_int'

X = df[feature_columns].values
y = df[target_column].values

# Dummy variables for season (because fall is not "more" than spring).
# OneHotEncoder(categorical_features=...) no longer exists in current
# scikit-learn, so the season column is encoded through a ColumnTransformer.
# Output layout is the same as before: one-hot season columns first, then the
# remaining features in their original order. sparse_threshold=0 forces a dense
# ndarray, replacing the former `.toarray()` call.
season_index = feature_columns.index('season')
onehotencoder = ColumnTransformer(
    [('season', OneHotEncoder(categories='auto'), [season_index])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = onehotencoder.fit_transform(X)

# Splitting the dataset into the training set and test set.
# A fixed seed makes the split (and therefore the saved CSV files) reproducible.
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_SEED
)

# Feature scaling. The scaler is fitted on the training set only and then
# applied to the test set. It is deliberately NOT named `sc`: that name is the
# SparkContext used by the notebook's first cell and must not be overwritten.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

# Saving X_train, X_test, y_train, y_test, X_train_sc and X_test_sc to csv files
savetxt('X_train.csv', X_train, delimiter=',')
savetxt('X_test.csv', X_test, delimiter=',')
savetxt('y_train.csv', y_train, delimiter=',')
savetxt('y_test.csv', y_test, delimiter=',')
savetxt('X_train_sc.csv', X_train_sc, delimiter=',')
savetxt('X_test_sc.csv', X_test_sc, delimiter=',')
