# Import Dependencies

**Note:** PySpark has its own [implementation of the pandas API](https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/frame.html), which provides the same functionality using distributed computing on clusters under the hood. There are some differences, but the methods implemented follow the PySpark documentation.

In [76]:
# import pyspark.pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import os

# Load Data

In [30]:
# Both input CSVs are read from the "data" folder next to the working directory.
path_to_data = os.path.join('.', "data")

# Ride counts: the first column is parsed as dates and used as the index.
rides_df = pd.read_csv(
    os.path.join(path_to_data, 'rides_by_date.csv'),
    index_col=0,
    parse_dates=[0],
)

# Gas prices: same index handling, but only the first 52 data rows are read
# (presumably one row per week of 2014 -- confirm against the raw file).
gas_df = pd.read_csv(
    os.path.join(path_to_data, 'motor-gas-prices-2014.csv'),
    index_col=0,
    nrows=52,
    parse_dates=[0],
)

In [31]:
# Print the index to confirm the dates were parsed into a DatetimeIndex,
# then preview the first five rows of the ride data.
print(rides_df.index)
rides_df.head(5)

DatetimeIndex(['2014-04-01', '2014-04-02', '2014-04-03', '2014-04-04',
               '2014-04-05', '2014-04-06', '2014-04-07', '2014-04-08',
               '2014-04-09', '2014-04-10',
               ...
               '2014-09-21', '2014-09-22', '2014-09-23', '2014-09-24',
               '2014-09-25', '2014-09-26', '2014-09-27', '2014-09-28',
               '2014-09-29', '2014-09-30'],
              dtype='datetime64[ns]', length=183, freq=None)


Unnamed: 0,Total Rides,7-Day Average Rides
2014-04-01,14546,14546.0
2014-04-02,17474,16010.0
2014-04-03,20701,17573.666667
2014-04-04,26714,19858.75
2014-04-05,19521,19791.2


In [33]:
# Print the index to confirm the dates were parsed into a DatetimeIndex,
# then preview the first five rows of the gas price data.
print(gas_df.index)
gas_df.head(5)

DatetimeIndex(['2014-01-06', '2014-01-13', '2014-01-20', '2014-01-27',
               '2014-02-03', '2014-02-10', '2014-02-17', '2014-02-24',
               '2014-03-03', '2014-03-10', '2014-03-17', '2014-03-24',
               '2014-03-31', '2014-04-07', '2014-04-14', '2014-04-21',
               '2014-04-28', '2014-05-05', '2014-05-12', '2014-05-19',
               '2014-05-26', '2014-06-02', '2014-06-09', '2014-06-16',
               '2014-06-23', '2014-06-30', '2014-07-07', '2014-07-14',
               '2014-07-21', '2014-07-28', '2014-08-04', '2014-08-11',
               '2014-08-18', '2014-08-25', '2014-09-01', '2014-09-08',
               '2014-09-15', '2014-09-22', '2014-09-29', '2014-10-06',
               '2014-10-13', '2014-10-20', '2014-10-27', '2014-11-03',
               '2014-11-10', '2014-11-17', '2014-11-24', '2014-12-01',
               '2014-12-08', '2014-12-15', '2014-12-22', '2014-12-29'],
              dtype='datetime64[ns]', freq=None)


Unnamed: 0,Statewide,Upstate,Downstate,NYC
2014-01-06,369.0,368.3,369.7,353.9
2014-01-13,364.4,366.1,362.9,349.6
2014-01-20,361.4,364.3,358.8,344.5
2014-01-27,359.6,362.5,356.9,342.8
2014-02-03,358.8,361.9,356.1,341.5


# Combine Data

In [37]:
# Attach the NYC gas price column to the ride data by matching on the date index.
# The inner join keeps only dates that appear in BOTH frames, so ride rows with
# no gas-price entry for that date are dropped from the result.
# Bracket indexing is used instead of .get(): .get() returns None when the
# column is missing, which would surface as an obscure failure inside join(),
# whereas ['NYC'] raises a clear KeyError at the point of selection.
combined = rides_df.join(gas_df['NYC'], how='inner')
combined.head()

Unnamed: 0,Total Rides,7-Day Average Rides,NYC
2014-04-07,19550,18850.142857,359.4
2014-04-14,12674,16635.428571,360.3
2014-04-21,13162,16598.0,368.1
2014-04-28,15475,20120.714286,377.5
2014-05-05,17859,22958.142857,375.3


In [79]:
# Persist the joined rides/gas table alongside the input data (index included).
combined.to_csv(path_or_buf=os.path.join(path_to_data, 'rides_to_gas.csv'))