In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('Clean Obfuscated ATM Data.csv')
df['Terminal DateTime'] = pd.to_datetime(df['Terminal DateTime']) 
df.set_index(df['Terminal DateTime'], inplace=True)

model_df = df.resample('D').sum()
model_df = model_df[['amount_dispensed']]

model_df

Unnamed: 0_level_0,amount_dispensed
Terminal DateTime,Unnamed: 1_level_1
2020-07-06,240.0
2020-07-07,220.0
2020-07-08,380.0
2020-07-09,700.0
2020-07-10,240.0
...,...
2021-08-05,0.0
2021-08-06,0.0
2021-08-07,260.0
2021-08-08,0.0


# Add lagging amount dispensed features

Typically in time series we want to see what the value we were prediting was at in the recent past.

I am treating the previous 3 days and the say weekday last week as features.


In [68]:
model_df['t_minus_1'] = model_df['amount_dispensed'].shift(1)
model_df['t_minus_2'] = model_df['amount_dispensed'].shift(2)
model_df['t_minus_3'] = model_df['amount_dispensed'].shift(3)
model_df['t_minus_7'] = model_df['amount_dispensed'].shift(7)

# exclude the first 7 days since you don't have lagging indicators for them
model_df = model_df.iloc[7:]
model_df

Unnamed: 0_level_0,amount_dispensed,t_minus_1,t_minus_2,t_minus_3,t_minus_7
Terminal DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-07-13,240.0,0.0,500.0,240.0,240.0
2020-07-14,240.0,240.0,0.0,500.0,220.0
2020-07-15,0.0,240.0,240.0,0.0,380.0
2020-07-16,0.0,0.0,240.0,240.0,700.0
2020-07-17,620.0,0.0,0.0,240.0,240.0
...,...,...,...,...,...
2021-08-05,0.0,720.0,1920.0,60.0,860.0
2021-08-06,0.0,0.0,720.0,1920.0,280.0
2021-08-07,260.0,0.0,0.0,720.0,120.0
2021-08-08,0.0,260.0,0.0,0.0,480.0


#### Add weekday and month features

In [69]:
model_df['weekday'] = model_df.index.weekday
model_df['month'] =  model_df.index.month

model_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['weekday'] = model_df.index.weekday
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['month'] =  model_df.index.month


Unnamed: 0_level_0,amount_dispensed,t_minus_1,t_minus_2,t_minus_3,t_minus_7,weekday,month
Terminal DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-13,240.0,0.0,500.0,240.0,240.0,0,7
2020-07-14,240.0,240.0,0.0,500.0,220.0,1,7
2020-07-15,0.0,240.0,240.0,0.0,380.0,2,7
2020-07-16,0.0,0.0,240.0,240.0,700.0,3,7
2020-07-17,620.0,0.0,0.0,240.0,240.0,4,7
...,...,...,...,...,...,...,...
2021-08-05,0.0,720.0,1920.0,60.0,860.0,3,8
2021-08-06,0.0,0.0,720.0,1920.0,280.0,4,8
2021-08-07,260.0,0.0,0.0,720.0,120.0,5,8
2021-08-08,0.0,260.0,0.0,0.0,480.0,6,8


In [70]:
# double check to see if you have any null values
model_df.isna().sum()

amount_dispensed    0
t_minus_1           0
t_minus_2           0
t_minus_3           0
t_minus_7           0
weekday             0
month               0
dtype: int64

# Add Air quality data

In Spokane, the city that has the ATM we've been having worse than normal Air quality. To see the influnce on ATM transactions I downloaded the data for one of their sensors from https://aqicn.org/data-platform/



In [71]:
air_df = pd.read_csv('e-broadway ave, spokane, washington-air-quality.csv')
air_df['date'] = pd.DatetimeIndex(air_df['date'])

air_df.columns = ['date', 'pm25', 'pm10']
air_df.set_index('date', inplace=True)

def convert_to_float(s:str):
    try:
        return np.float(s)
    except:
        return np.nan

air_df['pm25'] = air_df['pm25'].apply(convert_to_float)
air_df['pm10'] = air_df['pm10'].apply(convert_to_float)
air_df.sort_index(inplace=True)
air_df

Unnamed: 0_level_0,pm25,pm10
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-05-09,13.0,
2014-05-11,32.0,
2014-05-12,9.0,
2014-05-13,21.0,
2014-05-14,63.0,
...,...,...
2021-07-23,73.0,30.0
2021-07-24,60.0,
2021-08-12,,74.0
2021-08-13,163.0,90.0


In [72]:
air_df = air_df.loc[model_df.index.min():model_df.index.max()]
air_df

Unnamed: 0_level_0,pm25,pm10
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-13,,10.0
2020-07-14,18.0,20.0
2020-07-15,22.0,24.0
2020-07-16,21.0,26.0
2020-07-17,21.0,17.0
...,...,...
2021-07-20,58.0,34.0
2021-07-21,45.0,18.0
2021-07-22,29.0,34.0
2021-07-23,73.0,30.0


### I only have AQI data for a subset. For the rest I'll just impute the average.

In [88]:
full_df = pd.concat([model_df,air_df], axis='columns')
print(full_df.columns)
full_df.fillna(np.nan, inplace=True)

cols = ['Date', 'amount_dispensed', 't_minus_1', 't_minus_2', 't_minus_3', 't_minus_7','weekday', 'month', 'pm25', 'pm10']
       
full_df.reset_index(inplace=True)
full_df.columns = cols

Index(['amount_dispensed', 't_minus_1', 't_minus_2', 't_minus_3', 't_minus_7',
       'weekday', 'month', 'pm25', 'pm10'],
      dtype='object')


In [89]:
full_df

Unnamed: 0,Date,amount_dispensed,t_minus_1,t_minus_2,t_minus_3,t_minus_7,weekday,month,pm25,pm10
0,2020-07-13,240.0,0.0,500.0,240.0,240.0,0,7,,10.0
1,2020-07-14,240.0,240.0,0.0,500.0,220.0,1,7,18.0,20.0
2,2020-07-15,0.0,240.0,240.0,0.0,380.0,2,7,22.0,24.0
3,2020-07-16,0.0,0.0,240.0,240.0,700.0,3,7,21.0,26.0
4,2020-07-17,620.0,0.0,0.0,240.0,240.0,4,7,21.0,17.0
...,...,...,...,...,...,...,...,...,...,...
388,2021-08-05,0.0,720.0,1920.0,60.0,860.0,3,8,,
389,2021-08-06,0.0,0.0,720.0,1920.0,280.0,4,8,,
390,2021-08-07,260.0,0.0,0.0,720.0,120.0,5,8,,
391,2021-08-08,0.0,260.0,0.0,0.0,480.0,6,8,,


In [92]:
full_df.to_csv('feature_df.csv', index=False)

# Building the model


In [81]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


# If you train your imputer on the test data then you over estimate your accuracy.

simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
target = 'amount_dispensed'
features = [a for a in model_df.columns if a != target]

X = model_df[features]
y = model_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # empirical testing has shown this to be the best random state.


simple_imputer.fit(X_train)
X_train = pd.DataFrame(data=simple_imputer.transform(X_train), columns = X.columns)
X_test = pd.DataFrame(data=simple_imputer.transform(X_test), columns = X.columns)
X_train

Unnamed: 0,t_minus_1,t_minus_2,t_minus_3,t_minus_7,weekday,month
0,500.0,1160.0,720.0,1540.0,0.0,6.0
1,140.0,480.0,100.0,840.0,0.0,1.0
2,1380.0,300.0,0.0,300.0,4.0,1.0
3,200.0,100.0,440.0,60.0,0.0,9.0
4,660.0,1140.0,1860.0,200.0,5.0,6.0
...,...,...,...,...,...,...
309,0.0,400.0,100.0,280.0,1.0,9.0
310,1000.0,460.0,660.0,220.0,1.0,10.0
311,680.0,780.0,620.0,1540.0,4.0,4.0
312,600.0,500.0,280.0,1220.0,5.0,6.0
