In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymongo as pm
import pprint
from enum import Enum
from datetime import datetime, timedelta
import pytz

In [2]:

client = pm.MongoClient('bigdatadb.polito.it',                     
                        ssl=True,                     
                        authSource = 'carsharing',                     
                        username = 'ictts',                     
                        password ='Ict4SM22!',                     
                        tlsAllowInvalidCertificates=True) 
db = client['carsharing'] 

#Choose the DB to use 
permenant_booking = db['PermanentBookings']
permenant_parking = db['PermanentParkings']
enjoy_permenant_booking = db['enjoy_PermanentBookings']
enjoy_permenant_parking = db['enjoy_PermanentParkings']

#ENUM of cities
class CITY_ENUM(Enum):
    TO = 'Torino'
    SEA = 'Seattle'
    STU = 'Stuttgart'
class CITY_TIMEZONES(Enum):
    TO = 'Europe/Rome'
    SEA = 'America/Los_Angeles'
    STU = 'Europe/Berlin'

# def get_start_end_unix_zone(timezone):
#     # start_timestamp = datetime(2018, 1, 1,0,0,0,0, pytz.timezone(timezone)).timestamp()
#     # end_timestamp  = datetime(2018, 1, 31,23,59,59,0, pytz.timezone(timezone)).timestamp()
#     return start_timestamp,end_timestamp    

#date starts from 01/01/2018 to 31/01/2018 1514761200 - 1517353200
start_unix_time = datetime.strptime("27/12/2017", "%d/%m/%Y").timestamp()
end_unix_time = datetime.strptime("27/01/2018", "%d/%m/%Y").timestamp()

In [3]:
#pipeline for getting the data for the rentals with the filteration of the data
#too short and too long rentals are filtered out
#considered if car is moved
def filter_pipeline(city,start_unix_time,end_unix_time):
    return [
    {
        '$match': {
            'city': city,
            'init_time': {
                '$gte': start_unix_time,
                '$lt': end_unix_time
            },
            'final_time': {
                '$gte': start_unix_time,
                '$lt': end_unix_time
            }
        }
    },
    {
        '$project': {
            '_id': 0,
            'duration': {
                '$divide': [
                    { '$subtract': ['$final_time', '$init_time'] },
                    60  # Divide by 60 to convert seconds to minutes
                ]
            },
            'day': {'$dayOfMonth': '$init_date'},
            'hour': {'$hour': '$init_date'},
            'date': {
                '$dateToString': {
                    'format': '%Y-%m-%d',
                    'date': '$init_date'
                }
            },
            'moved': {
                '$ne':[
                    {"$arrayElemAt": [ "$origin_destination.coordinates", 0]},
                    {"$arrayElemAt": [ "$origin_destination.coordinates", 1]}
                 ]
            }
        }
    },
    {
        '$match': {
            'moved': True,
            'duration':{'$gt':5, '$lt':180},
                
        }
    },
    {
        '$group':{
            '_id': {'day': '$day', 'hour': '$hour', 'date': '$date'},
            'total_count': {'$sum': 1},
        }
    },
    {
        '$sort': {
            '_id': 1,
        }
    },
]


### Getting the data from Database

In [4]:
TO_Data = list(enjoy_permenant_booking.aggregate(filter_pipeline(CITY_ENUM.TO.value,
          start_unix_time,end_unix_time)))
SEA_Data = list(permenant_booking.aggregate(filter_pipeline(CITY_ENUM.SEA.value,
          start_unix_time,end_unix_time)))
STU_Data = list(permenant_booking.aggregate(filter_pipeline(CITY_ENUM.STU.value,
          start_unix_time,end_unix_time)))
cities_data_array = [(CITY_ENUM.TO.value,TO_Data),(CITY_ENUM.SEA.value,SEA_Data),(CITY_ENUM.STU.value,STU_Data)]

#### checking if there are missing records - lwngth must be 744

In [1060]:
print("TO_Data",len(TO_Data))
print("SEA_Data",len(SEA_Data))
print("STU_Data",len(STU_Data))

TO_Data 744
SEA_Data 744
STU_Data 744


##### Dropping the _id col and flattening the data

In [5]:
def dfModifier(city_list):
  df = pd.DataFrame(city_list, columns =['_id', 'total_count'])
  df['date'] = df['_id'].apply(lambda x: x['date'])
  df['day'] = df['_id'].apply(lambda x: x['day'])
  df['hour'] = df['_id'].apply(lambda x: x['hour'])
  df['myIndex'] = (df['day']-1)*24 + (df['hour']+1)
  df.drop(['_id'], axis=1, inplace=True)
  return df
#day | hour
#1   | 0 -> day*24 + hour => 1*24 + 0 = 24
#1   | 1 -> day*24 + hour => 1*24 + 1 = 25
#1   | 2 -> day*24 + hour => 1*24 + 2 = 26
#day | hour
#0   | 1 -> day*24 + hour => 0*24 + 1 = 1
#0   | 2 -> day*24 + hour => 0*24 + 2 = 2
#0   | 3 -> day*24 + hour => 0*24 + 3 = 3

TO_df = dfModifier(TO_Data)
SEA_df = dfModifier(SEA_Data)
STU_df = dfModifier(STU_Data)
cities_df_array = [(CITY_ENUM.TO.value,TO_df),(CITY_ENUM.SEA.value,SEA_df),(CITY_ENUM.STU.value,STU_df)]


In [6]:
# calculating the avg for each hour of the day

TO_hourly_avg = TO_df.groupby('hour')['total_count'].mean().round().reset_index().astype(int)['total_count'].tolist()
SEA_hourly_avg = SEA_df.groupby('hour')['total_count'].mean().round().reset_index().astype(int)['total_count'].tolist()
STU_hourly_avg = STU_df.groupby('hour')['total_count'].mean().round().reset_index().astype(int)['total_count'].tolist()

# print("TO_hourly_avg",TO_hourly_avg)
# print("SEA_hourly_avg",SEA_hourly_avg)
# print("STU_hourly_avg",STU_hourly_avg)


In [1063]:
print("TO_hourly_avg",TO_hourly_avg)

TO_hourly_avg [67, 42, 27, 17, 17, 23, 39, 68, 99, 85, 75, 74, 91, 91, 97, 101, 101, 110, 118, 118, 110, 94, 85, 80]


##### finding the mising values and filling them with Mean of Col

In [7]:
def fillMissingValues(df:pd.DataFrame, avg_df):
  missingValues=set(np.arange(1,31*24+1)).difference(set(df['myIndex']))
  # dfMean = round(np.mean(df['total_count']))
  print("Missing values are:", len(missingValues), missingValues)
  df2 = df
  for value in missingValues:
    dayOfValue = int((value-1)/24)+1
    hourOfValue = (value-1)%24
    new_row = pd.DataFrame({'total_count':avg_df[hourOfValue],'date':f'2018-01-{dayOfValue:02d}',
                            'day':dayOfValue,'hour':hourOfValue,'myIndex':value}, index =[0])
    df2 = pd.concat([new_row,df2.loc[:]]).reset_index(drop = True)
  df2.sort_values(by=['myIndex'], inplace=True)
  return df2

To_FilledValues = fillMissingValues(TO_df, TO_hourly_avg)
SEA_FilledValues = fillMissingValues(SEA_df,SEA_hourly_avg)
STU_FilledValues = fillMissingValues(STU_df,SEA_hourly_avg)

Missing values are: 0 set()
Missing values are: 0 set()
Missing values are: 0 set()


In [8]:
print("To_FilledValues",len(To_FilledValues))
print("SEA_FilledValues",len(SEA_FilledValues))
print("STU_FilledValues",len(STU_FilledValues), STU_FilledValues[743:745])

To_FilledValues 744
SEA_FilledValues 744
STU_FilledValues 744      total_count        date  day  hour  myIndex
743           43  2017-12-31   31    23      744


#### Plotting the data and Rolling mean and std to check if the time series is stationary

In [1066]:
def plotter(plotTitle, df:pd.DataFrame):
    mean = df['total_count'].rolling(window=24*7).mean()
    std = df['total_count'].rolling(window=24*7).std()
    plt.figure(figsize=(14, 6))
    plt.plot(df['myIndex'], mean, label='Rolling Mean', color='red')
    plt.plot(df['myIndex'], std, label='Rolling Std', color='green')
    plt.plot()
    plt.plot(df['myIndex'], df['total_count'], label='Rental', color='blue')
    plt.xlabel('Date')
    plt.ylabel('Total Count')
    plt.legend()
    plt.grid(True)
    plt.title(f'Total Counts in Dates and Hours in - {plotTitle}')
    plt.grid(True)
    plt.savefig(f'{plotTitle}-Roolings-mean-std')
    plt.clf()

In [1067]:
plotter('Torino',To_FilledValues)
plotter('Seattle',SEA_FilledValues)
plotter('Stuttgart',STU_FilledValues)

<Figure size 1400x600 with 0 Axes>

<Figure size 1400x600 with 0 Axes>

<Figure size 1400x600 with 0 Axes>

In [9]:
cleanFilledCities = [(CITY_ENUM.TO.value,To_FilledValues),(CITY_ENUM.SEA.value,SEA_FilledValues),(CITY_ENUM.STU.value,STU_FilledValues)]

## Computing the ACF and PACF

##### ACF Figure

In [1069]:
from statsmodels.tsa.stattools import acf,pacf
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf

# Use ACF to find q.
# Use PACF to find p.

def ACF_PACF(city_data):
  # plot acf
  plt.figure(figsize=(6,4))
  plot_acf(city_data[1]["total_count"], lags=48)
  plt.title(f'Autocorrelation Function 48 Hours - {city_data[0]}')
  plt.xlabel('Lags')
  plt.grid(True)
  # plt.show()
  plt.savefig(f'{city_data[0]}-ACF')

  # plot pacf
  plt.figure(figsize=(6,4))
  plot_pacf(city_data[1]["total_count"], lags=48)
  plt.title(f'Partial Autocorrelation Function 48 Hours - {city_data[0]}')
  plt.xlabel('Lags')
  plt.grid(True)
  # plt.show()
  plt.savefig(f'{city_data[0]}-PACF')
  


In [12]:
for city_data in cities_df_array:
  ACF_PACF(city_data)

### ARIMA Model and Prediction

In [13]:
for city in cleanFilledCities:
  print(city[0], len(city[1]))

In [18]:
from statsmodels.tsa.arima.model import ARIMA
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

q = 4
p = 2
d = 0
train_size = 24 * 7 * 2 # 24 * 7 * 3 # 3 weeks -> we will change this to 14 days
test_size = 24 * 3 # 10 days -> this takes too long to run so we will use 72 hours
myModel = None
def Predictor(cleanCity):
  originalData = list(cleanCity[1]['total_count'][:train_size])#.tolist()
  y_hat = [None for _ in range(train_size)] # should it be a list or pandas array?
  for record in range(train_size,train_size+test_size):
    model = ARIMA(originalData, order=(p,d,q))
    model_fit = model.fit()
    prediction = int(model_fit.forecast()[0])
    # print(f'Prediction for {cleanCity[0]} at {record} is {prediction}')
    y_hat.append(prediction)
    originalData.append(cleanCity[1]['total_count'][record])
    originalData = originalData[1:]
    myModel = model_fit

  plt.figure(figsize=(15,5))
  plt.title("Predicted values vs Real values")
  plt.plot(list(cleanCity[1]['total_count'][train_size:train_size+test_size]), color='blue', label="Real values")
  plt.plot(list(y_hat[train_size: train_size+test_size]), color='red', label="Predicted values")
  plt.legend()
  plt.xlabel("Lags")
  plt.ylabel("Rentals")
  plt.grid(True)
  plt.savefig(f'2 day prediction {cleanCity[0]}')
  plt.clf()

  # plot residual errors
  residuals = pd.DataFrame(myModel.resid)
  residuals.plot()
  plt.title(f'Residuals - {cleanCity[0]}')
  plt.xlabel("Residual Error")
  plt.ylabel("Residuals")
  plt.grid(True)
  plt.savefig(f'2 day Residuals {cleanCity[0]}')
  plt.clf()

  residuals.plot(kind='kde')
  plt.title(f'Density of Residuals - {cleanCity[0]}')
  plt.xlabel("Residual Error")
  plt.ylabel("Density")
  plt.grid(True)
  plt.savefig(f'2 day Density of Residuals {cleanCity[0]}')
  plt.clf()
  return y_hat, model_fit


In [14]:
comparisonArray = []
for city_data in cleanFilledCities:
  print(city_data[0])
  y_hat, model_fit = Predictor(city_data)
  y_hat = y_hat[train_size:train_size+test_size]
  comparisonArray.append((city_data[0],city_data[1]['total_count'][train_size:train_size+test_size], y_hat, model_fit))

In [1087]:
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_percentage_error

for item in comparisonArray:
  mse = mean_squared_error(item[1], item[2])
  rmse = np.sqrt(mse)
  r2 = r2_score(item[1], item[2])
  mape = mean_absolute_percentage_error(item[1], item[2])
  print(f'{item[0]} -> MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2: {r2:.2f}, MAPE: {mape:.2f}')

Torino -> MSE: 365.86, RMSE: 19.13, R2: 0.77, MAPE: 0.32
Seattle -> MSE: 373.31, RMSE: 19.32, R2: 0.86, MAPE: 0.38
Stuttgart -> MSE: 526.42, RMSE: 22.94, R2: 0.75, MAPE: 0.43


In [None]:
# Torino -> MSE: 365.86, RMSE: 19.13, R2: 0.77, MAPE: 0.32
# Seattle -> MSE: 373.31, RMSE: 19.32, R2: 0.86, MAPE: 0.38
# Stuttgart -> MSE: 526.42, RMSE: 22.94, R2: 0.75, MAPE: 0.43

### Variant values for p d q

In [20]:
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_percentage_error
p = [1,2,3,4,5,6]
q = [1,2,3,4]
d = 0
finalValues = []

for cleanCity in cleanFilledCities:
  for i in p:
    for j in q:
      worked = False
      originalData = list(cleanCity[1]['total_count'][:train_size])#.tolist()
      y_hat = [None for _ in range(train_size)] # should it be a list or pandas array?
      for record in range(train_size,train_size+test_size):
        try:
          model = ARIMA(originalData, order=(i,d,j))
          model_fit = model.fit()
          prediction = int(model_fit.forecast()[0])
          # print(f'Prediction for {cleanCity[0]} at {record} is {prediction}')
          y_hat.append(prediction) #shoudl it be int(prediction) or prediction as a float
          originalData.append(cleanCity[1]['total_count'][record])
          originalData = originalData[1:]
          worked = True
        except:
          print("error")
          worked = False
          continue
      if worked:
        actual_values = cleanCity[1]['total_count'][train_size:train_size+test_size]
        prediction_values = y_hat[train_size:train_size+test_size]
        mse = mean_squared_error(actual_values, prediction_values)
        rmse = np.sqrt(mse)
        r2 = r2_score(actual_values, prediction_values)
        mape = mean_absolute_percentage_error(actual_values, prediction_values)
        finalValues.append((cleanCity[0],i,j,mse,rmse,r2,mape))
      elif not worked:
        finalValues.append((cleanCity[0],i,j,0,0,0,0))


  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error
error
error
error
error
error


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error
error
error
error
error
error


  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


In [46]:
from itertools import groupby
import seaborn as sb

# print(finalValues)
dont_touch_this = finalValues
dont = finalValues
touchthis = finalValues

touchThat = pd.DataFrame(touchthis, columns=['city','p','q','mse','rmse','r2','mape'])
# touchThat.to_csv('ARMA_RESULT.csv', index=False)
#group by city
touchThat = touchThat.groupby('city')

for name, group in touchThat:
    mmmmm = group
    mmmmm.loc[mmmmm['mape'] > 1, 'mape'] = None
    mapeList = mmmmm['mape'].replace(0, None).tolist()
    mapPD = pd.DataFrame(mapeList)
    MAPE2d = mapPD.values.reshape(6,4)
    print(group)
    # print(MAPE2d)
    sb.heatmap(MAPE2d, annot=True, cmap="YlGnBu", fmt=".4f", linewidths=.5,
            xticklabels=[1,2,3,4], yticklabels=[1,2,3,4,5,6])
    plt.title(f'MAPE - {name}')
    plt.xlabel("q")
    plt.ylabel("p")
    # plt.show()
    plt.savefig(f'MAPE - {name}')
    plt.clf()

# print(list(group.columns))
# Seattle        [Seattle]
# Stuttgart    [Stuttgart]
# Torino          [Torino]

#seattle -> p=3, q=3
#stuttgart -> p=2, q=2
#torino -> p=4 , q=3 | p=3, q=4


       city  p  q         mse       rmse        r2      mape
24  Seattle  1  1  436.013889  20.880946  0.834188  0.413454
25  Seattle  1  2  417.791667  20.439953  0.841117  0.418309
26  Seattle  1  3  396.375000  19.909169  0.849262  0.411125
27  Seattle  1  4  400.055556  20.001389  0.847862  0.416211
28  Seattle  2  1  352.194444  18.766844  0.866063  0.349057
29  Seattle  2  2  348.458333  18.667039  0.867484  0.360136
30  Seattle  2  3  351.805556  18.756480  0.866211  0.360097
31  Seattle  2  4  373.305556  19.321117  0.858035  0.377191
32  Seattle  3  1  345.888889  18.598088  0.868461  0.361333
33  Seattle  3  2  353.611111  18.804550  0.865525  0.358623
34  Seattle  3  3  304.763889  17.457488  0.884101  0.323147
35  Seattle  3  4  351.402778  18.745740  0.866364  0.366698
36  Seattle  4  1  603.583333  24.567933  0.770462  0.384295
37  Seattle  4  2    0.000000   0.000000  0.000000  0.000000
38  Seattle  4  3  363.777778  19.072959  0.861658  0.342199
39  Seattle  4  4  345.0

<Figure size 640x480 with 0 Axes>

In [55]:

different_train_sizes = [24*7, 24*7*2, 24*7*3]
bestValues = [(CITY_ENUM.TO.value,cleanFilledCities[0][1]['total_count'], 3, 2),
              (CITY_ENUM.SEA.value,cleanFilledCities[1][1]['total_count'], 3, 3),
              (CITY_ENUM.STU.value,cleanFilledCities[2][1]['total_count'], 2, 2)]
testSize = 72
strategies = ["expand", "slide"]

finalResults = []
def NVariant(v):
  for trainSize in different_train_sizes:
    trainSamples = cleanCity[1][:trainSize]
    testSamples = cleanCity[1][trainSize:trainSize+testSize]
    p = cleanCity[2]
    q = cleanCity[3]
    d = 0
    y_hat = [None for _ in range(trainSize)]
    result = {}
    for strategy in strategies:
      originalData = list(trainSamples)
      for record in range(testSize):
        worked = False
        try:
          model = ARIMA(originalData, order=(p,d,q))
          model_fit = model.fit()
          prediction = model_fit.forecast()[0]
          y_hat.append(prediction)
          originalData.append(originalData[record])
          worked = True
          if strategy == "slide":
            originalData = originalData[1:]
            worked = True
        except:
          worked = False
          continue
      if worked:
        # result["strategy"] = strategy
        # result["y_hat"] = y_hat
        # result["p"] = p
        # result["q"] = q
        # result["d"] = d
        # result["trainSize"] = trainSize
        # result["testSize"] = testSize
        # result["mse"] = mean_squared_error(testSamples, y_hat[trainSize:trainSize+testSize])
        # result["rmse"] = np.sqrt(result["mse"])
        # result["r2"] = r2_score(testSamples, y_hat[trainSize:trainSize+testSize])
        # result["mape"] = mean_absolute_percentage_error(testSamples, y_hat[trainSize:trainSize+testSize])
        finalResults.append((cleanCity[0],strategy, y_hat, p, q, d, trainSize, testSize,
                             mean_squared_error(testSamples, y_hat[trainSize:trainSize+testSize]),
                             np.sqrt(mean_squared_error(testSamples, y_hat[trainSize:trainSize+testSize])),
                             r2_score(testSamples, y_hat[trainSize:trainSize+testSize]),
                             mean_absolute_percentage_error(testSamples, y_hat[trainSize:trainSize+testSize])))
      elif not worked:
        # result["strategy"] = strategy
        # result["y_hat"] = y_hat
        # result["p"] = p
        # result["q"] = q
        # result["d"] = d
        # result["trainSize"] = trainSize
        # result["testSize"] = testSize
        # result["mse"] = None
        # result["rmse"] = np.sqrt(result["mse"])
        # result["r2"] = None
        # result["mape"] = None
        finalResults.append((cleanCity[0],strategy, y_hat, p, q, d, trainSize, testSize,
                             None,
                             None,
                             None,
                             None))
       




In [56]:
for cleanCity in bestValues:
  NVariant(cleanCity)

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'


In [99]:
dont_do_this = finalResults
dont = finalResults
do_this = finalResults

nvarpd = pd.DataFrame(finalResults, columns=['city','strategy','y_hat','p','q','d','trainSize','testSize','mse','rmse','r2','mape'])
nvarpd.to_csv('ARMA_NVARIANT_RESULT.csv', index=False)

# print(do_this)
# navrient_df = pd.DataFrame(do_this, columns=['city','strategy','y_hat','p','q','d','trainSize','testSize','mse','rmse','r2','mape'])
# # print(navrient_df)
# grouped_nvarient_df = navrient_df.groupby(['city','strategy'])
# # grouped_nvarient_df = navrient_df.groupby(['city'])
# for name, group in grouped_nvarient_df:
#   print(name)
#   print(group['r2'].tolist())




# for name, group in grouped_nvarient_df:
#   print(name)
#   print(group['mape'])
  # start_group = group.groupby(['strategy'])
  # print(start_group)
  # print(start_group['mape'].tolist())
  # slide_list = start_group.get_group('slide')['mape'].tolist()
  # expand_list = start_group.get_group('expand')['mape'].tolist()
  # # print("slide_list",slide_list)
  # # print("expand_list",expand_list)
  # final_list = []
  # final_list.append(slide_list)
  # final_list.append(expand_list)
  # print(final_list)
  # sb.heatmap(final_list, annot=True, cmap="YlGnBu", fmt=".4f", linewidths=.5)
  # plt.show()



  # mapeItems = group['mape'].tolist()
  # # Create a DataFrame for the heatmap
  # df_heatmap = pd.DataFrame({f'Index {i}': mapeItems[i] for i in range(len(mapeItems))}, index=[0])

  # # Plot the heatmap
  # sb.heatmap(df_heatmap, annot=True, cmap="YlGnBu", fmt=".2f", linewidths=.5)
  
  # Set labels and title
  # plt.xlabel("Index")
  # plt.ylabel(f"{city} - {strategy}")
  # plt.title(f"MAPE Heatmap - {city} - {strategy}")

  # Show the plot for each group
  # plt.show()
