In [1]:
# ignore warning
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# visualize 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# working with dates
from datetime import datetime

# to evaluated performance using rmse
from sklearn.metrics import mean_squared_error
from math import sqrt 

# for tsa 
import statsmodels.api as sm

# holt's linear trend model. 
from statsmodels.tsa.api import Holt

In [2]:
df = pd.read_csv('cpi.csv')

In [3]:
df.head()

Unnamed: 0,year,period,label,all_items_value,all_items_ monthly_change,apparel_value,apparel_monthly_change,energy_value,energy_monthly_change,food_value,food_monthly_change,gas_value,gas_monthly_change,medical_value,medical_monthly_change,transportation_value,transportation_monthly_change
0,1957,M01,1957 Jan,27.67,0.1,44.3,0.0,21.3,0.0,28.4,-0.4,,,16.7,0.6,23.5,0.9
1,1957,M02,1957 Feb,27.8,0.5,44.3,0.0,21.4,0.5,28.7,1.1,,,16.7,0.0,23.7,0.9
2,1957,M03,1957 Mar,27.86,0.2,44.5,0.5,21.5,0.5,28.6,-0.3,,,16.8,0.6,23.7,0.0
3,1957,M04,1957 Apr,27.93,0.3,44.4,-0.2,21.6,0.5,28.6,0.0,,,16.9,0.6,23.8,0.4
4,1957,M05,1957 May,28.0,0.3,44.5,0.2,21.6,0.0,28.7,0.3,,,16.9,0.0,23.9,0.4


In [4]:
# Check null values
df.isnull().sum()

year                               0
period                             0
label                              0
all_items_value                    0
all_items_ monthly_change          0
apparel_value                      0
apparel_monthly_change             0
energy_value                       0
energy_monthly_change              0
food_value                         0
food_monthly_change                0
gas_value                        120
gas_monthly_change               120
medical_value                      0
medical_monthly_change             0
transportation_value               0
transportation_monthly_change      0
dtype: int64

In [5]:
# Convert label date to datetime
df['label'] = pd.to_datetime(df['label'], infer_datetime_format=True)

In [6]:
# Convert year to datetime
df['year'] =  pd.to_datetime(df['year']).dt.to_period('Y')

In [7]:
# Rename period to month
df = df.rename(columns={'period': 'month'})

In [8]:
# Convert period to datetime
df['month'] = df['month'].str.replace('M', '')

In [9]:
df.dtypes

year                              period[A-DEC]
month                                    object
label                            datetime64[ns]
all_items_value                         float64
all_items_ monthly_change               float64
apparel_value                           float64
apparel_monthly_change                  float64
energy_value                            float64
energy_monthly_change                   float64
food_value                              float64
food_monthly_change                     float64
gas_value                               float64
gas_monthly_change                      float64
medical_value                           float64
medical_monthly_change                  float64
transportation_value                    float64
transportation_monthly_change           float64
dtype: object

In [10]:
df.head()

Unnamed: 0,year,month,label,all_items_value,all_items_ monthly_change,apparel_value,apparel_monthly_change,energy_value,energy_monthly_change,food_value,food_monthly_change,gas_value,gas_monthly_change,medical_value,medical_monthly_change,transportation_value,transportation_monthly_change
0,1970,1,1957-01-01,27.67,0.1,44.3,0.0,21.3,0.0,28.4,-0.4,,,16.7,0.6,23.5,0.9
1,1970,2,1957-02-01,27.8,0.5,44.3,0.0,21.4,0.5,28.7,1.1,,,16.7,0.0,23.7,0.9
2,1970,3,1957-03-01,27.86,0.2,44.5,0.5,21.5,0.5,28.6,-0.3,,,16.8,0.6,23.7,0.0
3,1970,4,1957-04-01,27.93,0.3,44.4,-0.2,21.6,0.5,28.6,0.0,,,16.9,0.6,23.8,0.4
4,1970,5,1957-05-01,28.0,0.3,44.5,0.2,21.6,0.0,28.7,0.3,,,16.9,0.0,23.9,0.4


In [None]:
def get_data():
    '''
    This function returns the cleaned dataset
    '''
    df = pd.read_csv('cpi.csv')
    # Convert label date to datetime
    df['label'] = pd.to_datetime(df['label'], infer_datetime_format=True)
    # Convert year to datetime
    df['year'] =  pd.to_datetime(df['year']).dt.to_period('Y')
    # Rename period to month
    df = df.rename(columns={'period': 'month'})
    # Convert period to datetime
    df['month'] = df['month'].str.replace('M', '')
    
    return df