# End To End Exercise - Liam Jackson

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from datetime import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt 

import statsmodels.api as sm
from statsmodels.tsa.api import Holt

In [2]:
df = pd.read_csv('GlobalLandTemperaturesByCity.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'GlobalLandTemperaturesByCity.csv'

In [None]:
def prep_temp_data(df, city='Xingtai'):
    """Return the rows for one city with a Fahrenheit temperature column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``City`` column and an ``AverageTemperature`` column
        (presumably degrees Celsius, given the conversion applied below).
    city : str, default 'Xingtai'
        Value of ``City`` to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows for ``city``, with an extra
        ``avg_temp_f`` column equal to ``AverageTemperature * 9/5 + 32``.
        The input frame is left untouched.
    """
    # Copy the filtered rows: adding a column to a bare boolean-mask slice is
    # chained assignment and triggers pandas' SettingWithCopyWarning.
    city_df = df[df.City == city].copy()

    # converts temp to degrees Fahrenheit
    city_df['avg_temp_f'] = (city_df.AverageTemperature * 9 / 5) + 32

    return city_df

In [None]:
df = prep_temp_data(df)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.City.value_counts()

In [None]:
df.Country.value_counts()

In [None]:
df.Latitude.value_counts()

In [None]:
df.Longitude.value_counts()

In [None]:
df.AverageTemperature.hist()

In [None]:
df.AverageTemperatureUncertainty.hist()

# Check for null then use ffill or bfill. 

In [None]:
# Number of missing values per column. `.count()` on the boolean mask would
# just return the row count for every column, so sum the True flags instead.
df.isnull().sum()

In [None]:
def add_layers(df):
    """Index the frame by date and add calendar feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dt`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted and indexed by ``dt`` (the column is kept as
        well), with integer ``weekday`` (Monday=0), ``month`` and ``year``
        columns added, every 29 February row removed, and remaining gaps
        forward-filled. The input frame is not modified.
    """
    # assign() returns a new frame, so the caller's `dt` column keeps its
    # original dtype instead of being converted in place.
    df = df.assign(dt=pd.to_datetime(df['dt']))

    # Sort rows by the date and then set the index as that date
    df = df.set_index("dt", drop=False).sort_index()

    # `weekday` previously stored the day of the month (`.day`); use the
    # actual day of the week so the column matches its name.
    df['weekday'] = df.index.dayofweek
    df['month'] = df.index.month
    df['year'] = df.index.year

    # remove leap days (every 29 February, not only the one in 2016)
    is_leap_day = (df.index.month == 2) & (df.index.day == 29)
    df = df[~is_leap_day]

    # fillna(method='ffill') is deprecated; ffill() is the direct equivalent.
    return df.ffill()

In [None]:
df = add_layers(df)
df.head()

# Train, Validate, and Test

In [None]:
def train_val_test(df, train_frac=.5, validate_frac=.3):
    """Split a frame into consecutive train / validate / test slices.

    The split is positional (no shuffling): the first ``train_frac`` of the
    rows go to train, the next ``validate_frac`` to validate, and whatever
    remains to test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split, in the row order the split should follow.
    train_frac : float, default .5
        Fraction of rows assigned to the train slice.
    validate_frac : float, default .3
        Fraction of rows assigned to the validate slice.

    Returns
    -------
    tuple
        ``(df, train, validate, test, check1, check2)`` where ``check1``
        stacks the last train row on the first validate row and ``check2``
        stacks the last validate row on the first test row, so the seams
        can be inspected.

    Raises
    ------
    ValueError
        If either fraction is outside [0, 1] or the two sum to more than 1.
    """
    fractions_valid = (
        0 <= train_frac <= 1
        and 0 <= validate_frac <= 1
        and train_frac + validate_frac <= 1
    )
    if not fractions_valid:
        raise ValueError(
            'train_frac and validate_frac must each be in [0, 1] '
            'and sum to at most 1'
        )

    train_size = int(len(df) * train_frac)
    validate_end_index = train_size + int(len(df) * validate_frac)

    train = df[:train_size]
    validate = df[train_size:validate_end_index]
    test = df[validate_end_index:]

    check1 = pd.concat([train.tail(1), validate.head(1)])
    check2 = pd.concat([validate.tail(1), test.head(1)])

    # Sanity checks: no rows lost, and train starts where df starts.
    # equals() treats NaN as equal to NaN and does not raise when train is
    # empty, unlike an element-wise `==` between the two one-row frames.
    print(len(train) + len(validate) + len(test) == len(df))
    print(df.head(1).equals(train.head(1)))

    return df, train, validate, test, check1, check2

In [None]:
df, train, validate, test, check1, check2 = train_val_test(df)

In [None]:
check1

In [None]:
check2

In [None]:
train.dt.nunique()

In [None]:
df.dt.min()

In [None]:
df.dt.max()

In [None]:
df.dt.max() - df.dt.min()

In [None]:
# 193 years of data (assuming all years are filled)

In [None]:
# China industrial revolution 1988 (I want to look at 1988 - 2013)

In [None]:
def plot_temp_data(train):
    """Draw three temperature series from ``train`` as stacked line plots.

    One 20x30 figure is created and the ``avg_temp_f``,
    ``AverageTemperature`` and ``AverageTemperatureUncertainty`` columns are
    plotted, in that order, in the first three slots of a 10-row grid.
    """
    series_to_plot = (
        'avg_temp_f',
        'AverageTemperature',
        'AverageTemperatureUncertainty',
    )

    plt.figure(figsize=(20, 30))

    for slot, column in enumerate(series_to_plot, start=1):
        plt.subplot(10, 1, slot)
        train[column].plot()

In [None]:
plot_temp_data(train)

In [None]:
# Plot mean temperature by year (bar plot)
train.groupby('year').avg_temp_f.mean().plot.bar()

In [None]:
sns.boxplot(data = train, x = 'month', y = 'avg_temp_f')

In [None]:
sns.boxplot(data = train, x = 'year', y = 'avg_temp_f')

In [None]:
train.head()

In [None]:
# Make sure each split has a datetime `dt` column and is indexed/sorted by it.
# assign() builds a new frame, so nothing is written into the slices handed
# back by the split (attribute assignment on a slice is chained assignment
# and raises SettingWithCopyWarning).
train = train.assign(dt=pd.to_datetime(train.dt)).set_index("dt", drop=False).sort_index()
validate = validate.assign(dt=pd.to_datetime(validate.dt)).set_index("dt", drop=False).sort_index()
test = test.assign(dt=pd.to_datetime(test.dt)).set_index("dt", drop=False).sort_index()


In [None]:
# Overlay the month-end and year-end resampled means of avg_temp_f.
# NOTE(review): newer pandas releases deprecate the 'M' / 'Y' aliases in
# favour of 'ME' / 'YE' — confirm against the pinned pandas version.
train.avg_temp_f.resample('M').mean().plot(label = 'Monthly')
train.avg_temp_f.resample('Y').mean().plot(label = 'Yearly')
plt.legend()

# Evaluate/Model

In [None]:
def evaluate(target_var):
    """Return the RMSE of the current predictions, rounded to a whole number.

    Reads the notebook-level ``validate`` (actuals) and ``yhat_df``
    (predictions) frames and compares their ``target_var`` columns.
    """
    actual = validate[target_var]
    predicted = yhat_df[target_var]
    mse = mean_squared_error(actual, predicted)
    return round(sqrt(mse), 0)

In [None]:
def plot_and_eval(target_var):
    """Plot train, validate and predicted values, then print the RMSE.

    Uses the notebook-level ``train``, ``validate`` and ``yhat_df`` frames
    and the ``evaluate`` helper.
    """
    plt.figure(figsize=(12, 4))

    # Observed data first, then the prediction line on top.
    for frame, legend_label in ((train, 'Train'), (validate, 'Validate')):
        plt.plot(frame[target_var], label=legend_label, linewidth=1)
    plt.plot(yhat_df[target_var])
    plt.title(target_var)

    score = evaluate(target_var)
    print(target_var, '-- RMSE: {:.0f}'.format(score))
    plt.show()

In [None]:
# create an empty dataframe
eval_df = pd.DataFrame(columns=['model_type', 'target_var', 'rmse'])

# function to store the rmse so that we can compare
def append_eval_df(model_type, target_var):
    """Return ``eval_df`` with one extra row holding the current model's RMSE.

    Parameters
    ----------
    model_type : str
        Label stored in the ``model_type`` column.
    target_var : str
        Column scored by ``evaluate``; also stored in ``target_var``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the notebook-level ``eval_df`` is not modified, so the
        caller must rebind it to keep the row.
    """
    rmse = evaluate(target_var)
    row = pd.DataFrame({
        'model_type': [model_type],
        'target_var': [target_var],
        'rmse': [rmse],
    })
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([eval_df, row], ignore_index=True)

# Last observed value

In [None]:
def lov_model(train, validate):
    """Predict every validate row as the last observed training value.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with an ``avg_temp_f`` column; its final row supplies the
        prediction.
    validate : pandas.DataFrame
        Frame whose index the predictions are aligned to.

    Returns
    -------
    tuple
        ``(train, validate, avg_temp_f, yhat_df)`` where ``avg_temp_f`` is
        the last training value rounded to 2 decimals and ``yhat_df`` holds
        that constant for every index label of ``validate``.
    """
    # .iloc[-1] is explicitly positional. The previous `[-1:][0]` relied on
    # an integer key falling back to positional lookup on a non-integer
    # index, which pandas has deprecated.
    avg_temp_f = round(train['avg_temp_f'].iloc[-1], 2)

    # A scalar is broadcast across the supplied index.
    yhat_df = pd.DataFrame({'avg_temp_f': avg_temp_f}, index=validate.index)

    return train, validate, avg_temp_f, yhat_df

In [None]:
train, validate, avg_temp_f, yhat_df = lov_model(train, validate)

In [None]:
yhat_df.head()

In [None]:
yhat_df.describe()

In [None]:
plot_and_eval('avg_temp_f')

In [None]:
eval_df = append_eval_df(model_type = 'last_observed_value', 
                             target_var = 'avg_temp_f' )
eval_df

# Simple Average

In [None]:
def sa_model(train, validate):
    """Predict every validate row as the mean of the training target.

    Returns ``(train, validate, avg_temp_f, yhat_df)``: the two inputs
    unchanged, the mean of ``train['avg_temp_f']`` rounded to 2 decimals,
    and a frame repeating that mean for every index label of ``validate``.
    """
    # simple average of the training target
    training_mean = train['avg_temp_f'].mean()
    avg_temp_f = round(training_mean, 2)

    # constant prediction aligned to the validation index
    predictions = {'avg_temp_f': [avg_temp_f]}
    yhat_df = pd.DataFrame(predictions, index=validate.index)

    return train, validate, avg_temp_f, yhat_df

In [None]:
train, validate, avg_temp_f, yhat_df = sa_model(train, validate)

In [None]:
yhat_df.head()

In [None]:
yhat_df.describe()

In [None]:
plot_and_eval('avg_temp_f')

In [None]:
eval_df = append_eval_df(model_type='simple_average', 
                            target_var = 'avg_temp_f')
eval_df

# Moving Average

In [None]:
# Rolling means of the training target at several window sizes, drawn over
# the faded raw series for comparison.
plt.figure(figsize=(12, 4))
for window in (7, 30, 90, 120):
    plt.plot(train['avg_temp_f'].rolling(window).mean())
plt.plot(train['avg_temp_f'], alpha=.3)

In [None]:
periods = [30, 100, 365, 730]

# Rolling mean over the last `p` training rows, kept for every window size.
# The earlier loop overwrote a single variable, so only the final window was
# ever visible. .iloc[-1] replaces the deprecated positional `[-1]` lookup.
rolling_avgs = {
    p: round(train['avg_temp_f'].rolling(p).mean().iloc[-1], 2)
    for p in periods
}

for p, value in rolling_avgs.items():
    print(p, value)

# Later cells read `avg_temp_f`; as before, it carries the value for the last
# (largest) window in `periods`.
avg_temp_f = rolling_avgs[periods[-1]]

In [None]:
# make predictions
yhat_df = pd.DataFrame({'avg_temp_f': [avg_temp_f]}, 
                           index = validate.index)

In [None]:
yhat_df
yhat_df.head()

In [None]:
plot_and_eval('avg_temp_f')

In [None]:
eval_df = append_eval_df(model_type = '30-730d_moving_avg', 
                            target_var = 'avg_temp_f' )
eval_df

In [None]:
eval_df.rmse.min()

In [None]:
# Best models are the simple average and the 30-730d moving average, each with an RMSE of 19.

In [None]:
sm.tsa.seasonal_decompose(train.avg_temp_f.resample('M').mean()).plot()
plt.show()

In [None]:
def sa_model_test(test, validate):
    """Build constant predictions from the mean of ``test['avg_temp_f']``.

    Returns ``(test, validate, avg_temp_f, yhat_df)``: the two inputs
    unchanged, the mean of the test target rounded to 2 decimals, and a
    frame repeating that mean for every index label of ``validate``.

    NOTE(review): the average comes from ``test`` while the predictions are
    indexed on ``validate`` — confirm this is the intended final evaluation.
    """
    # simple average of the test target
    test_mean = test['avg_temp_f'].mean()
    avg_temp_f = round(test_mean, 2)

    # constant prediction aligned to the validation index
    predictions = {'avg_temp_f': [avg_temp_f]}
    yhat_df = pd.DataFrame(predictions, index=validate.index)

    return test, validate, avg_temp_f, yhat_df

In [None]:
test, validate, avg_temp_f, yhat_df = sa_model_test(test, validate)

In [None]:
yhat_df.head()

In [None]:
yhat_df.describe()

In [None]:
plot_and_eval('avg_temp_f')

In [None]:
eval_df = append_eval_df(model_type='simple_average_test', 
                            target_var = 'avg_temp_f')
eval_df