## Importing Libraries

In [1]:
#Importing the data libraries
import numpy as np
import pandas as pd
import itertools
import warnings
warnings.filterwarnings('ignore')

#importing visualisation libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Importing modeling libraries
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from matplotlib.pylab import rcParams
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

warnings.filterwarnings('ignore')

## Loading Data

In [4]:
import pandas as pd

class DataLoader:
    """Load a CSV file into a pandas DataFrame, reporting failures on stdout.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. Stored unchanged and handed to
        ``pd.read_csv`` when ``load_data`` is called.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def load_data(self, **read_csv_kwargs):
        """Read ``self.file_path`` with ``pd.read_csv``.

        Parameters
        ----------
        **read_csv_kwargs
            Optional keyword arguments forwarded verbatim to ``pd.read_csv``
            (for example ``dtype={'RegionName': str}`` to keep an identifier
            column as text instead of letting it be parsed as an integer).
            With no arguments the call is identical to ``pd.read_csv(path)``.

        Returns
        -------
        pandas.DataFrame or None
            The parsed frame on success. ``None`` when reading fails; in that
            case a one-line message describing the failure is printed and no
            exception propagates, so callers must check for ``None``.
        """
        try:
            return pd.read_csv(self.file_path, **read_csv_kwargs)
        except FileNotFoundError:
            print(f"File not found: {self.file_path}")
        except pd.errors.EmptyDataError:
            print("No data: File is empty")
        except pd.errors.ParserError:
            print("Error parsing the file")
        except Exception as e:
            # Deliberate best-effort catch-all: any other failure is reported
            # rather than raised, matching the specific handlers above.
            print(f"An unexpected error occurred: {e}")
        # Every failure path falls through to here; state the result explicitly
        # instead of relying on the implicit None of a function without return.
        return None

# Load the Zillow CSV through DataLoader. The path is relative, so the file is
# expected next to the notebook's working directory.
file_path = 'zillow_data (2).csv'
data_loader = DataLoader(file_path)
# load_data() prints a message and yields None when the file cannot be read.
data = data_loader.load_data()

# Preview the first rows only when loading succeeded.
if data is not None:
    print(data.head())


   RegionID  RegionName      City State              Metro CountyName  \
0     84654       60657   Chicago    IL            Chicago       Cook   
1     90668       75070  McKinney    TX  Dallas-Fort Worth     Collin   
2     91982       77494      Katy    TX            Houston     Harris   
3     84616       60614   Chicago    IL            Chicago       Cook   
4     93144       79936   El Paso    TX            El Paso    El Paso   

   SizeRank   1996-04   1996-05   1996-06  ...  2017-07  2017-08  2017-09  \
0         1  334200.0  335400.0  336500.0  ...  1005500  1007500  1007800   
1         2  235700.0  236900.0  236700.0  ...   308000   310000   312500   
2         3  210400.0  212200.0  212200.0  ...   321000   320600   320200   
3         4  498100.0  500900.0  503100.0  ...  1289800  1287700  1287400   
4         5   77300.0   77300.0   77300.0  ...   119100   119400   120000   

   2017-10  2017-11  2017-12  2018-01  2018-02  2018-03  2018-04  
0  1009600  1013300  1018700  1

### Data Preparation

In [11]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(14723, 272)

In [12]:
# Column labels of the frame.
data.columns

Index(['RegionID', 'Zipcode', 'City', 'State', 'Metro', 'CountyName',
       'SizeRank', '1996-04', '1996-05', '1996-06',
       ...
       '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12',
       '2018-01', '2018-02', '2018-03', '2018-04'],
      dtype='object', length=272)

In [13]:
# For every column, report how many distinct values it holds and show them.
# Series.unique() keeps NaN as a value, so a column with missing entries
# counts NaN once in the total.
for column_label, column_values in data.items():
    distinct = column_values.unique()
    print(f'{column_label} (unique count: {len(distinct)}): {distinct}')

RegionID (unique count: 14723): [84654 90668 91982 ... 75672 93733 95851]
Zipcode (unique count: 14723): [60657 75070 77494 ... 40404 81225 89155]
City (unique count: 7554): ['Chicago' 'McKinney' 'Katy' ... 'Pine Valley' 'Esopus'
 'Mount Crested Butte']
State (unique count: 51): ['IL' 'TX' 'NY' 'CA' 'FL' 'TN' 'NC' 'GA' 'DC' 'MO' 'OK' 'AZ' 'NJ' 'MD'
 'VA' 'WA' 'OH' 'MI' 'MA' 'KS' 'NM' 'CT' 'NV' 'PA' 'CO' 'OR' 'IN' 'SC'
 'KY' 'AR' 'ND' 'MN' 'AL' 'DE' 'LA' 'MS' 'ID' 'MT' 'HI' 'WI' 'UT' 'ME'
 'SD' 'WV' 'IA' 'RI' 'NE' 'WY' 'AK' 'NH' 'VT']
Metro (unique count: 702): ['Chicago' 'Dallas-Fort Worth' 'Houston' 'El Paso' 'New York'
 'San Francisco' 'The Villages' 'Nashville'
 'Los Angeles-Long Beach-Anaheim' 'Austin' 'Charlotte' 'McAllen' 'Atlanta'
 'Washington' 'San Antonio' 'Clarksville' 'St. Louis' 'Oklahoma City'
 'Phoenix' 'Baltimore' 'Miami-Fort Lauderdale' 'Brownsville'
 'Virginia Beach' 'Seattle' 'Cleveland' 'Ann Arbor' 'Boston' 'Kansas City'
 'Sacramento' 'Tucson' 'Jacksonville' 'Napa' '

1999-01 (unique count: 2955): [432600. 193400. 277000. ... 404800. 764300. 292700.]
1999-02 (unique count: 2970): [438600. 193100. 283600. ... 411100. 773600. 297100.]
1999-03 (unique count: 2994): [444200. 192700. 288500. ... 337500. 417300. 781000.]
1999-04 (unique count: 3011): [450000. 193000. 293900. ... 340300. 423500. 787100.]
1999-05 (unique count: 3054): [455900. 193700. 299200. ... 429700. 791600. 308700.]
1999-06 (unique count: 3048): [462100. 194800. 304300. ... 241100. 346300. 184300.]
1999-07 (unique count: 3068): [468500. 196100. 308600. ... 796700. 315900. 184700.]
1999-08 (unique count: 3063): [475300. 197800. 311400. ... 449400. 798700. 319200.]
1999-09 (unique count: 3083): [482500. 199700. 312300. ... 165900. 456400. 801100.]
1999-10 (unique count: 3101): [490200. 201900. 311900. ... 463700. 804600. 325500.]
1999-11 (unique count: 3162): [498200. 204500. 311100. ... 471200. 808200. 328700.]
1999-12 (unique count: 3154): [507200. 207800. 311700. ... 478500. 812200. 3

2010-09 (unique count: 4340): [ 754900.  202400.  250000. ...  651000. 1354000.  555300.]
2010-10 (unique count: 4344): [ 746200.  202400.  249900. ...  668200. 1332900.  553700.]
2010-11 (unique count: 4329): [ 737300.  202500.  249700. ...  670500. 1320000.  547400.]
2010-12 (unique count: 4308): [ 730800.  202500.  247900. ...  667600. 1314700.  535500.]
2011-01 (unique count: 4296): [ 729300.  202400.  247400. ... 1294000.  302400.  528900.]
2011-02 (unique count: 4293): [ 730200.  202500.  248800. ...  674600. 1267000.  526400.]
2011-03 (unique count: 4274): [ 730700.  202100.  249700. ...  674100. 1239900.  521100.]
2011-04 (unique count: 4268): [ 730000.  201300.  249100. ...  659400. 1225000.  516100.]
2011-05 (unique count: 4240): [ 730100.  200700.  249200. ...  631200. 1225400.  516400.]
2011-06 (unique count: 4256): [ 730100.  200500.  249500. ...  614000. 1236100.  517800.]
2011-07 (unique count: 4249): [ 731200.  200000.  249400. ...  178900. 1231000.  516800.]
2011-08 (u

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,RegionID,Zipcode,SizeRank,1996-04,1996-05,1996-06,1996-07,1996-08,1996-09,1996-10,...,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04
count,14723.0,14723.0,14723.0,13684.0,13684.0,13684.0,13684.0,13684.0,13684.0,13684.0,...,14723.0,14723.0,14723.0,14723.0,14723.0,14723.0,14723.0,14723.0,14723.0,14723.0
mean,81075.010052,48222.348706,7362.0,118299.1,118419.0,118537.4,118653.1,118780.3,118927.5,119120.5,...,273335.4,274865.8,276464.6,278033.2,279520.9,281095.3,282657.1,284368.7,286511.4,288039.9
std,31934.118525,29359.325439,4250.308342,86002.51,86155.67,86309.23,86467.95,86650.94,86872.08,87151.85,...,360398.4,361467.8,362756.3,364461.0,365600.3,367045.4,369572.7,371773.9,372461.2,372054.4
min,58196.0,1001.0,1.0,11300.0,11500.0,11600.0,11800.0,11800.0,12000.0,12100.0,...,14400.0,14500.0,14700.0,14800.0,14500.0,14300.0,14100.0,13900.0,13800.0,13800.0
25%,67174.5,22101.5,3681.5,68800.0,68900.0,69100.0,69200.0,69375.0,69500.0,69600.0,...,126900.0,127500.0,128200.0,128700.0,129250.0,129900.0,130600.0,131050.0,131950.0,132400.0
50%,78007.0,46106.0,7362.0,99500.0,99500.0,99700.0,99700.0,99800.0,99900.0,99950.0,...,188400.0,189600.0,190500.0,191400.0,192500.0,193400.0,194100.0,195000.0,196700.0,198100.0
75%,90920.5,75205.5,11042.5,143200.0,143300.0,143225.0,143225.0,143500.0,143700.0,143900.0,...,305000.0,306650.0,308500.0,309800.0,311700.0,313400.0,315100.0,316850.0,318850.0,321100.0
max,753844.0,99901.0,14723.0,3676700.0,3704200.0,3729600.0,3754600.0,3781800.0,3813500.0,3849600.0,...,18889900.0,18703500.0,18605300.0,18569400.0,18428800.0,18307100.0,18365900.0,18530400.0,18337700.0,17894900.0


In [15]:
# Overview of the frame: index range, column count, dtype breakdown and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14723 entries, 0 to 14722
Columns: 272 entries, RegionID to 2018-04
dtypes: float64(219), int64(49), object(4)
memory usage: 30.6+ MB


In [8]:
# Rename 'RegionName' to 'Zipcode', since that column holds the ZIP code.
# Labels absent from the frame are ignored by rename(), so re-running this
# cell is harmless.
# NOTE(review): cells shown earlier in the notebook already list 'Zipcode'
# while the load cell's preview shows 'RegionName', so this cell appears to
# have been executed before them. Confirm the cell order with
# Restart Kernel -> Run All.
data = data.rename({'RegionName': 'Zipcode'}, axis='columns')

In [10]:
# Re-check the frame overview after the column rename.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14723 entries, 0 to 14722
Columns: 272 entries, RegionID to 2018-04
dtypes: float64(219), int64(49), object(4)
memory usage: 30.6+ MB


In [16]:
# Last five rows of the frame (tail() defaults to n=5).
data.tail()

Unnamed: 0,RegionID,Zipcode,City,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,...,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04
14718,58333,1338,Ashfield,MA,Greenfield Town,Franklin,14719,94600.0,94300.0,94000.0,...,216800,217700,218600,218500,218100,216400,213100,209800,209200,209300
14719,59107,3293,Woodstock,NH,Claremont,Grafton,14720,92700.0,92500.0,92400.0,...,202100,208400,212200,215200,214300,213100,213700,218300,222700,225800
14720,75672,40404,Berea,KY,Richmond,Madison,14721,57100.0,57300.0,57500.0,...,121800,122800,124600,126700,128800,130600,131700,132500,133000,133400
14721,93733,81225,Mount Crested Butte,CO,,Gunnison,14722,191100.0,192400.0,193700.0,...,662800,671200,682400,695600,695500,694700,706400,705300,681500,664400
14722,95851,89155,Mesquite,NV,Las Vegas,Clark,14723,176400.0,176300.0,176100.0,...,333800,336400,339700,343800,346800,348900,350400,353000,356000,357200
