In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the demand-side and supply-side data sets.
# The directory defaults to the location used originally, but can be overridden
# with the HOUSING_DATA_DIR environment variable so the notebook runs on other
# machines without editing this cell.
data_dir = os.environ.get('HOUSING_DATA_DIR', '/Users/manish/Downloads')

dataset1 = pd.read_csv(os.path.join(data_dir, 'us_demand_data.csv'))
dataset2 = pd.read_csv(os.path.join(data_dir, 'us_supply_data.csv'))

# Parse the demand "DATE" column into datetime values.
dataset1['DATE'] = pd.to_datetime(dataset1['DATE'])

# The supply file keeps day-first dates in "Period". Parse them once (no need
# to format back to strings and re-parse) and rename the column to "DATE" so
# both frames expose the same key name.
dataset2['Period'] = pd.to_datetime(dataset2['Period'], format='%d-%m-%Y')
dataset2 = dataset2.rename(columns={'Period': 'DATE'})

# Merge demand and supply into dataset3. With no `on=` argument pandas performs
# an inner join on every column name the two frames have in common.
# NOTE(review): if "DATE" is the only intended key, pass on='DATE' explicitly
# after confirming no other shared column (e.g. an index column) is relied on.
dataset3 = pd.merge(dataset1, dataset2)

# Persist the merged frame next to the notebook.
dataset3.to_csv('dataset3.csv', index=False)

In [None]:
# Break the DATE column into numeric year / month / day columns so the
# calendar information is available to the models as plain numbers.
for date_part in ('year', 'month', 'day'):
    dataset3[date_part] = getattr(dataset3['DATE'].dt, date_part)

In [None]:
# Remove the raw DATE column (already decomposed into year/month/day) and the
# 'Unnamed: 0' column in a single call.
dataset3 = dataset3.drop(columns=['DATE', 'Unnamed: 0'])

In [None]:
# Print the column names, non-null counts and dtypes of the merged frame.
dataset3.info()

In [None]:
# Display the first rows of the merged frame as a quick sanity check.
dataset3.head()

In [None]:
# Count the missing values in each column.
dataset3.isna().sum()

In [None]:
# Collect the object-dtype columns and report how many there are.
categorical_features = []
for column in dataset3.columns:
    if dataset3[column].dtypes == 'O':
        categorical_features.append(column)
print('number of categorical variables:', len(categorical_features))

In [None]:
# Numerical continuous features, part 1: columns stored with the float dtype.
numerical_continuous_feature = list(
    filter(lambda column: dataset3[column].dtype == 'float', dataset3.columns)
)
dataset3[numerical_continuous_feature].head()

In [None]:

# Plot the distribution (histogram + KDE curve) of every float feature.
# `sns.distplot` is deprecated and removed in recent seaborn releases;
# `histplot` with a density-normalised histogram and `cut=3` for the KDE is the
# documented replacement that reproduces distplot's default appearance.
for feature in numerical_continuous_feature:
    plt.figure(figsize=(7, 7))
    sns.histplot(dataset3[feature], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# One scatter plot per float feature: feature on the x axis, HPI on the y axis.
for column in numerical_continuous_feature:
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(x=column, y='HPI', data=dataset3)
    ax.set_xlabel(column)
    ax.set_ylabel("HPI")
    ax.set_title(column, fontsize=30)

In [None]:
# Before moving on to heatmaps and pairplots, log-transform the float features
# and then look at their relation to HPI once more.
#
# HPI itself is skipped here: the next cell applies np.log to the HPI column,
# so transforming it in this loop as well (when HPI is a float column) would
# leave the target as log(log(HPI)).
# NOTE: this cell overwrites the columns in place, so re-running it applies the
# logarithm again; restart and run all cells to get a clean state.
for feature in numerical_continuous_feature:
    if feature == 'HPI':
        continue
    dataset3[feature] = np.log(dataset3[feature])

In [None]:
# Replace the HPI column with its natural logarithm and show its histogram.
log_hpi = np.log(dataset3['HPI'])
dataset3['HPI'] = log_hpi
log_hpi.hist()

In [None]:
# Repeat the feature-vs-HPI scatter plots now that the columns have been
# log-transformed.
for column in numerical_continuous_feature:
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(x=column, y='HPI', data=dataset3)
    ax.set_xlabel(column)
    ax.set_ylabel("HPI")
    ax.set_title(column, fontsize=30)

In [None]:
# Draw an annotated heatmap of the pairwise correlations to see which columns
# are correlated with HPI.
plt.figure(figsize=(15, 15))
correlation_matrix = dataset3.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

In [None]:
# => GDPC1, construction and year have a very strong correlation with HPI

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for
# every column of the frame.
sns.pairplot(dataset3)

In [None]:
# Look for outliers: draw one boxplot per float feature, skipping any column
# that contains an exact 0 value.
for column in numerical_continuous_feature:
    data = dataset3.copy()
    if 0 in data[column].unique():
        continue
    data.boxplot(column=column)
    plt.ylabel(column)
    plt.title(column)
    plt.show()

In [None]:
# Numerical continuous features, part 2: columns stored with an integer dtype.
#
# `dtype == 'int'` only matches the platform's default integer width, so it can
# miss integer columns of another width (for example the int32 values that
# `.dt.year/.month/.day` return on recent pandas versions).
# `is_integer_dtype` accepts every integer width.
numerical_continuous_feature = [
    feature for feature in dataset3.columns
    if pd.api.types.is_integer_dtype(dataset3[feature])
]
dataset3[numerical_continuous_feature].head()

In [None]:
# Plot the distribution (histogram + KDE curve) of every integer feature.
# `sns.distplot` is deprecated and removed in recent seaborn releases;
# `histplot` with a density-normalised histogram and `cut=3` for the KDE is the
# documented replacement that reproduces distplot's default appearance.
for feature in numerical_continuous_feature:
    plt.figure(figsize=(7, 7))
    sns.histplot(dataset3[feature], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# One scatter plot per integer feature: feature on the x axis, HPI on the y axis.
for column in numerical_continuous_feature:
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(x=column, y='HPI', data=dataset3)
    ax.set_xlabel(column)
    ax.set_ylabel("HPI")
    ax.set_title(column, fontsize=30)

In [None]:

# Before moving on, log-transform the integer features as well so their
# relation to HPI can be inspected again on the transformed scale.
# Each selected column is overwritten with its natural logarithm.
for column in numerical_continuous_feature:
    dataset3[column] = dataset3[column].pipe(np.log)

In [None]:
# Repeat the feature-vs-HPI scatter plots for the log-transformed columns.
for column in numerical_continuous_feature:
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(x=column, y='HPI', data=dataset3)
    ax.set_xlabel(column)
    ax.set_ylabel("HPI")
    ax.set_title(column, fontsize=30)

In [None]:
# Look for outliers: draw one boxplot per selected feature, skipping any column
# that contains an exact 0 value.
for column in numerical_continuous_feature:
    data = dataset3.copy()
    if 0 in data[column].unique():
        continue
    data.boxplot(column=column)
    plt.ylabel(column)
    plt.title(column)
    plt.show()

# # **Training**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Target: the HPI column. Features: every other column.
y = dataset3['HPI']
X = dataset3.drop(columns=['HPI'])

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Show the (rows, columns) shape of the training and test feature frames.
for feature_frame in (X_train, X_test):
    print(feature_frame.shape)

In [None]:
# Create a MinMaxScaler; it is fitted on the training features in the next cell.
from sklearn.preprocessing import MinMaxScaler
minmax=MinMaxScaler()

In [None]:
# Fit the scaler on the training features only, then apply the same scaling to
# the test features. The scaler returns plain arrays, so the frames are rebuilt
# with the original column names AND the original row index; without the index
# the rows would be renumbered 0..n-1 and no longer align with y_train / y_test.
# NOTE(review): the model cells visible in this notebook fit on X_train /
# X_test, not on these scaled frames.
scaled_X_train = pd.DataFrame(
    minmax.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
scaled_X_test = pd.DataFrame(
    minmax.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [None]:
# Linear regression: 5-fold cross-validated mean absolute error on the
# training split, with a default SimpleImputer ahead of the model.
linear_steps = [
    ('preprocessor', SimpleImputer()),
    ('model', LinearRegression()),
]
my_pipeline = Pipeline(steps=linear_steps)

# scikit-learn reports the negated MAE, so flip the sign back.
negated_mae = cross_val_score(
    my_pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error'
)
scores = -1 * negated_mae

print("MAE scores:\n", scores)

In [None]:

# Fit a linear regression on the training split and report R^2 on the test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)


In [None]:
# Random forest: 5-fold cross-validated mean absolute error on the training
# split, with a default SimpleImputer ahead of the model.
# The forest is seeded (same value as the train/test split) so the scores are
# reproducible from one run to the next.
my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                              ('model', RandomForestRegressor(random_state=1))
                             ])


# scikit-learn reports the negated MAE, so flip the sign back.
scores = -1 * cross_val_score(my_pipeline, X_train, y_train,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)

In [None]:
# Fit a random forest on the training split and report R^2 on the test split.
# The forest is seeded (same value as the train/test split) so the reported
# score is reproducible from one run to the next.
model = RandomForestRegressor(random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
# Gradient boosting: 5-fold cross-validated mean absolute error on the training
# split, with a default SimpleImputer ahead of the model.
# The estimator is seeded (same value as the train/test split) so the scores
# are reproducible from one run to the next.
my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                              ('model', GradientBoostingRegressor(random_state=1))
                             ])


# scikit-learn reports the negated MAE, so flip the sign back.
scores = -1 * cross_val_score(my_pipeline, X_train, y_train,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)

In [None]:
# Fit a gradient boosting regressor on the training split and report R^2 on the
# test split. The estimator is seeded (same value as the train/test split) so
# the reported score is reproducible from one run to the next.
model = GradientBoostingRegressor(random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)
