## Import python libraries 

In [None]:
import numpy as np
import pandas as pd

## Import dataset

In [2]:
df = pd.read_csv('USA_Housing.csv')

In [3]:
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0,USS Barnett\nFPO AP 44820
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5,USNS Raymond\nFPO AE 09386


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


## Create Blank column "Label"

In [5]:
df["Label"] = ""

In [6]:
df['Price'] = df['Price'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   int32  
 6   Address                       5000 non-null   object 
 7   Label                         5000 non-null   object 
dtypes: float64(5), int32(1), object(2)
memory usage: 293.1+ KB


## Very low condition shows the region of houses based on the average income between 0-500000. Low condition shows the region of houses based on the average income between 1000000. Medium condition predicts the house prices bases on average income 1000000-1500000. High condition states the house prices where average income range is 1500000-2000000.  Very high condition determines the house prices where average income is 1500000-2000000. 

In [7]:
df['Label'] = ['Very Low' if 0<x<=500000 
               else 'Low' if 500000<x<=1000000 
               else 'Medium' if 1000000<x<=1500000 
               else 'High' if 1500000<x<=2000000 
               else 'Very High' for x in df['Price']]
df

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,Label
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1059033,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",Medium
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1505890,"188 Johnson Views Suite 079\nLake Kathleen, CA...",High
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1058987,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",Medium
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260616,USS Barnett\nFPO AP 44820,Medium
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943,USNS Raymond\nFPO AE 09386,Low
...,...,...,...,...,...,...,...,...
4995,60567.94414,7.830362,6.137356,3.46,22837.36103,1060193,USNS Williams\nFPO AP 30153-7653,Medium
4996,78491.27543,6.999135,6.576763,4.02,25616.11549,1482617,"PSC 9258, Box 8489\nAPO AA 42991-3352",Medium
4997,63390.68689,7.250591,4.805081,2.13,33266.14549,1030729,"4215 Tracy Garden Suite 076\nJoshualand, VA 01...",Medium
4998,68001.33124,5.534388,7.130144,5.44,42625.62016,1198656,USS Wallace\nFPO AE 73316,Medium


## Groupby function

In [8]:
df.groupby('Label').count()

Unnamed: 0_level_0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
High,1061,1061,1061,1061,1061,1061,1061
Low,1166,1166,1166,1166,1166,1166,1166
Medium,2602,2602,2602,2602,2602,2602,2602
Very High,74,74,74,74,74,74,74
Very Low,97,97,97,97,97,97,97


## Dataframe using Label and Avg. Area Income

In [9]:
df = df[['Label', 'Avg. Area Income']]

In [10]:
df

Unnamed: 0,Label,Avg. Area Income
0,Medium,79545.45857
1,High,79248.64245
2,Medium,61287.06718
3,Medium,63345.24005
4,Low,59982.19723
...,...,...
4995,Medium,60567.94414
4996,Medium,78491.27543
4997,Medium,63390.68689
4998,Medium,68001.33124


## Set index column of Avg. Area Income

In [11]:
df =  df.set_index('Avg. Area Income')
df

Unnamed: 0_level_0,Label
Avg. Area Income,Unnamed: 1_level_1
79545.45857,Medium
79248.64245,High
61287.06718,Medium
63345.24005,Medium
59982.19723,Low
...,...
60567.94414,Medium
78491.27543,Medium
63390.68689,Medium
68001.33124,Medium


## Train and Test split 

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
traindf, testdf = train_test_split(df, test_size=0.2)

In [14]:
from tensorflow.keras.models import Sequential

## LSTM model 

In [15]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [16]:
model = Sequential()
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(traindf, epochs=10, batch_size=1, verbose=2)

Epoch 1/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 2/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 3/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 4/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 5/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 6/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 7/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 8/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 9/10
4000/4000 - 1s - loss: 0.0000e+00
Epoch 10/10
4000/4000 - 1s - loss: 0.0000e+00


<keras.callbacks.History at 0x28791802b80>

## Predict result based on testdf

In [17]:
result = model.predict(testdf)

## Reshape of result value

In [18]:
result=result.reshape(result.shape[0])

## Create blank column named predicted_result 

In [19]:
testdf['predicted_result']= ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['predicted_result']= ''


## Show the array of predicted result value

In [20]:
testdf['predicted_result'] = np.array(result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['predicted_result'] = np.array(result)


In [21]:
testdf

Unnamed: 0_level_0,Label,predicted_result
Avg. Area Income,Unnamed: 1_level_1,Unnamed: 2_level_1
72007.48880,Medium,b'Medium'
74900.48595,High,b'High'
73120.65722,Low,b'Low'
69609.59370,Medium,b'Medium'
71060.40601,Medium,b'Medium'
...,...,...
46517.17527,Low,b'Low'
72175.28350,High,b'High'
50343.76352,Low,b'Low'
39777.60691,Low,b'Low'
