# Importing Pandas

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


# Loading & Reading Dataset

In [16]:
df = pd.read_csv("data.csv")
print(df)

                date         price  bedrooms  bathrooms  sqft_living  \
0      5/2/2014 0:00  3.130000e+05       3.0       1.50       1340.0   
1      5/2/2014 0:00  2.384000e+06       5.0       2.50       3650.0   
2      5/2/2014 0:00  3.420000e+05       3.0       2.00       1930.0   
3      5/2/2014 0:00  4.200000e+05       3.0       2.25       2000.0   
4      5/2/2014 0:00  5.500000e+05       4.0       2.50       1940.0   
...              ...           ...       ...        ...          ...   
4595   7/9/2014 0:00  3.081667e+05       3.0       1.75       1510.0   
4596   7/9/2014 0:00  5.343333e+05       3.0        NaN       1460.0   
4597   7/9/2014 0:00  4.169042e+05       3.0        NaN       3010.0   
4598  7/10/2014 0:00  2.034000e+05       4.0       2.00       2090.0   
4599  7/10/2014 0:00  2.206000e+05       3.0       2.50       1490.0   

      sqft_lot  floors  waterfront  view  condition  sqft_above  \
0       7912.0     1.5         0.0   0.0          3      1340.0   
1

# Data Exploration

In [17]:
df.isnull().sum()

date             0
price            3
bedrooms         5
bathrooms        2
sqft_living      2
sqft_lot         2
floors           2
waterfront       5
view             2
condition        0
sqft_above       3
sqft_basement    3
yr_built         6
yr_renovated     0
street           3
city             4
statezip         3
country          6
dtype: int64

In [18]:
df.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
count,4597.0,4595.0,4598.0,4598.0,4598.0,4598.0,4595.0,4598.0,4600.0,4597.0,4597.0,4594.0,4600.0
mean,552119.0,3.401088,2.160668,2139.498913,14856.07,1.511853,0.007182,0.240757,3.451739,1827.509463,312.122036,1970.799086,808.608261
std,563983.4,0.908983,0.78392,963.387024,35891.77,0.537905,0.084449,0.778558,0.67723,862.13854,464.228562,29.740666,979.414536
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0,1900.0,0.0
25%,323833.3,3.0,1.75,1460.0,5001.0,1.0,0.0,0.0,3.0,1190.0,0.0,1951.0,0.0
50%,461000.0,3.0,2.25,1980.0,7683.0,1.5,0.0,0.0,3.0,1590.0,0.0,1976.0,0.0
75%,655000.0,4.0,2.5,2620.0,11003.75,2.0,0.0,0.0,4.0,2300.0,610.0,1997.0,1999.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,9410.0,4820.0,2014.0,2014.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4597 non-null   float64
 2   bedrooms       4595 non-null   float64
 3   bathrooms      4598 non-null   float64
 4   sqft_living    4598 non-null   float64
 5   sqft_lot       4598 non-null   float64
 6   floors         4598 non-null   float64
 7   waterfront     4595 non-null   float64
 8   view           4598 non-null   float64
 9   condition      4600 non-null   int64  
 10  sqft_above     4597 non-null   float64
 11  sqft_basement  4597 non-null   float64
 12  yr_built       4594 non-null   float64
 13  yr_renovated   4600 non-null   int64  
 14  street         4597 non-null   object 
 15  city           4596 non-null   object 
 16  statezip       4597 non-null   object 
 17  country        4594 non-null   object 
dtypes: float

In [20]:

df.count()

date             4600
price            4597
bedrooms         4595
bathrooms        4598
sqft_living      4598
sqft_lot         4598
floors           4598
waterfront       4595
view             4598
condition        4600
sqft_above       4597
sqft_basement    4597
yr_built         4594
yr_renovated     4600
street           4597
city             4596
statezip         4597
country          4594
dtype: int64

In [21]:
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,5/2/2014 0:00,313000.0,3.0,1.5,1340.0,7912.0,1.5,0.0,0.0,3,1340.0,0.0,1955.0,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,5/2/2014 0:00,2384000.0,5.0,2.5,3650.0,9050.0,2.0,0.0,4.0,5,3370.0,280.0,1921.0,0,709 W Blaine St,Seattle,WA 98119,USA
2,5/2/2014 0:00,342000.0,3.0,2.0,1930.0,11947.0,1.0,0.0,0.0,4,1930.0,0.0,1966.0,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,5/2/2014 0:00,420000.0,3.0,2.25,2000.0,8030.0,1.0,0.0,0.0,4,1000.0,1000.0,1963.0,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,5/2/2014 0:00,550000.0,4.0,2.5,1940.0,10500.0,1.0,0.0,0.0,4,1140.0,800.0,1976.0,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [22]:
df.tail()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
4595,7/9/2014 0:00,308166.6667,3.0,1.75,1510.0,6360.0,1.0,0.0,0.0,4,1510.0,0.0,1954.0,1979,501 N 143rd St,Seattle,WA 98133,USA
4596,7/9/2014 0:00,534333.3333,3.0,,1460.0,7573.0,2.0,0.0,0.0,3,1460.0,0.0,1983.0,2009,14855 SE 10th Pl,Bellevue,WA 98007,USA
4597,7/9/2014 0:00,416904.1667,3.0,,3010.0,7014.0,2.0,0.0,0.0,3,3010.0,0.0,2009.0,0,759 Ilwaco Pl NE,Renton,WA 98059,USA
4598,7/10/2014 0:00,203400.0,4.0,2.0,2090.0,6630.0,1.0,0.0,0.0,3,1070.0,1020.0,1974.0,0,5148 S Creston St,Seattle,WA 98178,USA
4599,7/10/2014 0:00,220600.0,3.0,2.5,1490.0,8102.0,2.0,0.0,0.0,4,1490.0,0.0,1990.0,0,18717 SE 258th St,Covington,WA 98042,USA


In [23]:
df.shape

(4600, 18)

In [24]:
df.dtypes

date              object
price            float64
bedrooms         float64
bathrooms        float64
sqft_living      float64
sqft_lot         float64
floors           float64
waterfront       float64
view             float64
condition          int64
sqft_above       float64
sqft_basement    float64
yr_built         float64
yr_renovated       int64
street            object
city              object
statezip          object
country           object
dtype: object

In [25]:
df['city'].unique()

array(['Shoreline', 'Seattle', 'Kent', 'Bellevue', 'Redmond',
       'Maple Valley', 'North Bend', 'Lake Forest Park', 'Sammamish',
       'Auburn', 'Des Moines', 'Bothell', 'Federal Way', 'Kirkland',
       'Issaquah', 'Woodinville', 'Normandy Park', 'Fall City', 'Renton',
       'Carnation', 'Snoqualmie', 'Duvall', 'Burien', 'Covington',
       'Inglewood-Finn Hill', 'Kenmore', 'Newcastle', 'Mercer Island',
       'Black Diamond', 'Ravensdale', 'Clyde Hill', 'Algona', 'Skykomish',
       'Tukwila', 'Vashon', 'Yarrow Point', 'SeaTac', 'Medina',
       'Enumclaw', 'Snoqualmie Pass', 'Pacific', 'Beaux Arts Village',
       'Preston', 'Milton', nan], dtype=object)

In [26]:
df.nunique()

date               70
price            1738
bedrooms           10
bathrooms          26
sqft_living       566
sqft_lot         3112
floors              6
waterfront          2
view                5
condition           5
sqft_above        511
sqft_basement     207
yr_built          115
yr_renovated       60
street           4522
city               44
statezip           77
country             1
dtype: int64

# Data Pre-Processing


In [27]:
df_processed = df.drop(['street', 'city', 'statezip', 'country'], axis=1)


In [28]:

df_processed.dropna(inplace=True)


In [29]:

df_processed['date'] = pd.to_datetime(df_processed['date'])
df_processed['year_built'] = df_processed['yr_built']
df_processed['year_renovated'] = df_processed['yr_renovated']
df_processed['date_year'] = df_processed['date'].dt.year
df_processed['date_month'] = df_processed['date'].dt.month
df_processed['date_day'] = df_processed['date'].dt.day

df_processed.drop(columns=['date'], axis=1, inplace=True)
df_processed['waterfront'] = df_processed['waterfront'].astype('category').cat.codes
df_processed['view'] = df_processed['view'].astype('category').cat.codes
df_processed['condition'] = df_processed['condition'].astype('category').cat.codes



In [30]:
X = df_processed.drop(['price'], axis=1)
y = df_processed['price']


train test split and SVM

In [31]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


svm_regressor = SVR(kernel='linear')
svm_regressor.fit(X_train_scaled, y_train)


y_pred = svm_regressor.predict(X_test_scaled)
print("Predictions:", y_pred)


Predictions: [477946.6311583  458325.02824692 461291.05231208 463605.87231113
 455336.81254439 463329.50081865 466658.36363455 460190.4162429
 461305.69985184 462580.41190884 459205.60634302 459202.17795081
 462316.72971075 489547.43248157 462230.66015052 469551.57667656
 456978.71135903 463707.99691022 475163.57143239 461023.28956659
 465973.11634378 465327.36735993 457689.64563943 461497.65671901
 468370.97714147 469844.68389474 457852.69465906 462322.58765002
 459719.4812847  456646.518577   466323.50393991 465601.88057522
 461500.65415807 462005.37264762 459001.46714484 471016.74151555
 463496.72039233 456927.32529085 457139.46087647 458039.28860854
 464656.80407614 459579.44600326 463400.81000363 469311.2706727
 462470.13116869 462020.12297619 467098.12135517 453817.52620793
 464827.25611298 463432.62496108 478826.41795002 454855.64178595
 459888.79744355 469315.51590054 458387.05593306 463951.4161078
 464030.84987283 463964.32040797 458348.14166735 474215.01836854
 467625.3987006