In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Load the housing dataset from "housing.csv" (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv("housing.csv")

In [3]:
# Preview the first five rows to check the columns and value scales.
df.head()

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0


In [4]:
# Summarise each column's dtype and non-null count, plus total memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489 entries, 0 to 488
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   RM       489 non-null    float64
 1   LSTAT    489 non-null    float64
 2   PTRATIO  489 non-null    float64
 3   MEDV     489 non-null    float64
dtypes: float64(4)
memory usage: 15.4 KB


### We can see there are no NaN (missing) values in any column: every column has 489 non-null entries out of 489 rows.

In [5]:
# Convert the whole DataFrame (features and target) to a NumPy array.
# NOTE: the name `x` is rebound to a DataFrame of feature columns in a later
# cell (In [13]), so this array is only used by the next two cells.
x = df.to_numpy()

In [6]:
# Show the array; NumPy abbreviates the middle rows with "...".
x

array([[6.575e+00, 4.980e+00, 1.530e+01, 5.040e+05],
       [6.421e+00, 9.140e+00, 1.780e+01, 4.536e+05],
       [7.185e+00, 4.030e+00, 1.780e+01, 7.287e+05],
       ...,
       [6.976e+00, 5.640e+00, 2.100e+01, 5.019e+05],
       [6.794e+00, 6.480e+00, 2.100e+01, 4.620e+05],
       [6.030e+00, 7.880e+00, 2.100e+01, 2.499e+05]])

In [7]:
x.ndim           # number of array dimensions: 2 (rows x columns)

2

In [8]:
# Pairwise correlation matrix of all columns, taken in absolute value so that
# columns can be ranked by the strength of their relationship regardless of sign.
corr = df.corr().abs()
# Correlation strength of every column with the target MEDV.
corr["MEDV"]

RM         0.697209
LSTAT      0.760670
PTRATIO    0.519034
MEDV       1.000000
Name: MEDV, dtype: float64

### For now, I'll remove the two columns that are least correlated with our target "MEDV" (PTRATIO and RM).
### This is just to avoid dropping columns at random and to make the task more meaningful.

In [9]:
# With inplace=False, drop() returns a new DataFrame without the listed columns
# and leaves `df` itself unchanged, so this cell only shows what the result
# would look like.
df.drop(["PTRATIO", "RM"], axis = 1, inplace=False)

Unnamed: 0,LSTAT,MEDV
0,4.98,504000.0
1,9.14,453600.0
2,4.03,728700.0
3,2.94,701400.0
4,5.33,760200.0
...,...,...
484,9.67,470400.0
485,9.08,432600.0
486,5.64,501900.0
487,6.48,462000.0


To drop the columns from the original dataset, we need to set the "inplace" parameter to True, after which the code looks like this:

df.drop(["PTRATIO", "RM"], axis = 1, inplace = True)

In [10]:
# Assigning a list to df.columns replaces the names of ALL the columns at once;
# the list supplies one name per column (four names for the four columns here).
# This modifies `df` itself.
column_names = ["one", "two", "three", "four"]
df.columns = column_names
df.head()

Unnamed: 0,one,two,three,four
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0


The column names have been changed, as shown above.

In [11]:
# rename() changes only the columns named in the mapping (old name -> new name);
# here all four placeholder names are mapped back to the original names.
# The returned frame is rebound to `df` instead of using inplace=True, which
# keeps the call chainable and avoids mutating an object that earlier cells
# have already displayed.
df = df.rename(
    columns={"one": "RM", "two": "LSTAT", "three": "PTRATIO", "four": "MEDV"}
)
df.head()

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0


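Unlike assigning to `df.columns`, `df.rename` renames only the SELECTED columns listed in the mapping; here all four were mapped back to their original names.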



In [12]:
# Separate the target variable with label-based indexing: df.loc selects
# all rows (:) of the "MEDV" column and returns it as a Series.
target = df.loc[:, "MEDV"]
target.head()

0    504000.0
1    453600.0
2    728700.0
3    701400.0
4    760200.0
Name: MEDV, dtype: float64

In [13]:
# Separate the feature columns with position-based indexing: df.iloc selects
# all rows of columns 0-2 (RM, LSTAT, PTRATIO), leaving out the target MEDV.
# NOTE: this rebinds `x`, which previously held the NumPy array from In [5].
x = df.iloc[:,0:3]
x.head()

Unnamed: 0,RM,LSTAT,PTRATIO
0,6.575,4.98,15.3
1,6.421,9.14,17.8
2,7.185,4.03,17.8
3,6.998,2.94,18.7
4,7.147,5.33,18.7


In [14]:
# Render the feature DataFrame and the target Series one after the other
# in a single cell output.
display(x,target)

Unnamed: 0,RM,LSTAT,PTRATIO
0,6.575,4.98,15.3
1,6.421,9.14,17.8
2,7.185,4.03,17.8
3,6.998,2.94,18.7
4,7.147,5.33,18.7
...,...,...,...
484,6.593,9.67,21.0
485,6.120,9.08,21.0
486,6.976,5.64,21.0
487,6.794,6.48,21.0


0      504000.0
1      453600.0
2      728700.0
3      701400.0
4      760200.0
         ...   
484    470400.0
485    432600.0
486    501900.0
487    462000.0
488    249900.0
Name: MEDV, Length: 489, dtype: float64