## Importing Libraries

In [2]:
import pandas as pd 
import numpy as np

## Loading the data

In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

## Importing the data

In [3]:
df = pd.read_csv("housing.csv")

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'ocean_proximity']
target = 'median_house_value'


## Data preparation

In [6]:
df[target].isnull().sum()

0

In [7]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
df.fillna(0,inplace=True)

In [9]:
df_train = df[features]
df_train.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
dtype: int64

In [10]:
df_train["rooms_per_household"] = df_train["total_rooms"]/df_train["households"]

In [11]:
df_train["bedrooms_per_room"] = df_train["total_bedrooms"]/df_train["total_rooms"]

In [12]:
df_train["population_per_household"] = df_train["population"]/df_train["households"]

In [13]:
df_train.head()    

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,6.281853,0.172096,2.181467


### Question 1: most frequent observation for ocean_proximity

In [14]:
df_train["ocean_proximity"].describe()

count         20640
unique            5
top       <1H OCEAN
freq           9136
Name: ocean_proximity, dtype: object

## Splitting the data

In [15]:
from sklearn.model_selection import train_test_split
X = df_train
y = df[target]
X_train,X_val,y_train,y_val = train_test_split(X,y, train_size=0.6,test_size=0.4,random_state=42)
X_val,X_test,y_val,y_test = train_test_split(X_val,y_val,test_size=0.5,random_state=42)

### Question 2: Creating correlation matrix

In [16]:
df.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924664,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.045967
latitude,-0.924664,1.0,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,-0.14416
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.134153
total_bedrooms,0.068082,-0.065318,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.049148
population,0.099773,-0.108785,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.02465
households,0.05531,-0.071035,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,0.065843
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.688075
median_house_value,-0.045967,-0.14416,0.105623,0.134153,0.049148,-0.02465,0.065843,0.688075,1.0


## Changing target column to binary

In [17]:
df['above_average'] = (df['median_house_value'] > df['median_house_value'].mean()).astype(int)

In [18]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,above_average
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,1
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,1
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,1
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,1


### Question 3: mutual information score for categorical variable

In [19]:
cat=[]
for c in df.columns:
    if df[c].dtype == "object":
        cat.append(c)

In [20]:
cat

['ocean_proximity']

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
 10  above_average       20640 non-null  int64  
dtypes: float64(9), int64(1), object(1)
memory usage: 1.7+ MB


In [22]:
from sklearn.metrics import mutual_info_score
def mutual_info(series):
    return mutual_info_score(series,df['median_house_value'])
round(df[cat].apply(mutual_info),2)

ocean_proximity    0.47
dtype: float64

### Qusetion 4: Training Logistic Regression

In [23]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'ocean_proximity']

target = 'above_average'

X = df[features]
y = df[target]

X_train,X_val,y_train,y_val = train_test_split(X,y, train_size=0.6,test_size=0.4,random_state=42)
X_val,X_test,y_val,y_test = train_test_split(X_val,y_val,test_size=0.5,random_state=42)

In [24]:
from sklearn.feature_extraction import DictVectorizer

train_dicts = X_train.to_dict(orient = "records")
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
val_dicts =  X_val.to_dict(orient = "records")
X_val = dv.transform(val_dicts)

In [25]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

In [26]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_val)

score =accuracy_score(y_val,y_pred)
round(score,2)

0.83

### Question 5: Finding least useful features

In [27]:
l = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'ocean_proximity']

target = 'above_average'

for i in range(len(l)):
    features = l[:]
    features.remove(l[i])
    X = df[features]
    y = df[target]

    X_train,X_val,y_train,y_val = train_test_split(X,y, train_size=0.6,test_size=0.4,random_state=42)
    X_val,X_test,y_val,y_test = train_test_split(X_val,y_val,test_size=0.5,random_state=42)
    train_dicts = X_train.to_dict(orient = "records")
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    val_dicts =  X_val.to_dict(orient = "records")
    X_val = dv.transform(val_dicts)
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    score = round(accuracy_score(y_val,y_pred),2)
    print("{} --- {}---- {}".format(l[i],score,0.83-score))

latitude --- 0.82---- 0.010000000000000009
longitude --- 0.82---- 0.010000000000000009
housing_median_age --- 0.82---- 0.010000000000000009
total_rooms --- 0.83---- 0.0
total_bedrooms --- 0.83---- 0.0
population --- 0.8---- 0.029999999999999916
households --- 0.82---- 0.010000000000000009
median_income --- 0.78---- 0.04999999999999993
ocean_proximity --- 0.81---- 0.019999999999999907


### Question 6: Using Linear Regression from scikit learn

In [29]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

df1 = pd.read_csv("housing.csv")
df1.fillna(0,inplace=True)
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'ocean_proximity']
target = 'median_house_value'


X = df1[features]
y = df1[target]
X_train,X_val,y_train,y_val = train_test_split(X,y, train_size=0.6,test_size=0.4,random_state=42)
X_val,X_test,y_val,y_test = train_test_split(X_val,y_val,test_size=0.5,random_state=42)
train_dicts = X_train.to_dict(orient = "records")
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
val_dicts =  X_val.to_dict(orient = "records")
X_val = dv.transform(val_dicts)
y_train = np.log1p(y_train.values)
y_val = np.log1p(y_val.values)
y_test = np.log1p(y_test.values)

alpha_list = [0, 0.01, 0.1, 1, 10]
for a in alpha_list:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    score = mean_squared_error(y_val, y_pred,squared=False)
    print("{}  ---  {}".format(a,score))


0  ---  0.5459873268612719
0.01  ---  0.5459873268669022
0.1  ---  0.5459873269203644
1  ---  0.5459873274578687
10  ---  0.5459873328258108


# END