In [242]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV

In [243]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [244]:
df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [245]:
df.nunique()

Id               10000
DistrictId         205
Rooms                9
Square           10000
LifeSquare        7887
KitchenSquare       58
Floor               33
HouseFloor          44
HouseYear           97
Ecology_1          129
Ecology_2            2
Ecology_3            2
Social_1            51
Social_2           142
Social_3            30
Healthcare_1        79
Helthcare_2          7
Shops_1             16
Shops_2              2
Price            10000
dtype: int64

In [246]:
df.isna().sum()

Id                  0
DistrictId          0
Rooms               0
Square              0
LifeSquare       2113
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Ecology_1           0
Ecology_2           0
Ecology_3           0
Social_1            0
Social_2            0
Social_3            0
Healthcare_1     4798
Helthcare_2         0
Shops_1             0
Shops_2             0
Price               0
dtype: int64

In [247]:
# Ecology_1, Ecology_2, Ecology_3, Social_1, Social_2, Social_3, Helthcare_2, Shops_1, Shops_2

In [248]:
df = pd.concat([df, pd.get_dummies(df['Ecology_2'])], axis=1)
df = pd.concat([df, pd.get_dummies(df['Ecology_3'])], axis=1)
df = pd.concat([df, pd.get_dummies(df['Shops_2'])], axis=1)

In [249]:
train_df = df[df['Healthcare_1'].notnull()]
train_df.isna().sum()

Id                 0
DistrictId         0
Rooms              0
Square             0
LifeSquare       745
KitchenSquare      0
Floor              0
HouseFloor         0
HouseYear          0
Ecology_1          0
Ecology_2          0
Ecology_3          0
Social_1           0
Social_2           0
Social_3           0
Healthcare_1       0
Helthcare_2        0
Shops_1            0
Shops_2            0
Price              0
A                  0
B                  0
A                  0
B                  0
A                  0
B                  0
dtype: int64

In [250]:
# features = ['Ecology_1', 'Social_1', 'Shops_1']
features = ['Ecology_1', 'Social_1', 'Shops_1', 'Social_2', 'Social_3', 'Helthcare_2', 'Shops_1', 'A', 'B']
target = 'Healthcare_1'

In [251]:
x = train_df[features]
y = train_df[target]

In [252]:
x

Unnamed: 0,Ecology_1,Social_1,Shops_1,Social_2,Social_3,Helthcare_2,Shops_1.1,A,A.1,A.2,B,B.1,B.2
1,0.000070,46,16,10309,1,1,16,0,0,0,1,1,1
2,0.049637,34,3,7759,0,1,3,0,0,0,1,1,1
3,0.437885,23,5,5735,3,0,5,0,0,0,1,1,1
4,0.012339,35,4,5776,1,2,4,0,0,0,1,1,1
5,0.309479,35,6,7715,4,0,6,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,0.036270,6,1,1318,1,1,1,0,0,0,1,1,1
9991,0.265089,37,2,5288,0,3,2,0,0,0,1,1,1
9995,0.135650,46,11,7960,6,3,11,0,0,0,1,1,1
9998,0.307467,30,5,5048,9,2,5,0,1,0,1,0,1


In [253]:
y

1        240.0
2        229.0
3       1084.0
4       2078.0
5        990.0
         ...  
9990     200.0
9991    1937.0
9995     350.0
9998     325.0
9999      30.0
Name: Healthcare_1, Length: 5202, dtype: float64

In [254]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [255]:
parameters = {
    'n_estimators': [200, 250, 300],
    'max_depth': np.arange(10, 15),
}

clf = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=parameters,
    scoring='r2',
    cv=5,
)

clf.fit(x, y)
clf.best_params_

{'max_depth': 14, 'n_estimators': 300}

In [261]:
model = RandomForestRegressor(max_depth=14,
                             n_estimators=300,
                             random_state=55)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
r2(y_test, y_pred)

0.9995762091277541

In [257]:
y_pred_train = model.predict(x_train)
r2(y_train, y_pred_train)

0.9999338877048215

In [258]:
def fill_healthcare_nan(df):
    features = ['Ecology_1', 'Social_1', 'Shops_1', 'Social_2', 'Social_3', 'Helthcare_2', 'Shops_1', 'A', 'B']
    target = 'Healthcare_1'
    
    train_df = df[df['Healthcare_1'].notnull()]
    
    x = train_df[features]
    y = train_df[target]
    model = RandomForestRegressor(max_depth=14,
                             n_estimators=300,
                             random_state=55)
    model.fit(x, y)
    
    pred_df = df[df['Healthcare_1'].isna()][features]
    y_pred = model.predict(pred_df)
    
    df.loc[df['Healthcare_1'].isna(), target] = y_pred
    return df

In [259]:
df = pd.read_csv('train.csv')
df = pd.concat([df, pd.get_dummies(df['Ecology_2'])], axis=1)
df = pd.concat([df, pd.get_dummies(df['Ecology_3'])], axis=1)
df = pd.concat([df, pd.get_dummies(df['Shops_2'])], axis=1)

pred = fill_healthcare_nan(df)
pred

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Shops_2,Price,A,B,A.1,B.1,A.2,B.2
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.089040,...,0,11,B,184966.930730,0,1,0,1,0,1
1,15053,41,3.0,65.683640,40.049543,8.0,7,9.0,1978,0.000070,...,1,16,B,300009.450063,0,1,0,1,0,1
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,...,1,3,B,220925.908524,0,1,0,1,0,1
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,...,0,5,B,175616.227217,0,1,0,1,0,1
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,...,2,4,B,150226.531644,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,77,32,2.0,50.401785,30.476203,5.0,6,5.0,1968,0.135650,...,3,11,B,196684.316040,0,1,0,1,0,1
9996,6159,18,1.0,41.521546,20.539216,9.0,13,13.0,2000,0.000000,...,0,5,A,189050.289571,0,1,0,1,1,0
9997,5123,27,1.0,47.939008,,1.0,12,16.0,2015,0.072158,...,0,0,A,159143.805370,0,1,0,1,1,0
9998,5400,75,2.0,43.602562,33.840147,8.0,1,5.0,1961,0.307467,...,2,5,B,181595.339808,0,1,1,0,0,1


In [260]:
df.isna().sum()

Id                  0
DistrictId          0
Rooms               0
Square              0
LifeSquare       2113
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Ecology_1           0
Ecology_2           0
Ecology_3           0
Social_1            0
Social_2            0
Social_3            0
Healthcare_1        0
Helthcare_2         0
Shops_1             0
Shops_2             0
Price               0
A                   0
B                   0
A                   0
B                   0
A                   0
B                   0
dtype: int64