# 0. Create Dataset

In [1]:
import warnings
import re
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

In [2]:
# Read both raw inputs from the working directory; they are joined on
# "timestamp" in the merge step that follows.
train, macro = (pd.read_csv(csv_path) for csv_path in ("train.csv", "macro.csv"))

## 0.1 Merge train and macro

Combine on timestamp, keeping all observations from train.

In [3]:
# Left join: every row of train is kept; macro columns are attached where the
# timestamp matches and are NaN otherwise.
comb = pd.merge(train, macro, how="left", on="timestamp")

## 0.2 Variable selection

Investigate columns with fewer missing values.

In [4]:
# Number of missing values per column of the merged frame.
# To inspect the full listing without truncation, display train_nas inside
# pd.option_context('display.max_rows', None).
train_nas = comb.isnull().sum(axis=0)

Subset dataframe to only our chosen 39 explanatory variables and the target variable.

In [5]:
# The 39 explanatory variables, followed by the target (price_doc).
# Column order is kept exactly as chosen, since it determines the export layout.
selected_columns = [
    "full_sq", "life_sq", "floor", "num_room", "state", "product_type",
    "raion_popul", "green_zone_part", "indust_part", "children_preschool",
    "children_school", "healthcare_centers_raion", "university_top_20_raion",
    "culture_objects_top_25_raion", "shopping_centers_raion", "oil_chemistry_raion",
    "radiation_raion", "railroad_terminal_raion", "big_market_raion",
    "nuclear_reactor_raion", "detention_facility_raion", "work_all", "ekder_all",
    "park_km", "public_transport_station_km", "big_road1_km", "fitness_km",
    "big_church_count_5000", "mosque_count_5000", "cafe_avg_price_5000",
    "office_count_5000", "ecology", "salary", "cpi", "usdrub", "mortgage_rate",
    "unemployment", "bandwidth_sports", "rent_price_2room_eco", "price_doc",
]

# Take an explicit copy: df is modified in place throughout the cleaning
# section, and those writes must target an independent frame rather than a
# selection of comb (which would otherwise trigger SettingWithCopy warnings).
df = comb[selected_columns].copy()

## 0.3 Cleaning & dealing with missing values

### life_sq, full_sq

In [6]:
# Boolean masks for the implausible square-meter patterns examined here.
# Comparisons against NaN are False, so missing values never match.
tiny_life = df["life_sq"] < 2
tiny_full = df["full_sq"] < 2
life_over_full = df["life_sq"] > df["full_sq"]

ls2 = int(tiny_life.sum())
ls2fs = int((tiny_life & tiny_full).sum())

# Rows whose living area exceeds the full area; all statistics below are
# computed over this subset only.
oversized = df.loc[life_over_full, ["life_sq", "full_sq"]]
over_life, over_full = oversized["life_sq"], oversized["full_sq"]

comp_fs = len(oversized)
comp_fsm, comp_fsm_s = over_life.mean(), over_life.std()
comp_fsm2, comp_fsm2_s = over_full.mean(), over_full.std()
min1, max1 = over_life.min(), over_life.max()
min2, max2 = over_full.min(), over_full.max()

# print() stringifies each argument with str(); sep="" reproduces plain concatenation.
print("- ", ls2, " observations with 0 or 1 living square meters", sep="")
print("- ", ls2fs, " of these have full square meters of 0 or 1\n", sep="")
print("- ", comp_fs, " observations with life_sq > full_sq", sep="")
print("- ", "these have life_sq mean = ", comp_fsm, ", std = ", comp_fsm_s,
      ", range [", min1, ",", max1, "]", sep="")
print("- ", "full_sq mean = ", comp_fsm2, ", std = ", comp_fsm2_s,
      ", range [", min2, ",", max2, "]", sep="")
print()

- 435 observations with 0 or 1 living square meters
- 21 of these have full square meters of 0 or 1

- 37 observations with life_sq > full_sq
- these have life_sq mean = 348.86486486486484, std = 1215.8504376718324, range [38.0,7478.0]
- full_sq mean = 44.45945945945946, std = 26.494857566657675, range [0,84]



There are 435 observations with 0 or 1 living square meters, only 21 of which have similarly small full square meters. Since these data are on sales of houses, it's probably safe to assume that a value of <= 1 square meters of living space is a mistake.
For the 414 observations with a reasonable full_sq value but 0 or 1 for life_sq, we set life_sq to NaN to later fill in with a more helpful estimate of life_sq.

There are also 37 observations where the living area (life_sq) is greater than the full area (full_sq). Each of these has a full_sq value of less than 100 square meters, while life_sq values average around 350. For these, we set life_sq to NaN to later fill in with a more helpful estimate; the exception is rows where full_sq is below 10, where full_sq is the implausible value and is set to NaN instead.

Then, double check that we don't have any rows with NaN for both full_sq and life_sq.

In [7]:
# Each rule is evaluated against the frame as left by the previous rule,
# so the order of these four statements matters.

# life_sq below 2 next to a full_sq above 20: life_sq is the bad value.
df.loc[(df["life_sq"] < 2) & (df["full_sq"] > 20), "life_sq"] = np.nan

# life_sq above 10 next to a full_sq below 2: full_sq is the bad value.
df.loc[(df["life_sq"] > 10) & (df["full_sq"] < 2), "full_sq"] = np.nan

# life_sq exceeds a full_sq that is itself below 10: drop full_sq.
df.loc[(df["life_sq"] > df["full_sq"]) & (df["full_sq"] < 10), "full_sq"] = np.nan

# Any remaining row where life_sq exceeds full_sq: drop life_sq.
df.loc[df["life_sq"] > df["full_sq"], "life_sq"] = np.nan

In [8]:
# Exactly 21 rows should still have a life_sq or full_sq below 2 at this point.
assert ((df["life_sq"] < 2) | (df["full_sq"] < 2)).sum() == 21

In [9]:
# The proportional fill below needs at least one of the two values per row.
assert not (df["life_sq"].isnull() & df["full_sq"].isnull()).any()

Get average proportion of life_sq / full_sq to help fill in values.

In [10]:
# Row-wise share of the full area that is living area.
sq_ratio = df["life_sq"] / df["full_sq"]

# `mean` is reused by the fill step that follows; std and corr are printed for reference only.
mean, std = sq_ratio.mean(), sq_ratio.std()
corr = df["life_sq"].corr(df["full_sq"])
print(mean, std, corr)

0.6445236350818558 0.1663612277314424 0.4423223378652508


Fill null values of life_sq with average proportion * full_sq and fill null values of full_sq with life_sq / average proportion.

In [11]:
# Estimate a missing life_sq from full_sq using the average proportion.
life_from_full = df["full_sq"] * mean
df["life_sq"] = df["life_sq"].fillna(life_from_full)

# Estimate a missing full_sq from life_sq (life_sq is the already-filled column here).
full_from_life = df["life_sq"] / mean
df["full_sq"] = df["full_sq"].fillna(full_from_life)

In [12]:
# Rows where either area is still below 2 get both values blanked out,
# so that the next step can fill them with the column means.
too_small = (df["life_sq"] < 2) | (df["full_sq"] < 2)
df.loc[too_small, ["life_sq", "full_sq"]] = np.nan

In [13]:
# Whatever is still missing falls back to the column mean.
for sq_col in ("life_sq", "full_sq"):
    df[sq_col] = df[sq_col].fillna(df[sq_col].mean())

### floor, cafe_avg_price_5000

In [14]:
# Missing-value counts for the two columns handled in this section.
for checked_col in ("floor", "cafe_avg_price_5000"):
    print(df[checked_col].isna().sum())

167
297


Since there are fewer than 300 missing values (less than 1% of observations) for each of floor and average cafe price, we simply fill in the NaNs with the column mean.

In [15]:
# Column means are computed over the non-missing values (pandas skips NaN by default).
floor_mean = df["floor"].mean()
cafe_price_mean = df["cafe_avg_price_5000"].mean()

df["floor"] = df["floor"].fillna(floor_mean)
df["cafe_avg_price_5000"] = df["cafe_avg_price_5000"].fillna(cafe_price_mean)

### product_type

Map product type to a binary variable, where 1 means the property was an investment.

In [16]:
# Any label not listed here becomes NaN under .map(); the final check would catch that.
product_type_codes = {"OwnerOccupier": 0, "Investment": 1}
df["product_type"] = df["product_type"].map(product_type_codes)

### num_room

In [17]:
# Number of rows with no recorded room count.
df["num_room"].isna().sum()

9572

There are more missing values for num_room - almost 1/3 of the dataset - so it is preferable to avoid using a simple mean to fill in. Since num_room is very likely related to size of living space, we choose to forward-fill in null values for num_room, sorting by life_sq.

In [18]:
# Impute num_room from neighbouring rows in life_sq order.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in
# pandas 2.1 and no longer accepted by later releases.

# Pass 1: ascending life_sq; each gap takes the last known room count above it.
df = df.sort_values(by="life_sq")
df["num_room"] = df["num_room"].ffill()

# Pass 2: descending life_sq; fills rows that pass 1 could not reach because
# they came before the first known value.
df = df.sort_values(by="life_sq", ascending=False)
df["num_room"] = df["num_room"].ffill()

# Restore the original row order.
df = df.sort_index()

### state

In [19]:
# Summary of the state column: missing count, distinct values, and the outlier 33.
state_col = df["state"]
print(state_col.isnull().sum(), " missing values", sep="")
print("values included: ", state_col.unique(), sep="")
print(int((state_col == 33).sum()), " observations with state=33", sep="")

13559 missing values
values included: [nan  3.  1.  2.  4. 33.]
1 observations with state=33


State seems to be on a scale of 1-4. There is one value of 33, which seems to be a mistake, so we will set this to NaN. There are lots of missing values for state - a little under 1/2 of the dataset. Because of this, it would likely hurt us to try to estimate these values. However, the condition of the house is a very helpful feature for the houses that do have it recorded. In order to keep this helpful information and not dilute it with estimated values, we designate a value of 0 to represent an unknown state.

In [20]:
# Treat the stray value 33 as missing, then encode every missing state as 0 ("unknown").
df["state"] = df["state"].mask(df["state"] == 33).fillna(0)

### ecology

In [21]:
# Distinct labels present in the ecology column, in order of first appearance.
pd.unique(df["ecology"])

array(['good', 'excellent', 'poor', 'satisfactory', 'no data'],
      dtype=object)

Convert ecology (ordinal) to numeric. Use 0 for 'no data'.

In [22]:
# Ordinal encoding; 'no data' is kept as its own level 0.
ecology_codes = {
    "no data": 0,
    "poor": 1,
    "satisfactory": 2,
    "good": 3,
    "excellent": 4,
}
df["ecology"] = df["ecology"].map(ecology_codes)

### oil_chemistry_raion, radiation_raion, railroad_terminal_raion, big_market_raion, nuclear_reactor_raion, detention_facility_raion

Map values from no/yes to 0/1.

In [23]:
# Columns recorded as 'yes'/'no' strings.
to_fix = [
    "oil_chemistry_raion",
    "radiation_raion",
    "railroad_terminal_raion",
    "big_market_raion",
    "nuclear_reactor_raion",
    "detention_facility_raion",
]
yes_no_codes = {"yes": 1, "no": 0}

# Replace each flag column with its 0/1 encoding; assign() keeps the column order.
df = df.assign(**{flag_col: df[flag_col].map(yes_no_codes) for flag_col in to_fix})

## Final check & export

Ensure columns have no missing values and are numeric.

In [24]:
# Collect every offending column first, so that a failure names all of them
# instead of stopping at the first one without saying which it was.
cols_with_nulls = [col for col in df.columns if df[col].isnull().sum() != 0]
non_numeric_cols = [
    col for col in df.columns
    if not (df[col].dtype == np.float64 or df[col].dtype == np.int64)
]

assert not cols_with_nulls, "columns still contain missing values: " + str(cols_with_nulls)
assert not non_numeric_cols, "columns are not float64/int64: " + str(non_numeric_cols)

In [25]:
# Write the cleaned frame to disk. The row index is written as the first
# column (to_csv default), so readers should account for it.
df.to_csv(path_or_buf="data.csv", encoding="utf-8")