In [150]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm

from math import ceil
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectPercentile as SP
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm import tqdm

In [105]:
df = pd.read_csv('train.csv', index_col = 'id')

X_test = pd.read_csv('test.csv', index_col='id')

# Dropping the rows which have missing prediction values
df = df.dropna(axis=0, subset=['price_doc'])

y_train = df.price_doc
X_train = df.drop(['price_doc'], axis = 1)

In [107]:
def null_values(X_train, X_test):
    null_train = X_train.isnull().sum()
    null_test = X_test.isnull().sum()
    
    df_dict = {'Train Data': null_train,
                'Test Data': null_test}
    null_df = pd.DataFrame(df_dict)
    
    null_df = null_df.loc[(null_df['Train Data']!=0) | (null_df['Test Data']!=0)]
    null_df = null_df.sort_values(by=['Train Data','Test Data'],ascending=False)
    
    return null_df

In [108]:
null_vals = null_values(X_train, X_test)
null_vals

Unnamed: 0,Train Data,Test Data
hospital_beds_raion,14441,3418
build_year,13605,1049
state,13559,694
cafe_sum_500_min_price_avg,13281,3159
cafe_sum_500_max_price_avg,13281,3159
cafe_avg_price_500,13281,3159
max_floor,9572,0
material,9572,0
num_room,9572,0
kitch_sq,9572,0


In [109]:
drop_cols = null_vals.index.values.tolist()

In [110]:
df = df.drop(columns=drop_cols)

In [118]:
categorical_features = list(df.select_dtypes('object').columns)

# Casting to category datatype
for name in categorical_features:
    df[name] = df[name].astype("category")
    
    # Add a None category for missing values
    if "None" not in df[name].cat.categories:
        df[name] = df[name].cat.add_categories("None")

In [119]:
# Label encoding for categoricals
for colname in df.select_dtypes(["category"]):
    df[colname] = df[colname].cat.codes

In [120]:
df.dtypes.value_counts()

int64      156
float64     67
int8        13
int16        2
dtype: int64

In [121]:
y_train = df.price_doc
X_train = df.drop(['price_doc'], axis = 1)

In [124]:
def make_mi_scores(X, y):
    X = X.copy()
    # All discrete features should now have integer dtypes
    # discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X.select_dtypes('number'), y, random_state=0)
    mi_scores = pd.DataFrame(mi_scores.round(2), columns=["MI_Scores"], index=X.select_dtypes('number').columns)
    return mi_scores

In [125]:
mi_scores = make_mi_scores(X_train, y_train)
linear_corr = pd.DataFrame(X_train.corrwith(y_train).round(2), columns=['Lin_Correlation'])

corr_with_price = pd.concat([mi_scores, linear_corr], axis=1)
corr_with_price = corr_with_price.sort_values('MI_Scores',ascending=False)

def feature_selector(i, data):

    selected_features = []

    for row in corr_with_price.head(i).index:
        print(row)
        selected_features.append(''+row)
    X_model = pd.DataFrame(data).loc[:,selected_features]
    
    return X_model

In [126]:
corr_with_price

Unnamed: 0,MI_Scores,Lin_Correlation
full_sq,0.71,0.34
sub_area,0.59,-0.09
0_17_male,0.58,0.14
0_13_all,0.58,0.14
work_all,0.58,0.14
young_female,0.58,0.14
young_all,0.58,0.14
0_6_female,0.58,0.13
7_14_male,0.58,0.14
7_14_female,0.58,0.14


In [127]:
top = sum(corr_with_price.MI_Scores>0.1)

In [128]:
X_model = feature_selector(top, X_train)

full_sq
sub_area
0_17_male
0_13_all
work_all
young_female
young_all
0_6_female
7_14_male
7_14_female
work_female
0_17_female
work_male
0_13_male
raion_popul
0_13_female
0_6_male
16_29_male
young_male
female_f
full_all
ekder_all
ekder_female
0_6_all
ekder_male
7_14_all
children_preschool
0_17_all
children_school
male_f
green_zone_part
16_29_all
area_m
16_29_female
indust_part
ID_railroad_station_avto
ID_metro
trc_sqm_5000
sadovoe_km
ttk_km
kremlin_km
zd_vokzaly_avto_km
bulvar_ring_km
workplaces_km
preschool_km
basketball_km
thermal_power_plant_km
oil_chemistry_km
cafe_count_5000
detention_facility_km
stadium_km
school_km
trc_count_5000
nuclear_reactor_km
catering_km
kindergarten_km
power_transmission_line_km
exhibition_km
metro_km_avto
cafe_count_3000
sport_count_5000
university_km
sport_count_3000
cafe_count_5000_price_1000
radiation_km
big_church_km
office_km
swim_pool_km
incineration_km
office_sqm_5000
metro_min_avto
fitness_km
ID_big_road2
shopping_centers_km
hospice_morgue_km
publi

In [153]:
VIF = df.copy()


for i in tqdm(range(300)):
    X = VIF[list(X_model.columns)]
    vif_info = pd.DataFrame()
    vif_info['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif_info['Column'] = X.columns
    vif_info.reset_index(ascending=False, inplace=True)
    vif_info.reset_index(inplace=True)
    vif_info.drop(columns=['index'], inplace=True)
    vif = vif_info['VIF'][0]
    print(vif)
    drop_var = vif_info['Column'][0]
    if vif>5:
        X_model.drop(columns=[drop_var], inplace=True)
    if vif<=8:
        break

  vif = 1. / (1. - r_squared_i)
  0%|                                                                                          | 0/300 [07:13<?, ?it/s]


TypeError: reset_index() got an unexpected keyword argument 'ascending'