In [1]:
import numpy as np
from sklearn import preprocessing
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))
import os
#import xgboost as xgb
#import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
import json
import ast
import eli5
from functools import reduce
import warnings
from sklearn.metrics import mean_squared_error
from scipy import stats
from math import sqrt
from lightgbm import plot_tree
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import STATUS_OK
from hyperopt.pyll.stochastic import sample
import time
import gc
import tensorflow as tf
from tensorflow.python.client import device_lib

device_lib.list_local_devices()
warnings.filterwarnings('ignore')


In [12]:
train = pd.read_csv('dataset-0510/train.csv')
test  = pd.read_csv('dataset-0510/test.csv')
data = pd.concat([train, test], axis=0)
gc.collect()
data.drop([i for i in data.columns if 'index' in i and np.sum(data[i]) == 70000], axis = 1, inplace = True)
data.drop(['village', 'doc_rate', 'master_rate', 'bachelor_rate', 'highschool_rate', 'jobschool_rate', 'junior_rate'], axis = 1, inplace = True)
train = data[:60000]
test = data[60000:]
#train.drop(['village'], axis =1, inplace = True)
#test.drop(['village'], axis =1, inplace = True)

In [27]:
#categorical feature to one-hot
def hit_score(preds, train_data):
    trues  = train_data.get_label()
    trues = np.expm1(trues)
    preds = np.expm1(preds)
    scores = (np.absolute(preds - trues) / trues) > 0.1
    hit_error = np.sum(scores) / train_data.num_data()
    return 'hit_error', round(scores), False

def one_hot(train, test, categorical_features):
    data = pd.concat([train, test], axis=0)
    for i in categorical_features:
        data = data.join(pd.get_dummies(data[i], prefix = i))
        data.drop(i, axis = 1, inplace =True)
    train = data[:60000]
    test  = data[60000:]
    return train, test

In [5]:
def display_importances(feature_importance_df):
    cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

In [6]:
def Submission(Ids, preds):
    file_name = datetime.datetime.today().strftime('%m-%d-%H-%M')
    submission = pd.DataFrame({'building_id' : Ids, 'total_price' : preds})
    if not os.path.isdir('Submission'):
        os.makedirs('Submission')
    submission.to_csv('Submission/' + file_name + '.csv', index= False)

In [25]:
def lgb_model(split_num, train, test, stratified = False, if_one_hot = True):
    

    category_cols = ['building_material','building_type','building_use','parking_way']
    
    if stratified:
        kf = StratifiedKFold(n_splits = split_num, random_state = 42, shuffle = True)
    else :
        kf = KFold(n_splits = split_num, random_state=42, shuffle=True)
    train['total_price_log'] = np.log1p(train['total_price'])
    feature_importance_df = pd.DataFrame()
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    
    param ={
        'n_estimators': 5, 'max_depth' : -1, 'num_leaves' :30,         
        'objective': 'regression',   'metric': 'rmse',   
        'learning_rate': 0.01,      'boosting': 'gbdt',     'min_data_in_leaf': 10,
        'feature_fraction': 0.9,    'bagging_freq':1,       'bagging_fraction': 0.8,     'importance_type': 'gain',
        'lambda_l1': 0.2,  'subsample': .8,   'colsample_bytree': .9, 'device' : 'gpu', 'num_thread' : 2
    }
    features = [i for i in train.columns if i not in ['total_price_log', 'total_price', 'city', 'building_id']]
    print(features)
    for fold_, (trn_idx, val_idx) in enumerate(kf.split(train[features].values,train['total_price_log'].values)):
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label= train['total_price_log'].iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx][features], label= train['total_price_log'].iloc[val_idx])
        
        clf = lgb.train(params= param, train_set= trn_data, valid_sets= [trn_data, val_data], verbose_eval=100, early_stopping_rounds= 10000, categorical_feature=category_cols, feval=hit_score)
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration = clf.best_iteration)
        predictions += clf.predict(test[features], num_iteration = clf.best_iteration) / kf.n_splits
        
        y   = np.expm1(train['total_price_log'].iloc[val_idx])
        yhat = np.expm1(oof[val_idx])
        Hit_score = np.sum([1 for i in np.abs((y - yhat) / y)  if i <= 0.1 ])
        print('fold {} hit_score : {}'.format(fold_ + 1, round(Hit_score, 4) /len(train.iloc[val_idx]) * 10000))
        print('-'*30)
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature']    = features
        fold_importance_df['importance'] = np.log1p(clf.feature_importance(importance_type='gain', iteration=clf.best_iteration))
        fold_importance_df['fold']       = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    print('CV scrore : {}'.format(sqrt(mean_squared_error(train['total_price_log'], oof))))
    print('-'*30)
    y = np.expm1(train['total_price_log']) 
    yhat = np.expm1(oof)
    Hit_score = np.sum([1 for i in np.abs((y - yhat) / y)  if i <= 0.1 ])
    print('Hit rate : {}'.format(round(Hit_score, 4) /len(train) * 10000))
    
    display_importances(feature_importance_df)
    return predictions, round(Hit_score, 4) /len(train) * 10000, oof


In [8]:
def feature_process(df):
    
    #Impute missing value
    #df['village_income_median'] = df.groupby(['city', 'town'])['village_income_median'].transform(lambda x : x .fillna(x.median())))
    
    #floor
    df.loc[df['txn_floor'].isna(), 'department'] = 1 
    df.loc[df['txn_floor'].notna(), 'department'] = 0
    df.loc[df['txn_floor'].isna(), 'txn_floor'] = df.loc[df['txn_floor'].isna(), 'total_floor'] 
    df['avg_height_floor'] = df['txn_floor'] / df['total_floor'] 
    df['avg_height_floor'].fillna(0, inplace = True)
    
    #location
    df['location_2'] = df.apply(lambda x : int(str(x['town']) + str(x['village'])), axis=1)
    df['location_2'] = df['location_2'].astype('category')
    df['town'] = df['town'].astype('category')
    
    #date
    df['day_between_txn_complete'] = df['txn_dt'] - df['building_complete_dt']
    df['year_between_txn_complete'] = round(df['day_between_txn_complete'] / 365)
    
    #area
    #df['land*bulid_area'] = df['land_area'] * df ['building_area']
    df['land/bulid_area'] = df['land_area'] / df ['building_area']
    df['house_area'] = df['building_area'] - df['parking_area']
    df['house_rate']  = df['building_area'] / df['house_area']
    
    #parking
    df['miss_parking_area'] = 0
    df['miss_parking_price'] = 0
    df.loc[df['parking_area'].isna(), 'miss_parking_area'] = 1
    df.loc[df['parking_price'].isna(), 'miss_parking_price'] = 1
    df['parking_price_every_area'] = df['parking_price'] / df['parking_area']
    df['parking_way'] = df['parking_way'].astype('category')
    df['parking_area'].fillna(0, inplace =True)
    df['parking_price'].fillna(0, inplace =True)
    
    #population

    
    #building   #building_type = 4 is house
    df['building_type'] = df['building_type'].astype('category')
    df['building_use'] = df['building_use'].astype('category')
    df['building_material'] = df['building_material'].astype('category')
    #df = df.join(pd.get_dummies(df['building_type'], prefix = 'building_type'))
    
    #MIN
    
    MIN_cols = [i for i in df.columns if '_MIN' in i]
    df['min_cat'] = 0
    for col in [i for i in df.columns if 'MIN' in i]:
        df['min_cat'] = df.apply(lambda x : col if x[col] == min(x[MIN_cols]) else x['min_cat'], axis=1)
    le = preprocessing.LabelEncoder()
    df['min_cat'] = le.fit_transform(df['min_cat'])
    
    
    df['MIN_dis_plus'] = reduce(lambda x,y: x + y, [df[i] for i in [i for i in df.columns if '_MIN' in i]])
    df['MIN_dis'] = df.apply(lambda x : x[MIN_cols].min(), axis = 1)
    df['MAX_dis'] = df.apply(lambda x : x[MIN_cols].max(), axis = 1)
    df['Std_dis'] = df.apply(lambda x : x[MIN_cols].std(), axis = 1)
    df['Median_dis'] = df.apply(lambda x : x[MIN_cols].median(), axis = 1)
    df['Diff_dis'] = df['MAX_dis'] - df['MIN_dis']
    
    #10 50 100 250 500 1000 5000 10000
    df['diff_500_50'] = df['N_500'] - df['N_50']
    df['diff_1000_500'] = df['N_1000'] - df['N_500']
    df['diff_5000_1000'] = df['N_5000'] - df['N_1000']
    df['diff_10000_5000'] = df['N_10000'] - df['N_5000']
    df['rate_500_50'] = df['N_500'] / df['N_50']
    df['rate_1000_500'] = df['N_1000'] / df['N_500']
    df['rate_5000_1000'] = df['N_5000'] / df['N_1000']
    df['rate_10000_5000'] = df['N_10000'] / df['N_5000']
    
    
    All_10    = [i for i in df.columns if i.endswith('_10') and 'index' not in i and 'N' not in i]
    All_50    = [i for i in df.columns if i.endswith('_50') and 'index' not in i and 'N' not in i]
    All_100   = [i for i in df.columns if i.endswith('_100') and 'index' not in i and 'N' not in i]
    All_250   = [i for i in df.columns if i.endswith('_250') and 'index' not in i and 'N' not in i]
    All_500   = [i for i in df.columns if i.endswith('_500') and 'index' not in i and 'N' not in i]
    All_1000  = [i for i in df.columns if i.endswith('_1000') and 'index' not in i and 'N' not in i]
    All_5000  = [i for i in df.columns if i.endswith('_5000') and 'index' not in i and 'N' not in i]
    All_10000 = [i for i in df.columns if i.endswith('_10000') and 'index' not in i and 'N' not in i]
    df['All_10'] = reduce(lambda x, y: x + y, [df[i] for i in All_10])
    df['All_50'] = reduce(lambda x, y: x + y, [df[i] for i in All_50])
    df['All_100'] = reduce(lambda x, y: x + y, [df[i] for i in All_100])
    df['All_250'] = reduce(lambda x, y: x + y, [df[i] for i in All_250])
    df['All_500'] = reduce(lambda x, y: x + y, [df[i] for i in All_500])
    df['All_1000'] = reduce(lambda x, y: x + y, [df[i] for i in All_1000])
    df['All_5000'] = reduce(lambda x, y: x + y, [df[i] for i in All_5000])
    df['All_10000'] = reduce(lambda x, y: x + y, [df[i] for i in All_10000])
    
    '''
    for i,j in  zip([All_10, All_50, All_100, All_250, All_500, All_1000, All_5000], [All_50, All_100, All_250, All_500, All_1000, All_5000, All_10000]):
        for order in range(len(i)):
            df[j[order] + '_' + i[order] + '_rate'] = df[j[order]] / df[i[order]]
    '''
    #interection
    inter_cols = ['building_type', 'parking_way', 'building_use', 'building_material']
    for i in range(4):
        for j in range(4):
            if j > i:
                df['inter_btw_' + inter_cols[i] + '_' + inter_cols[j]] = df.apply(lambda x : str(x[inter_cols[i]]) + str(x[inter_cols[j]]), axis=1)
                df['inter_btw_' + inter_cols[i] + '_' + inter_cols[j]] = df['inter_btw_' + inter_cols[i] + '_' + inter_cols[j]].astype('category')
    

    
    #groupby encoding
    
    category_cols  = ['building_material','building_type','building_use','parking_way','location_2']
    numerical_cols = ['building_area', 'land_area', 'day_between_txn_complete', 'txn_dt', 'building_complete_dt', 'house_area', 'house_rate']
    statistics = ['mean', 'median', 'max', 'min']
    for category in category_cols:
        for numerical in numerical_cols:
            for stat in statistics:
                df[numerical + '_' + stat + '_gb_' + category] = df.groupby([category])[numerical].transform(stat)
                df[numerical + '_diff_' + stat + '_gb_' + category] = df[numerical] - df[numerical + '_' + stat + '_gb_' + category]

    category_cols =['town', 'location_2']
    count_cols = ['building_type', 'building_use', 'building_material', 'parking_way']
    for category in category_cols:
        for count_col in count_cols:
            df['size_gb_' + category + '_' + count_col] = df.groupby([category, count_col])[count_col].transform('size')
    
    #Polynomail feature
    
    #useless cols
    df.drop(['village'], axis = 1, inplace = True)
    df.drop(['doc_rate', 'master_rate', 'bachelor_rate', 'highschool_rate', 'jobschool_rate', 'junior_rate'], axis = 1, inplace= True)
    print([i for i in df.columns if 'index' in i and np.sum(df[i]) == 60000])
    df.drop([i for i in df.columns if 'index' in i and np.sum(df[i]) == 60000], axis = 1, inplace = True)
    return df

In [11]:
begin = time.time()
Final_data = feature_process(data)
train = Final_data[:60000]
test = Final_data[60000:]
gc.collect()
(time.time() - begin) / 60 

[]


14.821091544628143

In [9]:
len(train.columns), len(Final_data), len(data)

NameError: name 'Final_data' is not defined

In [10]:
[i for i in train.columns if train[i].dtype not in ['int', 'float', 'O']]

['III_10',
 'III_100',
 'III_1000',
 'III_10000',
 'III_250',
 'III_50',
 'III_500',
 'III_5000',
 'III_index_1000',
 'III_index_50',
 'III_index_500',
 'II_10',
 'II_100',
 'II_1000',
 'II_10000',
 'II_250',
 'II_50',
 'II_500',
 'II_5000',
 'II_index_1000',
 'II_index_50',
 'II_index_500',
 'IV_10',
 'IV_100',
 'IV_1000',
 'IV_10000',
 'IV_250',
 'IV_50',
 'IV_500',
 'IV_5000',
 'IV_index_1000',
 'IV_index_50',
 'IV_index_500',
 'IV_index_5000',
 'IX_10',
 'IX_100',
 'IX_1000',
 'IX_10000',
 'IX_250',
 'IX_50',
 'IX_500',
 'IX_5000',
 'IX_index_1000',
 'IX_index_50',
 'IX_index_500',
 'IX_index_5000',
 'I_10',
 'I_100',
 'I_1000',
 'I_10000',
 'I_250',
 'I_50',
 'I_500',
 'I_5000',
 'I_index_1000',
 'I_index_50',
 'I_index_500',
 'N_1000',
 'N_10000',
 'N_50',
 'N_500',
 'N_5000',
 'VIII_10',
 'VIII_100',
 'VIII_1000',
 'VIII_10000',
 'VIII_250',
 'VIII_50',
 'VIII_500',
 'VIII_5000',
 'VIII_index_1000',
 'VIII_index_50',
 'VIII_index_500',
 'VII_10',
 'VII_100',
 'VII_1000',
 'VII_1

In [13]:
len(train.columns)

206

In [28]:
begin = time.time()
avg_hit_rate = 0
prediction_df = pd.DataFrame()
hit_score_list = []
train_num_list = []
city_df_list   = []
oof_list       = []

for city in train['city'].unique():
    temp_train = train[train['city'] == city]
    temp_test  = test[test['city'] == city]
    temp_train.drop(['city'], axis =1, inplace =True)
    temp_test.drop(['city'], axis =1, inplace =True)
    
    target_df = temp_train.groupby(['town']).agg({'building_area' : ['mean', 'median'], 'land_area' : ['mean', 'median'], 'total_price' : ['mean', 'median']}).reset_index()
    target_df.columns = [i[0] + '_' + i[1]  if i[1] != '' else i[0] for i in target_df.columns.tolist()]
    target_df['price_land_rate_median'] = target_df['total_price_median'] / target_df['land_area_median']
    target_df['price_building_rate_median'] = target_df['total_price_median'] / target_df['building_area_median']
    target_df['price_land_rate_mean'] = target_df['total_price_mean'] / target_df['land_area_mean']
    target_df['price_building_rate_mean'] = target_df['total_price_mean'] / target_df['building_area_mean']

    combine_cols = ['town', 'price_land_rate_median', 'price_building_rate_median', 'price_land_rate_mean', 'price_building_rate_mean']
    temp_train = pd.merge(temp_train, target_df[combine_cols], on =['town'], how='left')
    temp_test = pd.merge(temp_test, target_df[combine_cols], on =['town'], how='left')
    
    preds, hit_score, oof = lgb_model(5, temp_train, temp_test, if_one_hot=False)
    temp = pd.DataFrame({'building_id' : temp_test['building_id'], 'total_price' : preds})
    prediction_df = pd.concat([prediction_df, temp], axis=0)
    
    print('City : {}'.format(city))
    print('Train_num: {}'.format(len(temp_train)))
    print('Test_num: {}'.format(len(temp_test)))
    print('-'*1000)
    hit_score_list.append(hit_score)
    train_num_list.append(len(temp_train))
    city_df_list.append(city)
    oof_list.append(oof)
    avg_hit_rate += hit_score / 60000 * len(temp_train)

Result_df = pd.DataFrame({'City' : city_df_list,
                        'Train_num' : train_num_list,
                        'Hit_score' : hit_score_list})

print('Avg hit_score : {}'.format(avg_hit_rate))
print('總共花：{} 分'.format((time.time() - begin) / 60))
print('現在時間 ： {}'.format(datetime.datetime.today().strftime('%m-%d-%H-%M')))
Submission(prediction_df['building_id'], np.expm1(prediction_df['total_price']))

['III_10', 'III_100', 'III_1000', 'III_10000', 'III_250', 'III_50', 'III_500', 'III_5000', 'III_MIN', 'III_index_1000', 'III_index_50', 'III_index_500', 'II_10', 'II_100', 'II_1000', 'II_10000', 'II_250', 'II_50', 'II_500', 'II_5000', 'II_MIN', 'II_index_1000', 'II_index_50', 'II_index_500', 'IV_10', 'IV_100', 'IV_1000', 'IV_10000', 'IV_250', 'IV_50', 'IV_500', 'IV_5000', 'IV_MIN', 'IV_index_1000', 'IV_index_50', 'IV_index_500', 'IV_index_5000', 'IX_10', 'IX_100', 'IX_1000', 'IX_10000', 'IX_250', 'IX_50', 'IX_500', 'IX_5000', 'IX_MIN', 'IX_index_1000', 'IX_index_50', 'IX_index_500', 'IX_index_5000', 'I_10', 'I_100', 'I_1000', 'I_10000', 'I_250', 'I_50', 'I_500', 'I_5000', 'I_MIN', 'I_index_1000', 'I_index_50', 'I_index_500', 'N_1000', 'N_10000', 'N_50', 'N_500', 'N_5000', 'VIII_10', 'VIII_100', 'VIII_1000', 'VIII_10000', 'VIII_250', 'VIII_50', 'VIII_500', 'VIII_5000', 'VIII_MIN', 'VIII_index_1000', 'VIII_index_50', 'VIII_index_500', 'VII_10', 'VII_100', 'VII_1000', 'VII_10000', 'VII_25

TypeError: cannot convert the series to <class 'float'>

In [None]:
#Record
#local hit_score : 5600, public : 5773    Handle target-encoding on twon

In [69]:
train['preds'] = 0
for pred, city in zip(oof_list, train['city'].unique()):
    train.loc[train['city'] == city, 'preds'] = np.expm1(pred)

In [81]:
train['If_get'] = train.apply(lambda x : 1 if np.abs(x['preds'] - x['total_price']) / x['total_price'] <= 0.1 else 0, axis=1)
gc.collect()

In [None]:
train.gr

In [105]:
for i in ['building_type', 'building_use', 'building_material', 'parking_way']:
    print(train.groupby(i)['If_get'].mean())
    print('-'*30)

building_type
0    0.624562
1    0.606153
2    0.576541
3    0.573652
4    0.425230
Name: If_get, dtype: float64
------------------------------
building_use
0     0.547425
1     0.391659
2     0.567411
3     0.428571
4     0.451613
5     0.521553
6     0.621011
7     0.459459
8     0.544379
10    0.542936
Name: If_get, dtype: float64
------------------------------
building_material
1     0.432360
3     0.571429
4     0.000000
5     1.000000
7     0.392000
8     0.572145
9     0.497006
10    0.569264
11    0.250000
Name: If_get, dtype: float64
------------------------------
parking_way
0    0.607434
1    0.591965
2    0.541344
Name: If_get, dtype: float64
------------------------------


In [122]:
train['mim_cat_dis'] = train[[i for i in train.columns if 'MIN' in i]].min(axis = 1)

In [125]:
[i for i in train.columns if 'MIN' in i]

['I_MIN',
 'II_MIN',
 'III_MIN',
 'IV_MIN',
 'V_MIN',
 'VI_MIN',
 'VII_MIN',
 'VIII_MIN',
 'IX_MIN',
 'X_MIN',
 'XI_MIN',
 'XII_MIN',
 'XIII_MIN',
 'XIV_MIN']

In [126]:
train['min_cat'] = 0
for col in [i for i in train.columns if 'MIN' in i]:
    train['min_cat'] = train.apply(lambda x : col if x['mim_cat_dis'] == x[col] else x['min_cat'], axis=1)

In [131]:
train[[i for i in train.columns if 'MIN' in i]].head()

Unnamed: 0,I_MIN,II_MIN,III_MIN,IV_MIN,V_MIN,VI_MIN,VII_MIN,VIII_MIN,IX_MIN,X_MIN,XI_MIN,XII_MIN,XIII_MIN,XIV_MIN
0,84.745794,102.554396,42.635964,287.735804,205.295496,270.838262,116.075571,69.868801,68.178741,64.774668,132.498164,63.478618,112.582703,34.469803
1,85.529684,71.994648,7.157464,805.037288,65.829031,719.071571,6.864786,65.829031,102.299486,272.553558,125.670173,23.717447,32.370655,40.073573
2,82.676961,110.761328,86.589412,62.428191,58.042563,29.607781,40.43989,32.523085,75.268914,108.270812,164.05233,35.796546,174.954657,54.462081
3,590.458945,279.365544,45.508654,982.3527,423.658555,421.260244,157.922588,73.978254,47.969413,42.380415,283.620244,43.654182,1247.796459,99.628966
4,603.643541,19.028439,546.047664,1435.928054,689.539284,2727.723917,669.730617,332.171857,1354.750025,1165.273099,607.948642,124.956094,1766.374491,124.131236


In [128]:
train['min_cat'].head()

0    XIV_MIN
1    VII_MIN
2     VI_MIN
3      X_MIN
4     II_MIN
Name: min_cat, dtype: object

In [15]:
train.groupby(['city', 'town'])['total_price'].median()

city  town
3     0       1.917940e+06
      17      1.395402e+06
      18      1.562556e+06
      27      1.506032e+06
      34      1.796452e+06
      122     1.631427e+06
      169     1.506032e+06
5     20      9.540510e+05
      31      6.794688e+05
      38      9.870323e+05
      63      9.169484e+05
      130     1.528546e+06
      134     1.093640e+06
      146     1.796452e+06
      152     7.945632e+05
      155     6.437717e+05
      177     1.796452e+06
      181     6.895745e+05
      182     5.638755e+05
      186     8.940765e+05
      223     6.956315e+05
      225     1.235682e+06
      234     1.924694e+06
      259     7.621663e+05
      279     1.634954e+06
      280     6.794439e+05
      300     1.235682e+06
      304     1.059229e+06
      312     1.429050e+06
      327     6.012235e+05
                  ...     
21    46      1.353245e+06
      48      2.432520e+06
      49      2.432520e+06
      102     1.341328e+06
      105     1.163936e+06
      110     1.9

In [17]:
train.groupby(['city', 'town','village'])['total_price'].mean().reset_index()

Unnamed: 0,city,town,village,total_price
0,3,0,394,2.978010e+06
1,3,0,417,5.796243e+06
2,3,0,593,9.156955e+06
3,3,0,794,3.204437e+06
4,3,0,795,1.385097e+06
5,3,0,1045,1.699048e+06
6,3,0,1060,2.384268e+06
7,3,0,1887,1.796788e+06
8,3,0,1988,2.311094e+06
9,3,0,1995,1.354498e+06
