In [591]:
import json
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.feature_selection import VarianceThreshold

DISTRICT_NAMES_RU = ["Печерский", "Соломенский", "Деснянский", "Шевченковский", "Голосеевский", "Дарницкий", "Подольский", "Святошинский", "Оболонский", "Днепровский"] 

DISTRICT_NAMES_UA = ["Печерський", "Солом'янський", "Деснянський", "Шевченківський", "Голосіївський", "Дарницький", "Подільський", "Святошинський", "Оболонський", "Дніпровський"] 

METRO_RU = [u'Академгородок',u'Арсенальная',u'Берестейская',u'Бориспольская',u'Васильковская',u'Вокзальная',u'Выдубичи',u'Вырлица',u'Выставочный центр (ВДНХ)',u'Героев Днепра',u'Гидропарк',u'Голосеевская',u'Дарница',u'Дворец Украина',u'Дворец спорта',u'Демеевская',u'Днепр',u'Дорогожичи',u'Дружбы народов',u'Житомирская',u'Золотые Ворота',u'Ипподром',u'Кловская',u'Контрактовая площадь',u'Красный хутор',u'Крещатик',u'Левобережная',u'Лесная',u'Лукьяновская',u'Лыбидская',u'Майдан Незалежности',u'Минская',u'Нивки',u'Оболонь',u'Олимпийская (Республиканский стадион)',u'Осокорки',u'Петровка',u'Печерская',u'Площадь Льва Толстого',u'Позняки',u'Политехнический институт',u'Почтовая площадь',u'Святошино',u'Славутич',u'Сырец',u'Тараса Шевченко',u'Театральная',u'Теремки',u'Университет',u'Харьковская',u'Черниговская',u'Шулявская']

WALLS_RU = [u'Газоблок',u'Дерево/кирпич',u'Керамзито-бетон',u'Кирпич',u'Комбинир.',u'Монолит',u'Павильон (стекло)',u'Панель',u'Пенобетон',u'Пеноблок',u'Пеноблок/кирпич']
   
ROOMS_ARRANGEMENT = [u'\u0420', u'\u0421', u'\u0421-\u0440']

WC_TYPES = [ u'\u0420\u0430\u0437\u0434.', u'\u0421\u043e\u0432\u043c\u0435\u0441\u0442\u043d\u044b\u0439']

STATES = [u'\u0415\u0432\u0440\u043e\u0440\u0435\u043c\u043e\u043d\u0442', u'\u041a\u043e\u0441\u043c. \u0440\u0435\u043c\u043e\u043d\u0442', u'\u0420\u0435\u043c\u043e\u043d\u0442']

FLOOR_RU = [u'\u0414\u0435\u0440\u0435\u0432\u043e',  u'\u0414\u043e\u0441\u043a\u0430', u'\u041a\u043e\u0432\u0440\u043e\u043b\u0438\u043d',
 u'\u041a\u043e\u043c\u0431\u0438\u043d\u0438\u0440.',  u'\u041b\u0430\u043c\u0438\u043d\u0430\u0442',  u'\u041b\u0438\u043d\u043e\u043b\u0435\u0443\u043c',
 u'\u041f\u0430\u0440\u043a\u0435\u0442',  u'\u041f\u0430\u0440\u043a\u0435\u0442\u043d\u0430\u044f \u0434\u043e\u0441\u043a\u0430',
 u'\u041f\u043b\u0438\u0442\u043a\u0430', u'\u0421\u0442\u044f\u0436\u043a\u0430']

X_RU = [u'\u0414\u0435\u0440\u0435\u0432\u043e',
 u'\u0416\u0435\u043b\u0435\u0437\u043e\u0431\u0435\u0442\u043e\u043d',
 u'\u041a\u043e\u043c\u0431\u0438\u043d\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0435']

DETAILS = ["square", 
           "kitchen_square",
           "live_square", 
           "rooms", 
#           "floor_count", 
           "floor", 
           'wc_count'
          ] #, 'levels']

NOMINAL_DETAILS = ['material', 'rooms_arrangement','state', 'wc_type'] #, 'floor_material', 'x_material']

YES_OR_NO_DETAILS = ['parking', 'telephone', 'refrigerator', 'tvset', 'balcon']

FIELDS = DETAILS + NOMINAL_DETAILS + YES_OR_NO_DETAILS

stop_price = [u'222502230000', u'2777780000', u'2777750000', u'1036680000', u'300000000', u'225000000', u'700000000', u'16750000']

FEATURES = DETAILS +  DISTRICT_NAMES_RU+ METRO_RU + WALLS_RU + ROOMS_ARRANGEMENT + STATES + WC_TYPES + FLOOR_RU + X_RU

USD = 24.5
EUR =  26.9

In [2]:
def get_list_of(data, field):
    smth = []
    for d in data:
        try:
            smth += [d[field]]
        except:
            pass
    smth = list(set(smth))
    smth.sort()
    for m in smth:
        print m
    return smth

In [3]:
def district_vector(district_name):
    vec = [0]*len(DISTRICT_NAMES_RU)
    try:
        name = district_name.split()[0].encode('utf-8')
        vec[DISTRICT_NAMES_RU.index(name)] = 1
    except ValueError:
        try:
            vec[DISTRICT_NAMES_UA.index(name)] = 1
        except: 
            pass
    except:
        pass
    return vec

In [4]:
def metro_vector(metro_list):
    vec = [0]*len(METRO_RU)
    for metro_name in metro_list:
        try:
            vec[METRO_RU.index(metro_name)] = 1
        except:
            pass
    return vec

def nominal_vector(name, VALUES):
    vec = [0]*len(VALUES)
    try:
        vec[VALUES.index(name)] = 1
    except:
        pass
    return vec

def yes_or_no(ans):
    if ans.encode('utf-8') == 'да':
        return 1
    if ans.encode('utf-8') == 'Есть':
        return 1
    return 0

In [594]:
def extract_data(flat):
    X = []
    for d in DETAILS:
        try:
            X += [float(flat[d])]
            if d == 'live_square':
                if X[-1] > X[0]:
                    X[-1] = 0
            if d == 'kitchen_square':
                if X[-1] > X[0]:
                    X[-1] = 0
            if d == 'rooms':
                if X[-1] > 10:
                    X[-1] = 0
            if d == 'square':
                if X[-1] > 900:
                    X[-1] = 0
        except:
            X += [0]
    try:
        X += district_vector(flat['district'])
    except KeyError:
        X += [0] * len(DISTRICT_NAMES_RU)
    try:
        X += metro_vector(flat['metro'])
        X += [len(flat['metro'])]
    except KeyError:
        X += [0] * (len(METRO_RU)+1)
#        X += [0] 
    try:
        X += nominal_vector(flat['material'],WALLS_RU)
    except KeyError:
        X += [0] * len(WALLS_RU)
    try:
        X += nominal_vector(flat['rooms_arrangement'],ROOMS_ARRANGEMENT)
    except KeyError:
        X += [0] * len(ROOMS_ARRANGEMENT)
    try:
        X += nominal_vector(flat['state'],STATES)
    except KeyError:
        X += [0] * len(STATES)
    try:
        X += nominal_vector(flat['wc_type'],WC_TYPES)
    except KeyError:
        X += [0] * len(WC_TYPES)
    try:
        X += nominal_vector(flat['floor_material'],FLOOR_RU)
    except KeyError:
        X += [0] * len(FLOOR_RU)
    try:
        X += nominal_vector(flat['x_material'],X_RU)
    except KeyError:
        X += [0] * len(X_RU)
    for d in YES_OR_NO_DETAILS:
        try:
            X += [yes_or_no(flat[d])]
        except:
            X += [0]
    y=0
    try:
        y = float(flat['price'])
    except:
        print flat['code']
    try:
        if flat['currency'].lower().encode() == 'usd':
            y *= USD
        if flat['currency'].lower().encode() == 'eur':
            y *= EUR
    except:
        pass
    return [X, y]

In [6]:
def process_with(X,y, info=False, short=False, return_short = False):
    train_X, test_X, train_y, test_y = cross_validation.train_test_split(X, y, test_size = 0.18, random_state = 3)
    regr = linear_model.LinearRegression()
    regr.fit(train_X, train_y)
    if info:
        print "Total: %d, train: %d, test: %d" %(len(X), len(train_X), len(test_X))
        print("Residual sum of squares: %.2f"% np.mean((regr.predict(test_X) - test_y) ** 2))
        print("Train absolute: %.2f"% np.mean(abs(regr.predict(train_X) - train_y)))
        print("Test absolute: %.2f"% np.mean(abs(regr.predict(test_X) - test_y)))
        print("Absolute to mean: %.2f%%"% (np.mean(abs(regr.predict(test_X) - test_y))/np.mean(test_y)*100))
        print('Train variance score: %.2f' % regr.score(train_X, train_y))
        print('Test variance score: %.2f' % regr.score(test_X, test_y))
    if short:
        print np.mean(abs(regr.predict(test_X) - test_y)),regr.score(test_X, test_y)
    if return_short:
        return np.mean(abs(regr.predict(test_X) - test_y)),regr.score(test_X, test_y)
    return regr

In [7]:
#remove data without price
#field - str, data - list of dicts
def remove_with_no(field, t_data):
    c = 1
    while c!=0:
        c=0
        for d in t_data:
            try:
                d[field]
            except:
                #print d
                c+=1
                t_data.remove(d)
        print c
    return t_data

In [8]:
def get_X_y_from(t_data):
    t_cleaned_data = [extract_data(i) for i in t_data]
    X = np.array([i[0] for i in t_cleaned_data])
    y = np.array([i[1] for i in t_cleaned_data])
    return X,y

## Load data & clean data

In [846]:
#with open('./spider_1000realty/flats.json') as data_file: #last scray
#with open('./spider_1000realty/results_new.json') as data_file: #first scrapy
with open('./filtered_data_1.json') as data_file:
#with open('./filtered_realty_data_no_required.json') as data_file: #last nice
    json_data = json.load(data_file)
print len(json_data)

realty_data = [i for i in json_data if i['city'].strip().encode('utf-8') in ["Киев","Київ"]]
print len(realty_data)

3717
3717


In [847]:
with open('places_kiev.json') as data_file:
    json_data = json.load(data_file)
print len(json_data)

#filter city Kiyv
metrovka_data = [i for i in json_data if i['city'].encode('utf-8') in ["Киев","Київ"]]
print len(metrovka_data)

10951
10905


In [848]:
print len(realty_data)
data = remove_with_no('price', realty_data)
#data = remove_with_no('district', data)
print len(realty_data)

3717
0
3717


In [849]:
print len(metrovka_data)
data = remove_with_no('price', metrovka_data)
#data = remove_with_no('district', data)
print len(metrovka_data)

10905
0
10905


In [850]:
#merging
realty_data += metrovka_data

In [851]:
realty_data_1 = [i for i in realty_data if i['type'] == 1]
realty_data_2 = [i for i in realty_data if i['type'] == 2]

In [852]:
metrovka_data_1 = [i for i in metrovka_data if i['type'] == 1]
metrovka_data_2 = [i for i in metrovka_data if i['type'] == 2]

## First models

In [271]:
realty_X_1, realty_y_1 = get_X_y_from(realty_data_1)

In [853]:
realty_X_2, realty_y_2 = get_X_y_from(realty_data_2)

In [854]:
regr = process_with(realty_X_2, realty_y_2, info=True)

Total: 12548, train: 10289, test: 2259
Residual sum of squares: 12654825037293.76
Train absolute: 1264209.87
Test absolute: 1342232.87
Absolute to mean: 47.23%
Train variance score: 0.33
Test variance score: 0.33


In [855]:
metrovka_X_2, metrovka_y_2 = get_X_y_from(metrovka_data_2)

In [856]:
print("Residual sum of squares: %.2f"% np.mean((regr.predict(metrovka_X_2) - metrovka_y_2) ** 2))
print("Test absolute: %.2f"% np.mean(abs(regr.predict(metrovka_X_2) - metrovka_y_2)))
print("Absolute to mean: %.2f%%"% (np.mean(abs(regr.predict(metrovka_X_2) - metrovka_y_2))/np.mean(metrovka_y_2)*100))
print('Test variance score: %.2f' % regr.score(metrovka_X_2, metrovka_y_2))

Residual sum of squares: 15657268618080.16
Test absolute: 1382814.14
Absolute to mean: 48.83%
Test variance score: 0.27


In [857]:
print "Features sorted by their score:"
f = sorted(zip(map(lambda x: round(x, 4), regr.coef_), FEATURES), reverse=True)
print regr.intercept_
print len(FEATURES)
for i in f:
    print "%4f \t %s" %(i[0], i[1])

Features sorted by their score:
565206.8141
100
3039713.119400 	 Вырлица
2184877.339300 	 Святошино
1922827.584700 	 Майдан Незалежности
1753155.767100 	 Университет
1261370.232500 	 Дружбы народов
1220193.675800 	 Лесная
1021497.279100 	 Монолит
987153.040500 	 Героев Днепра
922023.184800 	 Черниговская
901879.006000 	 Левобережная
840277.181500 	 Минская
838441.156100 	 Бориспольская
684044.835600 	 Житомирская
675503.646500 	 Оболонь
568484.038200 	 rooms
486234.052300 	 Печерский
457664.154600 	 Выставочный центр (ВДНХ)
417476.997100 	 Дарница
407802.227900 	 Олимпийская (Республиканский стадион)
397171.088800 	 Демеевская
294112.195200 	 Дерево
292152.710600 	 Линолеум
273545.957400 	 Пеноблок
272614.742900 	 Паркет
192425.370600 	 Академгородок
160430.943600 	 Печерская
140939.072100 	 Контрактовая площадь
132940.472700 	 Тараса Шевченко
129114.735600 	 Славутич
127692.544100 	 С-р
127567.933600 	 Дерево/кирпич
95886.739900 	 Арсенальная
94818.459600 	 Ипподром
83984.127600 	 Дво

In [707]:
#!-----extract frequency------
counts = [0] * len(FIELDS)
for d in realty_data:
    for k in d.keys():
        try:
            counts[FIELDS.index(k)]+=1
        except:
            pass

together = []
for i in range(0, len(FIELDS)):
    together += [[counts[i], FIELDS[i]]]

together.sort(reverse=True)

for i in range(0, len(FIELDS)):
    together[i] += [round(together[i][0]/float(len(realty_data))*100,2)]

for i in together:
    print i
#!---------------------------

[147641, 'rooms', 99.99]
[147567, 'square', 99.94]
[128487, 'floor', 87.02]
[115651, 'live_square', 78.33]
[113328, 'kitchen_square', 76.75]
[71534, 'material', 48.45]
[55644, 'rooms_arrangement', 37.69]
[41876, 'parking', 28.36]
[38385, 'state', 26.0]
[38072, 'wc_type', 25.79]
[24408, 'wc_count', 16.53]
[22313, 'balcon', 15.11]
[17782, 'telephone', 12.04]
[13490, 'refrigerator', 9.14]
[13265, 'tvset', 8.98]


## Plots

In [783]:
#show dots from all data
plt.scatter([np.linalg.norm(i) for i in realty_X_2], realty_y_2, color='black')
#plt.plot([np.linalg.norm(i) for i in X_2], regr.predict(X_2), color='blue', linewidth=1)
#plt.scatter(np.linalg.norm(X_2[ind]), y_2[ind], color='blue')
#plt.scatter([np.linalg.norm(i) for i in X_2[inx]], y_2[inx], color='blue')
plt.xticks(())
plt.yticks(())
plt.show()

##  Good vs bad

In [837]:
good = []
bad = []
for i,z in enumerate(zip(metrovka_X_2, metrovka_y_2)):
    err_procent = np.mean(abs(regr.predict(z[0]) - z[1])/z[1]*100)
    if err_procent > 30:
        bad += [[metrovka_data_2[i], regr.predict(z[0])[0], err_procent]]
    else:
        good += [[metrovka_data_2[i], regr.predict(z[0])[0], err_procent]]



In [838]:
print len(metrovka_data_2)
print len(bad), len(good)
print len(bad)/float(len(metrovka_data_2)), len(good)/float(len(metrovka_data_2))
print np.mean(map(lambda x: x[2], bad)), np.mean(map(lambda x: x[2], good))
print np.median(map(lambda x: x[2], bad)), np.median(map(lambda x: x[2], good))
#print bad[0]
#for b in bad:
#    b[-2]=b[-2][0]
#with open('bad.json', 'w') as outfile:
#    json.dump(bad, outfile)

7043
3025 4018
0.429504472526 0.570495527474
119.330600524 14.3140904688
48.0692510907 14.1076802514


## Manual checking

In [841]:
ind = 1126
#for i in metrovka_data_2[ind].values():
#    print i
#print metrovka_data_2[ind]
[x,y] = extract_data(metrovka_data_2[ind])
print metrovka_data_2[ind]['rooms']
print
print("%.2f \t diff"% np.mean((regr.predict(x) - y)))
print y, "real"
print regr.predict(x)[0]
print("%.2f"% np.mean(abs(regr.predict(x) - y)/y*100))

3

579621.98 	 diff
1592500.0 real
2172121.98357
36.40




In [842]:
[x,y] =extract_data({
 u'district': u'Печерский',
 u'price': u'3000000.00',
 u'rooms': u'3',
 u'square': u'80',
 u'type': 2,
})
print
print("%.2f \t diff"% np.mean((regr.predict(x) - y)))
print y, "real"
print regr.predict(x)[0]
print("%.2f"% np.mean(abs(regr.predict(x) - y)/y*100))


1177989.71 	 diff
3000000.0 real
4177989.70759
39.27




## Making some fields requiered

In [819]:
#remove_data_with_no_district_in_X
def make_field_required(t_data):
    ind = []
    X = get_X_y_from(t_data)[0]
    print len(X)
    for i, x in enumerate(X):
        if np.count_nonzero(x[6:16])==0:
            ind += [i]
    print len(ind)
    ind = sorted(ind, reverse=True)
    for i in ind:
        t=t_data.pop(i)
    return t_data

In [820]:
realty_data_2=make_field_required(realty_data_2)

12649
2


In [821]:
metrovka_data_2=make_field_required(metrovka_data_2)

7049
6


## Delete data with big diffs

In [22]:
def mean_diff(t_data, normalize = True, plots = True, info=True):
    X, y = get_X_y_from(t_data)
    print "Before:"
    before = process_with(X, y, short=True)
    
    if normalize:
        min_max_scaler = preprocessing.MinMaxScaler()
        X_n = min_max_scaler.fit_transform(X)
        y_n = min_max_scaler.fit_transform(y)
    
    pts = np.float32(zip(map(lambda x: np.linalg.norm(x), X_n),y_n))
    
    mean_pt = np.mean(pts,axis=0)
    mean_x,mean_y = mean_pt
    diffs = np.apply_along_axis(np.linalg.norm,1,pts - mean_pt)
    mean_diff = np.mean(diffs)

    filtered = np.float32([pts[i] for i,diff in enumerate(diffs) if diff <= 2*mean_diff])
    filtered_mean = np.mean(filtered,axis=0)
    
    t_data_f = [t_data[i] for i,diff in enumerate(diffs) if diff <= 2*mean_diff]
    X_f, y_f = get_X_y_from(t_data_f)
    print "After:"
    after = process_with(X_f,y_f, short=True)
    
    plt.plot(pts[:,0],pts[:,1],'ro')
    plt.plot(mean_x,mean_y,'w^')
    plt.plot(filtered[:,0],filtered[:,1],'bo')
    plt.plot(filtered_mean[0],filtered_mean[1],'w^')
    plt.show()
    return t_data_f

In [829]:
print len(realty_data_2)
print
realty_data_2 = mean_diff(realty_data_2)
print
print len(realty_data_2)

12647

Before:
789041.93135 0.67715056398




After:
686204.167695 0.641735307171

12647


## Delete "outlaws"

In [23]:
def get_ind_by_y(X, y, ind_max = True):
    if ind_max:
        return np.argmax(y)
    else:
        return np.argmin(y)

In [29]:
def get_ind_by_x_norm(X, y, ind_max=True):
    norms = [np.linalg.norm(i) for i in X]
    sorted_norm = [np.linalg.norm(i) for i in X]
    sorted_norm.sort()
    if ind_max:
        ind = norms.index(sorted_norm[-1])
    else:
        ind = norms.index(sorted_norm[1])
    return ind

In [30]:
def get_ind_by_y_to_x_norm(X, y, ind_max=True):
    y_to_norm_x = [y[i]/np.linalg.norm(X[i]) for i in range(0,len(X))]
    if ind_max:
        return np.argmax(y_to_norm_x)
    else:
        return np.argmin(y_to_norm_x)

In [788]:
def cut_data_by(t_data, get_ind_cut_function):
    previous_sum = -1000000
    previous_score = -100000
    X, y = get_X_y_from(t_data)
    [current_sum, current_score] = process_with(X, y, return_short=True)
    temp={}
    while current_score > previous_score or current_sum < previous_sum:
        previous_score = current_score
        previous_sum = current_sum

        inx = get_ind_cut_function(X, y,ind_max=True)
        #print t_data[norms.index(sorted_norm[-1])]['code'], sorted_norm[-1]
        temp = t_data.pop(inx)
        
        X, y = get_X_y_from(t_data)
        [current_sum, current_score] = process_with(X, y, return_short=True)
        
        print current_sum, previous_sum
        print current_score, previous_score
        print "-----"
    t_data+=[temp]
    return t_data

In [826]:
print len(realty_data_2)
print
realty_data_2 = cut_data_by(realty_data_2, get_ind_by_x_norm)
print
print len(realty_data_2)

12647

810086.985084 789041.93135
0.668567093928 0.67715056398
-----

12647


In [827]:
print len(realty_data_2)
print
realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y)
print
print len(realty_data_2)

12647

810086.985084 789041.93135
0.668567093928 0.67715056398
-----

12647


In [828]:
print len(realty_data_2)
print
realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y_to_x_norm)
print
print len(realty_data_2)

12647

811516.029959 789041.93135
0.668660138324 0.67715056398
-----

12647


## Saving and loading coefs

In [844]:
coefs = map(lambda x: round(x, 4), regr.coef_)
f = zip(coefs, FEATURES)
print coefs

import pickle
with open('filtered_mixed_coefs_district_square_required.json', 'w') as outfile:
    pickle.dump(coefs, outfile)

[54552.9183, 24645.0443, 1292.4827, -405691.3997, -8659.1026, -241937.3554, 1508782.5549, -138496.1104, -686037.4957, 713466.1528, -284578.7114, -666286.1012, 47842.5358, -471063.0538, 334431.1557, -358060.9268, -347318.9708, 2852719.5323, -1102663.8831, -562889.3687, -866946.3492, 540703.1301, -386408.8624, -0.0, 269254.7878, -0.0, -0.0, -183801.6181, 411985.7626, -517795.0141, 0.0, 549772.2696, 650344.789, -1169300.7428, 2161408.4464, 319148.7831, 224589.5797, -263395.5767, -277205.7813, 1159226.7517, 0.0, 226909.4734, -25325.2569, 565113.8782, -497841.69, -1685536.817, 0.0, -0.0, -609522.0229, 114284.334, -437968.7623, -400838.0082, -201579.1093, 233729.8989, -1060980.6289, -195251.6615, -624859.4, 1153036.5549, 765192.2224, 307091.7498, -1892852.3873, 404019.4932, -0.0, -479271.8897, 2366049.9964, -642902.7235, -10679.3917, -767594.8149, 63850.7024, -363643.527, -0.0, -463446.0153, -479380.9114, -0.0, -395379.1028, 0.0, -245460.3762, -0.0, -255414.1704, -21534.6831, -220258.0537, 2

In [835]:
with open('filtered_mixed_data_district_square_required.json', 'r') as infile:
    c = pickle.load(infile)

KeyError: '['

## Save filtered data

In [845]:
with open('filtered_mixed_data_district_square_required.json', 'w') as outfile:
    json.dump(realty_data_2, outfile)
#json.dumps(data_2[0:1])