In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.feature_selection import VarianceThreshold

DISTRICT_NAMES_RU = ["Печерский", "Соломенский", "Деснянский", "Шевченковский", "Голосеевский", "Дарницкий", "Подольский", "Святошинский", "Оболонский", "Днепровский"] 

DISTRICT_NAMES_UA = ["Печерський", "Солом'янський", "Деснянський", "Шевченківський", "Голосіївський", "Дарницький", "Подільський", "Святошинський", "Оболонський", "Дніпровський"] 

METRO_RU = [u'Академгородок',u'Арсенальная',u'Берестейская',u'Бориспольская',u'Васильковская',u'Вокзальная',u'Выдубичи',u'Вырлица',u'Выставочный центр (ВДНХ)',u'Героев Днепра',u'Гидропарк',u'Голосеевская',u'Дарница',u'Дворец Украина',u'Дворец спорта',u'Демеевская',u'Днепр',u'Дорогожичи',u'Дружбы народов',u'Житомирская',u'Золотые Ворота',u'Ипподром',u'Кловская',u'Контрактовая площадь',u'Красный хутор',u'Крещатик',u'Левобережная',u'Лесная',u'Лукьяновская',u'Лыбидская',u'Майдан Незалежности',u'Минская',u'Нивки',u'Оболонь',u'Олимпийская (Республиканский стадион)',u'Осокорки',u'Петровка',u'Печерская',u'Площадь Льва Толстого',u'Позняки',u'Политехнический институт',u'Почтовая площадь',u'Святошино',u'Славутич',u'Сырец',u'Тараса Шевченко',u'Театральная',u'Теремки',u'Университет',u'Харьковская',u'Черниговская',u'Шулявская']

WALLS_RU = [u'Газоблок',u'Дерево/кирпич',u'Керамзито-бетон',u'Кирпич',u'Комбинир.',u'Монолит',u'Павильон (стекло)',u'Панель',u'Пенобетон',u'Пеноблок',u'Пеноблок/кирпич']
   
ROOMS_ARRANGEMENT = [u'\u0420', u'\u0421', u'\u0421-\u0440']

WC_TYPES = [ u'\u0420\u0430\u0437\u0434.', u'\u0421\u043e\u0432\u043c\u0435\u0441\u0442\u043d\u044b\u0439']

STATES = [u'\u0415\u0432\u0440\u043e\u0440\u0435\u043c\u043e\u043d\u0442', u'\u041a\u043e\u0441\u043c. \u0440\u0435\u043c\u043e\u043d\u0442', u'\u0420\u0435\u043c\u043e\u043d\u0442']

FLOOR_RU = [u'\u0414\u0435\u0440\u0435\u0432\u043e',  u'\u0414\u043e\u0441\u043a\u0430', u'\u041a\u043e\u0432\u0440\u043e\u043b\u0438\u043d',
 u'\u041a\u043e\u043c\u0431\u0438\u043d\u0438\u0440.',  u'\u041b\u0430\u043c\u0438\u043d\u0430\u0442',  u'\u041b\u0438\u043d\u043e\u043b\u0435\u0443\u043c',
 u'\u041f\u0430\u0440\u043a\u0435\u0442',  u'\u041f\u0430\u0440\u043a\u0435\u0442\u043d\u0430\u044f \u0434\u043e\u0441\u043a\u0430',
 u'\u041f\u043b\u0438\u0442\u043a\u0430', u'\u0421\u0442\u044f\u0436\u043a\u0430']

X_RU = [u'\u0414\u0435\u0440\u0435\u0432\u043e',
 u'\u0416\u0435\u043b\u0435\u0437\u043e\u0431\u0435\u0442\u043e\u043d',
 u'\u041a\u043e\u043c\u0431\u0438\u043d\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0435']


In [688]:
DETAILS = ["square", 
           "kitchen_square",
           "live_square", 
           "rooms", 
#           "floor_count", 
           "floor", 
           'wc_count'
          ] #, 'levels']

NOMINAL_DETAILS = ['material', 'rooms_arrangement','state', 'wc_type'] #, 'floor_material', 'x_material']

YES_OR_NO_DETAILS = ['parking', 'telephone', 'refrigerator', 'tvset', 'balcon']

FIELDS = DETAILS + NOMINAL_DETAILS + YES_OR_NO_DETAILS

stop_price = [u'222502230000', u'2777780000', u'2777750000', u'1036680000', u'300000000', u'225000000', u'700000000', u'16750000']

FEATURES = DETAILS +  METRO_RU + WALLS_RU + ROOMS_ARRANGEMENT + STATES + WC_TYPES + FLOOR_RU + X_RU
USD = 24.5
EUR =  26.9

In [3]:
#DETAILS = ["square", "kitchen_square","live_square","rooms"]

In [4]:
def get_list_of(data, field):
    smth = []
    for d in data:
        try:
            smth += [d[field]]
        except:
            pass
    smth = list(set(smth))
    smth.sort()
    for m in smth:
        print m
    return smth

In [5]:
def district_vector(district_name):
    vec = [0]*len(DISTRICT_NAMES_RU)
    try:
        name = district_name.split()[0].encode('utf-8')
        vec[DISTRICT_NAMES_RU.index(name)] = 1
    except ValueError:
        try:
            vec[DISTRICT_NAMES_UA.index(name)] = 1
        except: 
            pass
    except:
        pass
    return vec


In [6]:
def metro_vector(metro_list):
    vec = [0]*len(METRO_RU)
    for metro_name in metro_list:
        try:
            vec[METRO_RU.index(metro_name)] = 1
        except:
            pass
    return vec

def nominal_vector(name, VALUES):
    vec = [0]*len(VALUES)
    try:
        vec[VALUES.index(name)] = 1
    except:
        pass
    return vec

def yes_or_no(ans):
    if ans.encode('utf-8') == 'да':
        return 1
    if ans.encode('utf-8') == 'Есть':
        return 1
    return 0

In [670]:
def extract_data(flat):
    X = []
    for d in DETAILS:
        try:
            X += [float(flat[d])]
            if d == 'live_square':
                if X[-1] > X[0]:
                    X[-1] = 0
            if d == 'kitchen_square':
                if X[-1] > X[0]:
                    X[-1] = 0
            if d == 'rooms':
                if X[-1] > 10:
                    X[-1] = 0
            if d == 'square':
                if X[-1] > 900:
                    X[-1] = 0
        except:
            X += [0]
#    try:
#        X += district_vector(flat['district'])
#    except KeyError:
#        X += [0] * len(DISTRICT_NAMES_RU)
    try:
        X += metro_vector(flat['metro'])
        X += [len(flat['metro'])]
    except KeyError:
        X += [0] * (len(METRO_RU)+1)
#        X += [0] 
    try:
        X += nominal_vector(flat['material'],WALLS_RU)
    except KeyError:
        X += [0] * len(WALLS_RU)
    try:
        X += nominal_vector(flat['rooms_arrangement'],ROOMS_ARRANGEMENT)
    except KeyError:
        X += [0] * len(ROOMS_ARRANGEMENT)
    try:
        X += nominal_vector(flat['state'],STATES)
    except KeyError:
        X += [0] * len(STATES)
    try:
        X += nominal_vector(flat['wc_type'],WC_TYPES)
    except KeyError:
        X += [0] * len(WC_TYPES)
    try:
        X += nominal_vector(flat['floor_material'],FLOOR_RU)
    except KeyError:
        X += [0] * len(FLOOR_RU)
    try:
        X += nominal_vector(flat['x_material'],X_RU)
    except KeyError:
        X += [0] * len(X_RU)
    for d in YES_OR_NO_DETAILS:
        try:
            X += [yes_or_no(flat[d])]
        except:
            X += [0]
    try:
        y = float(flat['price'])
    except:
        print flat
    try:
        if flat['currency'].lower().encode() == 'usd':
            y *= USD
        if flat['currency'].lower().encode() == 'eur':
            y *= EUR
    except:
        pass
    return [X, y]

In [8]:
def process_with(X,y):
    train_X, test_X, train_y, test_y = cross_validation.train_test_split(X, y, test_size = 0.18, random_state = 3)
    regr.fit(train_X, train_y)
    return [np.mean(abs(regr.predict(test_X) - test_y)),regr.score(test_X, test_y)]


Loading data

In [158]:
#with open('./spider_1000realty/flats.json') as data_file:
#with open('./spider_1000realty/results_new.json') as data_file:
with open('./filtered_data_1.json') as data_file:
    json_data = json.load(data_file)
print len(json_data)

data = [i for i in json_data if i['city'].strip().encode('utf-8') in ["Киев","Київ"]]
print len(data)

3717
3717


In [386]:
with open('places_kiev.json') as data_file:
    json_data = json.load(data_file)

#filter city Kiyv
metrovka_data = [i for i in json_data if i['city'].encode('utf-8') in ["Киев","Київ"]]
metrovka_data_1 = [i for i in metrovka_data if i['type'] == 1]
metrovka_data_2 = [i for i in metrovka_data if i['type'] == 2]

In [419]:
#remove data without price
c = 1
while c!=0:
    c=0
    for d in all_data:
        try:
            d['price']
            if d['price'] in stop_price:
                all_data.remove(d)
                c+=1
        except:
            #print d
            c+=1
            all_data.remove(d)
    print c

0


In [436]:
#remove data without district
c = 0
for d in metrovka_data_2:
    try:
        d['district']
    except:
        #print d
        c+=1
        metrovka_data_2.remove(d)
print c

0


In [453]:
ind = []
for i, x in enumerate(metrovka_X_2):
    if np.count_nonzero(x[6:16])==0:
        ind += [i]
print len(ind)
sorted(ind, reverse=True)
for i in ind:
    t=metrovka_data.pop(i)

1587


In [None]:
#remove data with no square ( rooms)
inds = [i for i,x in enumerate(all_X_2) if x[3] == 0]     
for i in inds:
    t = all_data.pop(i)
print len(inds), len(all_data_2)

all_cleaned_data_2 = [extract_data(i) for i in all_data_2]
all_X_2 = np.array([i[0] for i in all_cleaned_data_2])
all_y_2 = np.array([i[1] for i in all_cleaned_data_2])

In [651]:
#cut the data by max price
previous_sum = -1000000
previous_score = -100000
cleaned_data_2 = [extract_data(i) for i in data_2]
X_2 = np.array([i[0] for i in cleaned_data_2])
y_2 = np.array([i[1] for i in cleaned_data_2])
train_X_2, test_X_2, train_y_2, test_y_2 = cross_validation.train_test_split(X_2, y_2, test_size = 0.18, random_state = 3)
regr.fit(train_X_2, train_y_2)
[current_sum, current_score] = [np.mean(abs(regr.predict(test_X_2) - test_y_2)),regr.score(test_X_2, test_y_2)]
temp = {}
while current_score > previous_score or current_sum < previous_sum:
    previous_score = current_score
    previous_sum = current_sum
    print max(y_2), data_2[np.argmin(y_2)]['price']#, #data #_2[np.argmax(y_2)]['code']
    temp = data_2[np.argmin(y_2)]
    temp = data_2.pop(np.argmin(y_2))
    cleaned_data_2 = [extract_data(i) for i in data_2]
    X_2 = np.array([i[0] for i in cleaned_data_2])
    y_2 = np.array([i[1] for i in cleaned_data_2])
    train_X_2, test_X_2, train_y_2, test_y_2 = cross_validation.train_test_split(X_2, y_2, test_size = 0.18, random_state = 3)
    regr.fit(train_X_2, train_y_2)
    [current_sum, current_score] = [np.mean(abs(regr.predict(test_X_2) - test_y_2)),regr.score(test_X_2, test_y_2)]
    print current_sum, previous_sum
    print current_score, previous_score
    print "-----"


data_2+=[temp]

5390000.0 27300.00
489436.601043 461754.526762
0.535716370034 0.546839594559
-----


In [652]:
#cut the data by Х norm
previous_sum = -1000000
previous_score = -100000
cleaned_data_2 = [extract_data(i) for i in data_2]
X_2 = np.array([i[0] for i in cleaned_data_2])
y_2 = np.array([i[1] for i in cleaned_data_2])
train_X_2, test_X_2, train_y_2, test_y_2 = cross_validation.train_test_split(X_2, y_2, test_size = 0.18, random_state = 3)
regr.fit(train_X_2, train_y_2)
[current_sum, current_score] = [np.mean(abs(regr.predict(test_X_2) - test_y_2)),regr.score(test_X_2, test_y_2)]
temp={}
while current_score > previous_score or current_sum < previous_sum:
    norms = [np.linalg.norm(i) for i in X_2]
    sorted_norm = [np.linalg.norm(i) for i in X_2]
    sorted_norm.sort()
    previous_score = current_score
    previous_sum = current_sum
    #print data_2[norms.index(sorted_norm[-1])]['code'], sorted_norm[-1]
    temp = data_2[norms.index(sorted_norm[-1])]
    temp = data_2.pop(norms.index(sorted_norm[-1]))
    cleaned_data_2 = [extract_data(i) for i in data_2]
    X_2 = np.array([i[0] for i in cleaned_data_2])
    #X_2 = min_max_scaler.fit_transform(X_2)
    y_2 = np.array([i[1] for i in cleaned_data_2])
    train_X_2, test_X_2, train_y_2, test_y_2 = cross_validation.train_test_split(X_2, y_2, test_size = 0.18, random_state = 3)
    regr.fit(train_X_2, train_y_2)
    [current_sum, current_score] = [np.mean(abs(regr.predict(test_X_2) - test_y_2)),regr.score(test_X_2, test_y_2)]
    print current_sum, previous_sum
    print current_score, previous_score
    print "-----"
    
data_2+=[temp]

489362.093077 461754.526762
0.53583109089 0.546839594559
-----


In [653]:
#cut the data by y to Х norm
y_to_norm_x = [y_2[i]/np.linalg.norm(X_2[i]) for i in range(0,len(X_2))]
previous_sum = -1000000
previous_score = -100000
cleaned_data_2 = [extract_data(i) for i in data_2]
X_2 = np.array([i[0] for i in cleaned_data_2])
y_2 = np.array([i[1] for i in cleaned_data_2])
train_X_2, test_X_2, train_y_2, test_y_2 = cross_validation.train_test_split(X_2, y_2, test_size = 0.18, random_state = 3)
regr.fit(train_X_2, train_y_2)
[current_sum, current_score] = [np.mean(abs(regr.predict(test_X_2) - test_y_2)),regr.score(test_X_2, test_y_2)]
temp={}
while current_score >= previous_score or current_sum <= previous_sum:
    previous_score = current_score
    previous_sum = current_sum
    ind = np.argmax(y_to_norm_x)
    #print data_2[ind]['code'], y_to_norm_x[ind]
    temp = y_to_norm_x.pop(ind)
    temp = data_2.pop(ind)
    cleaned_data_2 = [extract_data(i) for i in data_2]
    X_2 = np.array([i[0] for i in cleaned_data_2])
    #X_2 = min_max_scaler.fit_transform(X_2)
    y_2 = np.array([i[1] for i in cleaned_data_2])
    train_X_2, test_X_2, train_y_2, test_y_2 = cross_validation.train_test_split(X_2, y_2, test_size = 0.18, random_state = 3)
    regr.fit(train_X_2, train_y_2)
    [current_sum, current_score] = [np.mean(abs(regr.predict(test_X_2) - test_y_2)),regr.score(test_X_2, test_y_2)]
    print current_sum, previous_sum
    print current_score, previous_score
    print "-----"
    
data_2+=[temp]

489528.099203 461754.526762
0.535406016764 0.546839594559
-----


In [None]:
#type 1
data_1 = [i for i in data if i['type'] == 1]
cleaned_data_1 = [extract_data(i) for i in data_1]
#cleaned_data_1 = [i for i in cleaned_data_1 if not(i[0][0] == 0)]
X_1 = np.array([i[0] for i in cleaned_data_1])
#X_1 = preprocessing.normalize(X_1, norm='l2')
y_1 = np.array([i[1] for i in cleaned_data_1])
train_X_1, test_X_1, train_y_1, test_y_1 = cross_validation.train_test_split(X_1, y_1, test_size = 0.23, random_state = 5)
print "Total: %d, train: %d, test: %d" %(len(X_1), len(train_X_1), len(test_X_1))

In [None]:
regr.fit(train_X_1, train_y_1)
#print('Coefficients: \n')
#for i in range(0,len(regr.coef_)):
#    print(FEATURES[i], regr.coef_[i])
print("Residual sum of squares: %.2f"
      % np.mean((regr.predict(test_X_1) - test_y_1) ** 2))
print('Variance score: %.4f' % regr.score(test_X_1, test_y_1))

In [None]:
res_1 = np.array([ [ regr.predict(test_X_1[i]), test_y_1[i],  regr.predict(test_X_1[i])/test_y_1[i]*100] for i in range(0, len(test_X_1))])
res_1 = res_1[np.argsort(res_1[:,2])]
#for r in res_1:
    #print "%d \t %d \t %d %%" % (r[0], r[1], r[2])

In [159]:
#type 2
data_2 = [i for i in data if i['type'] == 2]

In [676]:
#all_data_2 = [i for i in all_data if i['type'] == 2]
all_cleaned_data_2 = [extract_data(i) for i in all_data_2]
all_X_2 = np.array([i[0] for i in all_cleaned_data_2])
all_y_2 = np.array([i[1] for i in all_cleaned_data_2])

In [677]:
metrovka_data_2 = [i for i in metrovka_data if i['type'] == 2]
metrovka_cleaned_data_2 = [extract_data(i) for i in metrovka_data_2]
metrovka_X_2 = np.array([i[0] for i in metrovka_cleaned_data_2])
metrovka_y_2 = np.array([i[1] for i in metrovka_cleaned_data_2])

In [407]:
metrovka_X_2[0]

array([ 57.,   9.,  33.,   2.,   5.,   0.,   0.,   0.,   0.,   0.,   1.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.])

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()
X_2 = min_max_scaler.fit_transform(X_2)

In [362]:
regr = linear_model.LinearRegression()

In [671]:
cleaned_data_2 = [extract_data(i) for i in data_2]
X_2 = np.array([i[0] for i in cleaned_data_2])
y_2 = np.array([i[1] for i in cleaned_data_2])

In [513]:
data_2 = data_2 + metrovka_data_2
#X_2=np.append(X_2,metrovka_X_2,axis=0)
#y_2=np.append(y_2,metrovka_y_2,axis=0)

In [675]:
train_X_2, test_X_2, train_y_2, test_y_2 = cross_validation.train_test_split(X_2, y_2, test_size = 0.15, random_state = 3)
print "Total: %d, train: %d, test: %d" %(len(X_2), len(train_X_2), len(test_X_2))
regr.fit(train_X_2, train_y_2)
print("Residual sum of squares: %.2f"% np.mean((regr.predict(test_X_2) - test_y_2) ** 2))
print("Train absolute: %.2f"% np.mean(abs(regr.predict(train_X_2) - train_y_2)))
print("Test absolute: %.2f"% np.mean(abs(regr.predict(test_X_2) - test_y_2)))
print("Absolute to mean: %.2f%%"% (np.mean(abs(regr.predict(test_X_2) - test_y_2))/np.mean(test_y_2)*100))
print('Train variance score: %.2f' % regr.score(train_X_2, train_y_2))
print('Test variance score: %.2f' % regr.score(test_X_2, test_y_2))

Total: 4054, train: 3445, test: 609
Residual sum of squares: 494357470711.04
Train absolute: 517374.46
Test absolute: 532141.76
Absolute to mean: 26.06%
Train variance score: 0.43
Test variance score: 0.43


In [678]:
print("Residual sum of squares: %.2f"% np.mean((regr.predict(all_X_2) - all_y_2) ** 2))
print("Test absolute: %.2f"% np.mean(abs(regr.predict(all_X_2) - all_y_2)))
print("Absolute to mean: %.2f%%"% (np.mean(abs(regr.predict(all_X_2) - all_y_2))/np.mean(all_y_2)*100))
print('Test variance score: %.2f' % regr.score(all_X_2, all_y_2))

Residual sum of squares: 14502856630270.68
Test absolute: 1175507.63
Absolute to mean: 40.74%
Test variance score: 0.28


In [679]:
print("Residual sum of squares: %.2f"% np.mean((regr.predict(metrovka_X_2) - metrovka_y_2) ** 2))
print("Test absolute: %.2f"% np.mean(abs(regr.predict(metrovka_X_2) - metrovka_y_2)))
print("Absolute to mean: %.2f%%"% (np.mean(abs(regr.predict(metrovka_X_2) - metrovka_y_2))/np.mean(metrovka_y_2)*100))
print('Test variance score: %.2f' % regr.score(metrovka_X_2, metrovka_y_2))

Residual sum of squares: 16821222160708.82
Test absolute: 1327814.71
Absolute to mean: 48.54%
Test variance score: 0.10


In [680]:
ind = 895
#for i in metrovka_data_2[ind].values():
#    print i
print metrovka_data_2[ind]
[x,y] = extract_data(metrovka_data_2[ind])
print metrovka_data_2[ind]['rooms']
print
print("%.2f \t diff"% np.mean((regr.predict(x) - y)))
print y, "real"
print regr.predict(x)[0]
print("%.2f"% np.mean(abs(regr.predict(x) - y)/y*100))

{u'square': u'62', u'user_data': {u'phones': [u'+38 (044) 501-14-25'], u'name': u'\u0410\u0433\u0435\u043d\u0442\u0441\u0442\u0432\u043e \u042d\u041a\u0421\u041f\u0415\u0420\u0422 \u041d\u0435\u0434\u0432\u0438\u0436\u0438\u043c\u043e\u0441\u0442\u044c', u'short_name': u'\u0410\u0433\u0435\u043d\u0442\u0441\u0442\u0432\u043e', u'avatar': u'https://www.gravatar.com/avatar/d41d8cd98f00b204e9800998ecf8427e.jpg?d=404s=40'}, u'currency': u'USD', u'street': u'\u0417\u043e\u0434\u0447\u0438\u0445', u'rooms': u'3', u'without_fee': u'False', u'images': [{u'url': u'http://www.expert-realty.com.ua/Production/images/198223/middle/96913_01.jpg', u'type': u'in_cloud', u'id': u'96913_01_frjznu'}, {u'url': u'http://www.expert-realty.com.ua/Production/images/198223/middle/96913_02.jpg', u'type': u'in_cloud', u'id': u'96913_02_xr2gmw'}, {u'url': u'http://www.expert-realty.com.ua/Production/images/198223/middle/96913_03.jpg', u'type': u'in_cloud', u'id': u'96913_03_ircjo7'}, {u'url': u'http://www.expert-



In [681]:
[x,y] =extract_data({
 u'district': u'Шевченковский',
 u'price': u'3000000.00',
 u'rooms': u'3',
 u'square': u'80',
 u'type': 2,
})
print
print("%.2f \t diff"% np.mean((regr.predict(x) - y)))
print y, "real"
print regr.predict(x)[0]
print("%.2f"% np.mean(abs(regr.predict(x) - y)/y*100))


-763546.86 	 diff
3000000.0 real
2236453.14394
25.45




In [684]:
good = []
bad = []
for i,z in enumerate(zip(metrovka_X_2, metrovka_y_2)):
    err_procent = np.mean(abs(regr.predict(z[0]) - z[1])/z[1]*100)
    if err_procent > 30:
        bad += [[all_data_2[i], regr.predict(z[0])[0], err_procent]]
    else:
        good += [[all_data_2[i], regr.predict(z[0])[0], err_procent]]



In [685]:
print len(metrovka_data_2)
print len(bad), len(good)
print len(bad)/float(len(metrovka_data_2)), len(good)/float(len(metrovka_data_2))
print np.mean(map(lambda x: x[1], bad)), np.mean(map(lambda x: x[1], good))
print np.median(map(lambda x: x[1], bad)), np.median(map(lambda x: x[1], good))
#print bad[0]
#for b in bad:
#    b[-2]=b[-2][0]
#with open('bad.json', 'w') as outfile:
#    json.dump(bad, outfile)

6721
3435 3286
0.511084660021 0.488915339979
1692838.30432 2073819.81657
1637533.61493 1889189.8192


In [689]:
print "Features sorted by their score:"
f = sorted(zip(map(lambda x: round(x, 4), regr.coef_), FEATURES), reverse=True)
#f = zip(map(lambda x: round(x, 4), regr.coef_), FEATURES)
print len(FEATURES)
for i in f:
    print "%4f \t %s" %(i[0], i[1])

Features sorted by their score:
90
32242.191400 	 square
27528.536000 	 kitchen_square
0.000000 	 Шулявская
0.000000 	 Черниговская
0.000000 	 Харьковская
0.000000 	 Университет
0.000000 	 Теремки
0.000000 	 Театральная
0.000000 	 Тараса Шевченко
0.000000 	 Сырец
0.000000 	 Стяжка
0.000000 	 Славутич
0.000000 	 Святошино
0.000000 	 С-р
0.000000 	 С
0.000000 	 Ремонт
0.000000 	 Разд.
0.000000 	 Р
0.000000 	 Почтовая площадь
0.000000 	 Политехнический институт
0.000000 	 Позняки
0.000000 	 Площадь Льва Толстого
0.000000 	 Плитка
0.000000 	 Печерская
0.000000 	 Петровка
0.000000 	 Пеноблок/кирпич
0.000000 	 Пеноблок
0.000000 	 Паркетная доска
0.000000 	 Паркет
0.000000 	 Панель
0.000000 	 Осокорки
0.000000 	 Олимпийская (Республиканский стадион)
0.000000 	 Оболонь
0.000000 	 Нивки
0.000000 	 Монолит
0.000000 	 Минская
0.000000 	 Майдан Незалежности
0.000000 	 Лыбидская
0.000000 	 Лукьяновская
0.000000 	 Линолеум
0.000000 	 Лесная
0.000000 	 Левобережная
0.000000 	 Ламинат
0.000000 	 Креща

In [497]:
#!-----extract frequency------
counts = [0] * len(FIELDS)
for d in data:
    for k in d.keys():
        try:
            counts[FIELDS.index(k)]+=1
        except:
            pass

together = []
for i in range(0, len(FIELDS)):
    together += [[counts[i], FIELDS[i]]]

together.sort(reverse=True)

for i in range(0, len(FIELDS)):
    together[i] += [round(together[i][0]/float(len(data))*100,2)]

for i in together:
    print i
#!---------------------------

[3717, 'square', 100.0]
[3717, 'rooms', 100.0]
[2957, 'floor', 79.55]
[2739, 'live_square', 73.69]
[2739, 'kitchen_square', 73.69]
[1365, 'wc_type', 36.72]
[1293, 'rooms_arrangement', 34.79]
[945, 'material', 25.42]
[419, 'wc_count', 11.27]
[262, 'parking', 7.05]
[203, 'state', 5.46]
[160, 'balcon', 4.3]
[74, 'telephone', 1.99]
[0, 'tvset', 0.0]
[0, 'refrigerator', 0.0]


In [167]:
regr.intercept_

599966.45112527045

In [None]:
# Plot outputs
test_X_plot = [i for i in range(0, len(test_X_1))]
plt.scatter(test_X_plot, test_y_1,  color='black')
plt.plot(test_X_plot, regr.predict(test_X_1), color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()

In [102]:
test_X_plot = [i for i in range(0, len(test_X_2))]
plt.scatter(test_X_plot, test_y_2,  color='black')
plt.plot(test_X_plot, regr.predict(test_X_2), color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()

In [None]:
plt.scatter([np.linalg.norm(i) for i in train_X_2], train_y_2,  color='black')
plt.plot([np.linalg.norm(i) for i in train_X_2], regr.predict(train_X_2), color='blue', linewidth=1)
plt.xticks(())
plt.yticks(())
plt.show()

In [None]:
plt.scatter([np.linalg.norm(i) for i in test_X_2], test_y_2,  color='black')
plt.plot([np.linalg.norm(i) for i in test_X_2], regr.predict(test_X_2), color='blue', linewidth=1)
#plt.scatter([np.linalg.norm(i) for i in test_X_2], test_y_2,  color='blue')
plt.xticks(())
plt.yticks(())
plt.show()

In [635]:
plt.scatter([np.linalg.norm(i) for i in X_2], y_2, color='black')
#plt.plot([np.linalg.norm(i) for i in X_2], regr.predict(X_2), color='blue', linewidth=1)
#plt.scatter(np.linalg.norm(X_2[ind]), y_2[ind], color='blue')
#plt.scatter([np.linalg.norm(i) for i in X_2[inx]], y_2[inx], color='blue')
plt.xticks(())
plt.yticks(())
plt.show()

In [None]:
X_2_n = preprocessing.normalize(X_2, norm='l2')
y_2_n = preprocessing.normalize(y_2, norm='l2')[0]
print X_2
print
print X_2_n
print y_2
print y_2_n
print len(X_2_n), len(y_2_n)
y_to_x = [y_2_n[i]/np.linalg.norm(X_2_n[i]) for i in range(0,len(X_2_n))]
ind = y_to_x.index(max(y_to_x))
print X_2[7916]
print y_2[7916]


In [None]:
plt.scatter([i[0] for i in X_2], [i[2] for i in X_2],  color='black')
plt.show()

In [None]:
print np.argmax([np.linalg.norm(i) for i in X_2])
print data_2[np.argmax([np.linalg.norm(i) for i in X_2])]
print X_2[np.argmax([np.linalg.norm(i) for i in X_2])]
print y_2[np.argmax([np.linalg.norm(i) for i in X_2])]

In [None]:
data_2.pop(np.argmax([np.linalg.norm(i) for i in X_2]))

In [None]:
print np.argmax(y_2)
print data_2[np.argmax(y_2)]
print X_2[np.argmax(y_2)]
print y_2[np.argmax(y_2)]

In [None]:
data_2.pop(np.argmax(y_2))

In [None]:
y_to_norm_x = [y_2[i]/np.linalg.norm(X_2[i]) for i in range(0,len(X_2))]

In [None]:
ind = np.argmax(y_to_norm_x)
print ind
print X_2[ind]
print y_2[ind]
print np.linalg.norm(X_2[ind])

In [None]:
data_2.pop(ind)

In [None]:
[y_mean, x_mean]=  [np.mean(y_2), np.mean([np.linalg.norm(x) for x in X_2])]
print [y_mean, x_mean]
mean_x_y = [ -(np.linalg.norm(X_2[i]) - x_mean)**1 - (y_2[i] - y_mean)**(2) for i in range(0, len(X_2))]
print "max %f" % min(mean_x_y)
ind = np.argmin(mean_x_y)
print ind
print X_2[ind]
print y_2[ind]
print np.linalg.norm(X_2[ind])

In [None]:
m = [np.linalg.norm(x) for x in X_2]
inx = [m.index(i) for i in m if i> 142 and i <142.4]
print inx
[y_2[i] for i in inx]
print len(y_2[inx]), len(X_2[inx])
X_2.size

In [None]:
process_with(X_2,y_2)

In [545]:
min_max_scaler = preprocessing.MinMaxScaler()
X_2_n = min_max_scaler.fit_transform(X_2)
y_2_n = min_max_scaler.fit_transform(y_2)
pts = np.float32(zip(map(lambda x: np.linalg.norm(x), X_2_n),y_2_n))
pts
print len(X_2), len(y_2), len(data_2)

10414 10414 10414




In [546]:
import numpy as np
from matplotlib import pyplot as plt

print len(pts) == len(data_2)
mean_pt = np.mean(pts,axis=0)
mean_x,mean_y = mean_pt
diffs = np.apply_along_axis(np.linalg.norm,1,pts - mean_pt)
mean_diff = np.mean(diffs)

filtered = np.float32([pts[i] for i,diff in enumerate(diffs) if diff <= mean_diff])
data_2 = [data_2[i] for i,diff in enumerate(diffs) if diff <= mean_diff]
X_2 = np.float32([X_2[i] for i,diff in enumerate(diffs) if diff <= 2*mean_diff])
y_2 = np.float32([y_2[i] for i,diff in enumerate(diffs) if diff <= 2*mean_diff])
filtered_mean = np.mean(filtered,axis=0)

plt.plot(pts[:,0],pts[:,1],'ro')
plt.plot(mean_x,mean_y,'w^')
plt.plot(filtered[:,0],filtered[:,1],'bo')
plt.plot(filtered_mean[0],filtered_mean[1],'w^')
plt.show()

True


In [590]:
print process_with(X_2, y_2)

[543030.98248630564, 0.62189713743117681]


In [682]:
plt.scatter([np.linalg.norm(i) for i in X_2], y_2, color='black')
plt.show()

In [119]:
with open('filtered_data_1.json', 'w') as outfile:
    json.dump(data_2, outfile)
#json.dumps(data_2[0:1])