In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.feature_selection import VarianceThreshold

# District names in Russian spelling. The position of a name in this list is
# the one-hot index that district_vector sets.
DISTRICT_NAMES_RU = ["Печерский", "Соломенский", "Деснянский", "Шевченковский", "Голосеевский", "Дарницкий", "Подольский", "Святошинский", "Оболонский", "Днепровский"] 

# Ukrainian spellings of the same districts, listed in the same order, so an
# index found here addresses the same one-hot slot as the Russian list.
DISTRICT_NAMES_UA = ["Печерський", "Солом'янський", "Деснянський", "Шевченківський", "Голосіївський", "Дарницький", "Подільський", "Святошинський", "Оболонський", "Дніпровський"] 

# Metro station names (unicode); metro_vector one-hot encodes against this list.
METRO_RU = [u'Академгородок',u'Арсенальная',u'Берестейская',u'Бориспольская',u'Васильковская',u'Вокзальная',u'Выдубичи',u'Вырлица',u'Выставочный центр (ВДНХ)',u'Героев Днепра',u'Гидропарк',u'Голосеевская',u'Дарница',u'Дворец Украина',u'Дворец спорта',u'Демеевская',u'Днепр',u'Дорогожичи',u'Дружбы народов',u'Житомирская',u'Золотые Ворота',u'Ипподром',u'Кловская',u'Контрактовая площадь',u'Красный хутор',u'Крещатик',u'Левобережная',u'Лесная',u'Лукьяновская',u'Лыбидская',u'Майдан Незалежности',u'Минская',u'Нивки',u'Оболонь',u'Олимпийская (Республиканский стадион)',u'Осокорки',u'Петровка',u'Печерская',u'Площадь Льва Толстого',u'Позняки',u'Политехнический институт',u'Почтовая площадь',u'Святошино',u'Славутич',u'Сырец',u'Тараса Шевченко',u'Театральная',u'Теремки',u'Университет',u'Харьковская',u'Черниговская',u'Шулявская']

# Values matched against flat['material'] in extract_data (wall materials).
WALLS_RU = [u'Газоблок',u'Дерево/кирпич',u'Керамзито-бетон',u'Кирпич',u'Комбинир.',u'Монолит',u'Павильон (стекло)',u'Панель',u'Пенобетон',u'Пеноблок',u'Пеноблок/кирпич']

# Values matched against flat['rooms_arrangement']. The escapes decode to the
# Cyrillic strings "Р", "С" and "С-р" (presumably abbreviations for
# separate / adjoining / mixed room layouts — TODO confirm).
ROOMS_ARRANGEMENT = [u'\u0420', u'\u0421', u'\u0421-\u0440']

# Values matched against flat['wc_type']; decode to "Разд." and "Совместный".
WC_TYPES = [ u'\u0420\u0430\u0437\u0434.', u'\u0421\u043e\u0432\u043c\u0435\u0441\u0442\u043d\u044b\u0439']

# Values matched against flat['state']; decode to "Евроремонт", "Косм. ремонт"
# and "Ремонт" (renovation levels).
STATES = [u'\u0415\u0432\u0440\u043e\u0440\u0435\u043c\u043e\u043d\u0442', u'\u041a\u043e\u0441\u043c. \u0440\u0435\u043c\u043e\u043d\u0442', u'\u0420\u0435\u043c\u043e\u043d\u0442']

# Values matched against flat['floor_material'] (floor coverings), e.g. the
# first entry decodes to "Дерево" and the last to "Стяжка".
FLOOR_RU = [u'\u0414\u0435\u0440\u0435\u0432\u043e',  u'\u0414\u043e\u0441\u043a\u0430', u'\u041a\u043e\u0432\u0440\u043e\u043b\u0438\u043d',
 u'\u041a\u043e\u043c\u0431\u0438\u043d\u0438\u0440.',  u'\u041b\u0430\u043c\u0438\u043d\u0430\u0442',  u'\u041b\u0438\u043d\u043e\u043b\u0435\u0443\u043c',
 u'\u041f\u0430\u0440\u043a\u0435\u0442',  u'\u041f\u0430\u0440\u043a\u0435\u0442\u043d\u0430\u044f \u0434\u043e\u0441\u043a\u0430',
 u'\u041f\u043b\u0438\u0442\u043a\u0430', u'\u0421\u0442\u044f\u0436\u043a\u0430']

# Values matched against flat['x_material']; decode to "Дерево", "Железобетон"
# and "Комбинированные" (which building element this describes is not visible
# here — TODO confirm against the scraper).
X_RU = [u'\u0414\u0435\u0440\u0435\u0432\u043e',
 u'\u0416\u0435\u043b\u0435\u0437\u043e\u0431\u0435\u0442\u043e\u043d',
 u'\u041a\u043e\u043c\u0431\u0438\u043d\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0435']

# Numeric fields that extract_data reads with float(). They become the first
# len(DETAILS) columns of the feature vector, in this order. "square" has to
# stay first: extract_data compares the other areas against X[0].
DETAILS = ["square", 
           "kitchen_square",
           "live_square", 
           "rooms", 
#           "floor_count", 
           "floor", 
           'wc_count'
          ] #, 'levels']

# Categorical field names. In this notebook the list only feeds FIELDS (the
# field-frequency report); extract_data names its categorical fields directly
# and additionally encodes 'floor_material' and 'x_material'.
NOMINAL_DETAILS = ['material', 'rooms_arrangement','state', 'wc_type'] #, 'floor_material', 'x_material']

# Fields converted to 0/1 by yes_or_no; they are the last columns extract_data emits.
YES_OR_NO_DETAILS = ['parking', 'telephone', 'refrigerator', 'tvset', 'balcon']

# All record keys whose presence is counted in the field-frequency cell.
FIELDS = DETAILS + NOMINAL_DETAILS + YES_OR_NO_DETAILS

# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably implausible prices that should be excluded — confirm or remove.
stop_price = [u'222502230000', u'2777780000', u'2777750000', u'1036680000', u'300000000', u'225000000', u'700000000', u'16750000']

# Label for every column of the feature vector built by extract_data, in the
# exact order extract_data emits them: numeric details, district one-hot,
# metro one-hot, number of metro stations, then the categorical one-hot blocks
# and finally the yes/no flags. Keeping this aligned is what makes
# zip(regr.coef_, FEATURES) attach each coefficient to the right name.
FEATURES = (DETAILS + DISTRICT_NAMES_RU + METRO_RU + ['metro_count'] + WALLS_RU
            + ROOMS_ARRANGEMENT + STATES + WC_TYPES + FLOOR_RU + X_RU
            + YES_OR_NO_DETAILS)

# Factors by which extract_data multiplies prices whose 'currency' is usd / eur
# (presumably UAH per unit at scraping time — TODO confirm and record the date).
USD = 24.5
EUR =  26.9

In [None]:
def get_list_of(data, field):
    """Print and return the sorted distinct values of ``field`` found in ``data``.

    data  -- iterable of records (normally dicts loaded from JSON)
    field -- key to look up in every record

    Records that lack ``field`` or cannot be indexed with it are skipped.
    The values must be hashable and mutually comparable, as before.
    """
    values = set()
    for record in data:
        try:
            value = record[field]
        except (LookupError, TypeError):
            # field missing, or the record is not subscriptable with this key
            continue
        values.add(value)
    ordered = sorted(values)
    for value in ordered:
        print(value)
    return ordered

In [None]:
def district_vector(district_name):
    """One-hot encode a district name over DISTRICT_NAMES_RU.

    Only the first whitespace-separated word of ``district_name`` is used. It is
    looked up in the Russian list first and then in the Ukrainian list, which is
    kept in the same order. Anything that is not a non-empty string, or that
    matches neither list, yields an all-zero vector.
    """
    vec = [0] * len(DISTRICT_NAMES_RU)
    try:
        token = district_name.split()[0]
        # The name lists hold UTF-8 byte strings, so text is encoded before the
        # lookup; input that is already a byte string is used as it is.
        name = token if isinstance(token, bytes) else token.encode('utf-8')
    except (AttributeError, IndexError, TypeError, UnicodeError):
        # not a string (e.g. None or a list), an empty string, or not encodable
        return vec
    for names in (DISTRICT_NAMES_RU, DISTRICT_NAMES_UA):
        if name in names:
            vec[names.index(name)] = 1
            break
    return vec

In [None]:
def metro_vector(metro_list):
    """Multi-hot encode an iterable of metro station names over METRO_RU.

    Every station found in METRO_RU sets its slot to 1; names that are not in
    the list are ignored.
    """
    vec = [0] * len(METRO_RU)
    for metro_name in metro_list:
        try:
            vec[METRO_RU.index(metro_name)] = 1
        except ValueError:
            # station is not part of the known vocabulary
            continue
    return vec

def nominal_vector(name, VALUES):
    """One-hot encode ``name`` over the list ``VALUES``.

    Returns a list of len(VALUES) zeros with a single 1 at the position of
    ``name``; the vector stays all zeros when ``name`` is not in ``VALUES``.
    """
    vec = [0] * len(VALUES)
    try:
        vec[VALUES.index(name)] = 1
    except ValueError:
        # unknown category: leave the vector empty
        pass
    return vec

def yes_or_no(ans):
    """Map a yes/no answer to 1 or 0.

    Returns 1 when ``ans`` is exactly u'да' or u'Есть', given either as text or
    as a UTF-8 encoded byte string, and 0 for everything else (including
    values that are not strings).
    """
    if isinstance(ans, bytes):
        # compare in the text domain so encoded and decoded input behave alike
        ans = ans.decode('utf-8', 'replace')
    if ans in (u'да', u'Есть'):
        return 1
    return 0

In [None]:
def _nominal_or_zeros(flat, field, values):
    """One-hot encode flat[field] over ``values``; all zeros when the field is absent."""
    try:
        return nominal_vector(flat[field], values)
    except KeyError:
        return [0] * len(values)


def _metro_features(flat):
    """Return the METRO_RU multi-hot vector followed by the number of listed stations."""
    metro = flat.get('metro')
    if isinstance(metro, dict):
        stations = list(metro.keys())
    elif isinstance(metro, (list, tuple)):
        stations = list(metro)
    else:
        # Missing, None or an unsupported type: emit an empty block so that the
        # feature vector always keeps the same length.
        stations = []
    return metro_vector(stations) + [len(stations)]


def extract_data(flat):
    """Turn one flat record (a dict) into ``[X, y]``.

    X is a flat list of features in this order:
      * the DETAILS fields as floats (0 when missing, unparsable or implausible),
      * district one-hot (DISTRICT_NAMES_RU),
      * metro multi-hot (METRO_RU) followed by the number of listed stations,
      * one-hot blocks for material, rooms_arrangement, state, wc_type,
        floor_material and x_material,
      * the YES_OR_NO_DETAILS flags as 0/1.
    y is the price as a float, multiplied by USD or EUR when the record's
    currency is 'usd' or 'eur'. When the price is missing or unparsable, y is 0
    and the record's 'code' is printed.
    """
    X = []
    for d in DETAILS:
        try:
            value = float(flat[d])
        except (KeyError, TypeError, ValueError):
            value = 0
        else:
            if d in ('live_square', 'kitchen_square'):
                # a part of the flat cannot be larger than the flat (X[0] is "square")
                if X and value > X[0]:
                    value = 0
            elif d == 'rooms':
                if value > 10:
                    value = 0
            elif d == 'square':
                if value > 900:
                    value = 0
        X.append(value)

    try:
        X += district_vector(flat['district'])
    except KeyError:
        X += [0] * len(DISTRICT_NAMES_RU)

    X += _metro_features(flat)

    X += _nominal_or_zeros(flat, 'material', WALLS_RU)
    X += _nominal_or_zeros(flat, 'rooms_arrangement', ROOMS_ARRANGEMENT)
    X += _nominal_or_zeros(flat, 'state', STATES)
    X += _nominal_or_zeros(flat, 'wc_type', WC_TYPES)
    X += _nominal_or_zeros(flat, 'floor_material', FLOOR_RU)
    X += _nominal_or_zeros(flat, 'x_material', X_RU)

    for d in YES_OR_NO_DETAILS:
        try:
            X.append(yes_or_no(flat[d]))
        except (KeyError, AttributeError, TypeError, UnicodeError):
            # field missing or not a usable string
            X.append(0)

    try:
        y = float(flat['price'])
    except (KeyError, TypeError, ValueError):
        y = 0
        # report which record has no usable price (None when it has no code either)
        print(flat.get('code'))

    currency = flat.get('currency')
    if hasattr(currency, 'lower'):
        currency = currency.lower()
        if currency == 'usd':
            y *= USD
        elif currency == 'eur':
            y *= EUR
    return [X, y]

In [None]:
def process_with(X, y, info=False, short=False, return_short=False, new_coef=[]):
    """Fit a LinearRegression on a fixed 82/18 train/test split of (X, y).

    info         -- print a detailed report for the train and test parts
    short        -- print "<test mean absolute error> <test score>"
    return_short -- return (test mean absolute error, test score) instead of
                    the fitted model
    new_coef     -- optional coefficients assigned to the model before fitting

    Returns the fitted estimator unless ``return_short`` is set.
    """
    train_X, test_X, train_y, test_y = cross_validation.train_test_split(
        X, y, test_size=0.18, random_state=3)
    regr = linear_model.LinearRegression(normalize=False)
    # len() instead of "!= []" so that numpy arrays can be passed as well
    if new_coef is not None and len(new_coef) > 0:
        print("new coefs")
        # NOTE(review): fit() below recomputes coef_, so these values do not
        # influence the resulting model — confirm what was intended here.
        regr.coef_ = new_coef
    regr.fit(train_X, train_y)

    if info or short or return_short:
        # evaluate once and reuse for every report below
        test_pred = regr.predict(test_X)
        test_abs = np.mean(abs(test_pred - test_y))
        test_score = regr.score(test_X, test_y)
    if info:
        train_pred = regr.predict(train_X)
        print("Total: %d, train: %d, test: %d" % (len(X), len(train_X), len(test_X)))
        # despite the label this is the mean of the squared residuals
        print("Residual sum of squares: %.2f" % np.mean((test_pred - test_y) ** 2))
        print("Train absolute: %.2f" % np.mean(abs(train_pred - train_y)))
        print("Test absolute: %.2f" % test_abs)
        print("Absolute to mean: %.2f%%" % (test_abs / np.mean(test_y) * 100))
        print('Train variance score: %.2f' % regr.score(train_X, train_y))
        print('Test variance score: %.2f' % test_score)
    if short:
        print("%s %s" % (test_abs, test_score))
    if return_short:
        return test_abs, test_score
    return regr

In [None]:
def remove_with_no(field, t_data):
    """Remove, in place, every record of ``t_data`` that has no ``field``.

    field  -- key that must be present (str)
    t_data -- list of dicts; it is modified in place and also returned

    Prints the number of removed records.
    """
    def has_field(record):
        try:
            record[field]
        except (LookupError, TypeError):
            return False
        return True

    # Filter in a single pass instead of calling list.remove() while iterating,
    # which skips elements and needed repeated passes to converge.
    kept = [record for record in t_data if has_field(record)]
    removed = len(t_data) - len(kept)
    t_data[:] = kept
    print(removed)
    return t_data

In [None]:
def get_X_y_from(t_data):
    """Run extract_data over every flat and return (X, y) as numpy arrays.

    X stacks the feature lists, y holds the matching prices.
    """
    rows = []
    prices = []
    for flat in t_data:
        extracted = extract_data(flat)
        rows.append(extracted[0])
        prices.append(extracted[1])
    return np.array(rows), np.array(prices)

## Load data & clean data

In [None]:
# Load the scraped realty records; the commented-out lines are alternative
# input files used in earlier runs.
with open('./spider_1000realty/realty_flats.json') as data_file: # latest scrape
#with open('./spider_1000realty/results_new.json') as data_file: #first scrapy
#with open('./filtered_data_1.json') as data_file:
#with open('./filtered_realty_data_no_required.json') as data_file: #last nice
#with open('./filtered_realty_data_no_required.json') as data_file: #last nice
    json_data = json.load(data_file)
print len(json_data)

# Keep only records whose city is Kyiv (Russian or Ukrainian spelling).
# Raises KeyError if a record has no 'city' key.
realty_data = [i for i in json_data if i['city'].strip().encode('utf-8') in ["Киев","Київ"]]
print len(realty_data)

In [None]:
# Load the second data set; this rebinds json_data from the previous cell.
with open('metrovka_flats.json') as data_file:
#with open('metrovka_flats_metro.json') as data_file:
#with open('metrovka_flats_district.json') as data_file:    
    json_data = json.load(data_file)
print len(json_data)

# Overwrite the district of every record with an empty list, so
# district_vector yields all zeros for this data set.
# NOTE(review): this discards any district present in the file — confirm it is intended.
for d in json_data:
    d['district']=[]

# The city filter is currently disabled: every record is kept.
metrovka_data = [i for i in json_data ] #if i['city'].encode('utf-8') in ["Киев","Київ"]]
print len(metrovka_data)

In [None]:
# Drop realty records that lack a price, a square or a district and print the
# record count before and after. remove_with_no edits the list in place.
print len(realty_data)
realty_data = remove_with_no('price', realty_data)
realty_data = remove_with_no('square', realty_data)
realty_data = remove_with_no('district', realty_data)
#realty_data = remove_with_no('metro', realty_data)
print len(realty_data)

In [None]:
# Same cleaning for the metrovka records, but requiring 'metro' instead of
# 'district'.
print len(metrovka_data)
metrovka_data = remove_with_no('price', metrovka_data)
metrovka_data = remove_with_no('square', metrovka_data)
#metrovka_data = remove_with_no('district', metrovka_data)
metrovka_data = remove_with_no('metro', metrovka_data)
print len(metrovka_data)

In [None]:
# NOTE(review): this throws away every realty record loaded and cleaned above;
# after the merge below realty_data holds only the metrovka records. Confirm
# this is a deliberate switch and not a leftover.
realty_data = []

In [None]:
# Merge: append the metrovka records to realty_data.
# NOTE(review): "+=" is not idempotent — running this cell twice adds the
# records twice.
realty_data += metrovka_data
print len(realty_data)

In [None]:
# Split the records by their 'type' field (what 1 and 2 stand for is not
# visible in this notebook — TODO document). Raises KeyError when a record has
# no 'type'.
realty_data_1 = [i for i in realty_data if i['type'] == 1]
realty_data_2 = [i for i in realty_data if i['type'] == 2]

In [None]:
# Same split by 'type' for the metrovka records.
metrovka_data_1 = [i for i in metrovka_data if i['type'] == 1]
metrovka_data_2 = [i for i in metrovka_data if i['type'] == 2]

## First models

In [None]:
# Feature matrix and price vector for the type-2 records.
realty_X_2, realty_y_2 = get_X_y_from(realty_data_2)

In [None]:
# Fit the baseline model and print the full report; later cells use `regr`.
regr = process_with(realty_X_2, realty_y_2, info=True)

In [None]:
# Feature matrix and price vector for the type-2 metrovka records.
metrovka_X_2, metrovka_y_2 = get_X_y_from(metrovka_data_2)

In [None]:
# Evaluate the fitted model on all type-2 metrovka records.
# The first line is labelled "Residual sum of squares" but prints the mean of
# the squared residuals.
print("Residual sum of squares: %.2f"% np.mean((regr.predict(metrovka_X_2) - metrovka_y_2) ** 2))
print("Test absolute: %.2f"% np.mean(abs(regr.predict(metrovka_X_2) - metrovka_y_2)))
print("Absolute to mean: %.2f%%"% (np.mean(abs(regr.predict(metrovka_X_2) - metrovka_y_2))/np.mean(metrovka_y_2)*100))
print('Test variance score: %.2f' % regr.score(metrovka_X_2, metrovka_y_2))

In [None]:
# Print each coefficient next to its feature label. The sorted variant is
# commented out, so despite the heading the rows appear in FEATURES order.
# zip() stops at the shorter of regr.coef_ and FEATURES.
# NOTE(review): the labels are only right if FEATURES lists the columns in
# exactly the order extract_data emits them — verify.
print "Features sorted by their score:"
#f = sorted(zip(map(lambda x: round(x, 4), regr.coef_), FEATURES), reverse=True)
f = zip(map(lambda x: round(x, 4), regr.coef_), FEATURES)
print regr.intercept_
print len(FEATURES)
for i in f:
    print "%4f \t %s" %(i[0], i[1])

In [None]:
#!-----extract frequency------
# Count in how many records each FIELDS key is present.
counts = [0] * len(FIELDS)
for d in realty_data:
    for k in d.keys():
        try:
            counts[FIELDS.index(k)]+=1
        except:
            # key is not one of the tracked fields
            pass

# Build [count, field] pairs ...
together = []
for i in range(0, len(FIELDS)):
    together += [[counts[i], FIELDS[i]]]

# ... ordered from most to least frequent (ties fall back to the field name)
together.sort(reverse=True)

# ... and append the share of records, in percent, that carry the field.
for i in range(0, len(FIELDS)):
    together[i] += [round(together[i][0]/float(len(realty_data))*100,2)]

for i in together:
    print i
#!---------------------------

## Plots

In [None]:
# Scatter plot of price against the Euclidean norm of each feature vector,
# with axis ticks hidden.
plt.scatter([np.linalg.norm(i) for i in realty_X_2], realty_y_2, color='black')
#plt.plot([np.linalg.norm(i) for i in X_2], regr.predict(X_2), color='blue', linewidth=1)
#plt.scatter(np.linalg.norm(X_2[ind]), y_2[ind], color='blue')
#plt.scatter([np.linalg.norm(i) for i in X_2[inx]], y_2[inx], color='blue')
plt.xticks(())
plt.yticks(())
plt.show()

##  Good vs bad

In [None]:
# Split the type-2 metrovka records by prediction quality: a record is "bad"
# when the prediction is off by more than 30% of the real price. Each entry is
# [record, predicted price, error in percent].
good = []
bad = []
for i, (features, price) in enumerate(zip(metrovka_X_2, metrovka_y_2)):
    # predict() expects a 2-D array, so the single sample is reshaped to (1, n)
    r = regr.predict(features.reshape(1, -1))
    err_procent = np.mean(abs(r - price)/price*100)
    if err_procent > 30:
        bad += [[metrovka_data_2[i], r[0], err_procent]]
    else:
        good += [[metrovka_data_2[i], r[0], err_procent]]

In [None]:
# Unlabelled summary of the good/bad split: total, counts, shares (as
# fractions of 1), then mean and median error percentage of each group.
print len(metrovka_data_2)
print len(bad), len(good)
print len(bad)/float(len(metrovka_data_2)), len(good)/float(len(metrovka_data_2))
print np.mean(map(lambda x: x[2], bad)), np.mean(map(lambda x: x[2], good))
print np.median(map(lambda x: x[2], bad)), np.median(map(lambda x: x[2], good))
#print bad[0]
#for b in bad:
#    b[-2]=b[-2][0]
#with open('bad.json', 'w') as outfile:
#    json.dump(bad, outfile)

In [None]:
# Labelled summary of the good/bad split: total, counts, shares in percent,
# then mean and median error percentage of each group.
print("Всего элементов: %d" % len(metrovka_data_2))
print("Плохих: %d \nХороших: %d" % (len(bad), len(good)))
# the lines are labelled as percentages, so the shares are scaled to 0..100
print("Процент плохих: %.2f \nПроцент хороших: %.2f" % (100.0*len(bad)/len(metrovka_data_2), 100.0*len(good)/len(metrovka_data_2)))
print("Среднее плохих: %.3f \nСреднее хороших: %.3f" % (np.mean(map(lambda x: x[2], bad)), np.mean(map(lambda x: x[2], good))))
print("Медиана плохих: %.3f \nМедиана хороших: %.3f" % (np.median(map(lambda x: x[2], bad)), np.median(map(lambda x: x[2], good))))
#print bad[0]
#for b in bad:
#    b[-2]=b[-2][0]
#with open('bad.json', 'w') as outfile:
#    json.dump(bad, outfile)

## Manual checking

In [None]:
# Inspect one record by position: print its room count, the signed prediction
# error, the real price, the predicted price and the error in percent.
# The index is hard-coded and must exist in metrovka_data_2.
ind = 1129
#for i in metrovka_data_2[ind].values():
#    print i
#print metrovka_data_2[ind]
[x,y] = extract_data(metrovka_data_2[ind])
print metrovka_data_2[ind]['rooms']
print
# NOTE(review): x is a flat list (one sample); newer scikit-learn versions
# require a 2-D input for predict — confirm against the installed version.
print("%.2f \t diff"% np.mean((regr.predict(x) - y)))
print y, "real"
print regr.predict(x)[0]
print("%.2f"% np.mean(abs(regr.predict(x) - y)/y*100))

In [None]:
# Predict the price of a hand-written record (only district and square are
# given) and compare it with the price stated in the record.
[x,y] =extract_data({
 u'district': u'Оболонский',
 u'price': u'3000000.00',
 #u'rooms': u'2',
 u'square': u'80',
 u'type': 2,
})
print
print("%.2f \t diff"% np.mean((regr.predict(x) - y)))
print y, "real"
print regr.predict(x)[0]
print("%.2f"% np.mean(abs(regr.predict(x) - y)/y*100))

In [None]:
# Predicted price of an otherwise identical 45-square flat in every district.
# The price in the record is only a placeholder; y is not used here.
for d in DISTRICT_NAMES_RU:
    [x,y] =extract_data({
     u'district': d.decode('utf-8'),
     u'price': u'3000000.00',
     #u'rooms': u'2',
     u'square': u'45',
     u'type': 2,
    })
    print "%s: \t %f" %(d,regr.predict(x)[0])

In [None]:
# Predicted price of an otherwise identical 45-square flat next to every
# metro station. The price in the record is only a placeholder.
for d in METRO_RU:
    [x,y] =extract_data({
     u'metro': [d],
     u'price': u'3000000.00',
     #u'rooms': u'2',
     u'square': u'45',
     u'type': 2,
    })
    print "%50s: \t %f" %(d,regr.predict(x)[0])

In [None]:
# Frequency of each 'square' value in the metrovka records.
from collections import Counter
data = Counter([i['square'] for i in metrovka_data])
# Only the last expression of a cell is displayed, so the full listing on the
# next line is computed but not shown.
data.most_common()   # Returns all unique items and their counts
data.most_common(2)

## Making some fields required

In [None]:
def make_field_required(t_data):
    """Remove, in place, every flat whose district could not be encoded.

    A flat is dropped when the district one-hot block of its feature vector is
    all zeros, i.e. the district is missing or unknown. The block is located
    from the vector layout: it follows the len(DETAILS) numeric columns and is
    len(DISTRICT_NAMES_RU) wide. Returns the same (modified) list.
    """
    X = get_X_y_from(t_data)[0]
    print("Elements: %d" % len(X))
    start = len(DETAILS)
    stop = start + len(DISTRICT_NAMES_RU)
    ind = [i for i, x in enumerate(X) if np.count_nonzero(x[start:stop]) == 0]
    print("Deleted: %d" % len(ind))
    # pop from the back so that the remaining indices stay valid
    for i in sorted(ind, reverse=True):
        t_data.pop(i)
    return t_data

In [None]:
# Drop type-2 realty records without an encodable district (edits the list in place).
realty_data_2=make_field_required(realty_data_2)

In [None]:
# Same for the type-2 metrovka records (edits the list in place).
metrovka_data_2=make_field_required(metrovka_data_2)

## Delete data with big diffs

In [None]:
def mean_diff(t_data, normalize = True, plots = True, info=True):
    """Drop flats that lie far from the centre of the (|X|, y) point cloud.

    Every flat is mapped to the point (norm of its feature vector, price). Flats
    whose distance to the mean point exceeds twice the mean distance are
    removed. The filtered list is returned only when it gives a better test
    score or a lower test mean absolute error than the unfiltered data;
    otherwise ``t_data`` itself is returned.

    normalize -- min-max scale X and y before measuring distances
    plots, info -- accepted for compatibility; not used
    """
    X, y = get_X_y_from(t_data)
    print("Before:")
    before = process_with(X, y, return_short=True)
    print(before)

    if normalize:
        min_max_scaler = preprocessing.MinMaxScaler()
        X = min_max_scaler.fit_transform(X)
        # the scaler works on 2-D data, so y is scaled as one column and flattened again
        y = min_max_scaler.fit_transform(np.asarray(y, dtype=float).reshape(-1, 1)).ravel()

    norms = [np.linalg.norm(row) for row in X]
    pts = np.column_stack((norms, y)).astype(np.float32)

    mean_pt = np.mean(pts, axis=0)
    diffs = np.apply_along_axis(np.linalg.norm, 1, pts - mean_pt)
    threshold = 2 * np.mean(diffs)

    t_data_f = [flat for flat, diff in zip(t_data, diffs) if diff <= threshold]
    X_f, y_f = get_X_y_from(t_data_f)
    print("After:")
    after = process_with(X_f, y_f, return_short=True)
    print(after)

    # before/after are (test mean absolute error, test score)
    if after[1] > before[1] or after[0] < before[0]:
        return t_data_f
    return t_data

In [None]:
# Repeat the distance-based filtering until a pass no longer changes the
# number of records.
l = 1
while l!=len(realty_data_2):
    l = len(realty_data_2)
    realty_data_2 = mean_diff(realty_data_2)

## Delete outliers

In [None]:
def get_ind_by_y(X, y, ind_max = True):
    """Return the index of the largest (or, with ind_max=False, smallest) y.

    X is accepted only to share a signature with the other selectors.
    """
    chooser = np.argmax if ind_max else np.argmin
    return chooser(y)

In [None]:
def get_ind_by_x_norm(X, y, ind_max=True):
    """Return the index of the row of X with the largest (or smallest) norm.

    y is accepted only to share a signature with the other selectors. On ties
    the first matching row wins.
    """
    lengths = [np.linalg.norm(row) for row in X]
    extreme = max(lengths) if ind_max else min(lengths)
    return lengths.index(extreme)

In [None]:
def get_ind_by_y_to_x_norm(X, y, ind_max=True):
    """Return the index with the largest (or smallest) ratio y[i] / norm(X[i])."""
    ratios = [y[pos] / np.linalg.norm(row) for pos, row in enumerate(X)]
    if not ind_max:
        return np.argmin(ratios)
    return np.argmax(ratios)

In [None]:
def cut_data_by(t_data, get_ind_cut_function, ind_max_flag=True):
    """Greedily remove extreme flats from ``t_data`` while the model improves.

    t_data               -- list of flat dicts; modified in place and returned
    get_ind_cut_function -- selector f(X, y, ind_max=...) returning the index
                            of the flat to remove next
    ind_max_flag         -- passed through to the selector as ``ind_max``

    After each removal the model is refitted. The loop continues while the test
    score rose or the test mean absolute error fell. The last removal, which
    did not improve anything, is undone by putting the flat back at its
    original position so the data matches the state that was last evaluated
    as an improvement.
    """
    previous_sum = -1000000
    previous_score = -100000
    X, y = get_X_y_from(t_data)
    current_sum, current_score = process_with(X, y, return_short=True)
    last_removed = None
    while current_score > previous_score or current_sum < previous_sum:
        previous_score = current_score
        previous_sum = current_sum

        inx = get_ind_cut_function(X, y, ind_max=ind_max_flag)
        last_removed = (inx, t_data.pop(inx))

        X, y = get_X_y_from(t_data)
        current_sum, current_score = process_with(X, y, return_short=True)

        print("%s %s" % (current_sum, previous_sum))
        print("%s %s" % (current_score, previous_score))
        print("-----")
    # Nothing to restore when the loop never ran (e.g. the score was NaN).
    if last_removed is not None:
        t_data.insert(last_removed[0], last_removed[1])
    return t_data

In [None]:
# Single pruning run: repeatedly remove the flat with the largest feature norm.
realty_data_2 = cut_data_by(realty_data_2, get_ind_by_x_norm, True)

In [None]:
# Apply all six pruning strategies (max and min of: feature norm, price,
# price-to-norm ratio) in turn, and repeat the whole round until a round no
# longer changes the number of records.
l = 1
while l!=len(realty_data_2):
    l = len(realty_data_2)
    print "by x norm max"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_x_norm, True)
    print "by y max"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y, True)
    print "by y to x norm max"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y_to_x_norm, True)
    print "by x norm min"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_x_norm, False)
    print "by y min"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y, False)
    print "by y to x norm min"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y_to_x_norm, False)

## One by one

In [None]:
# Same round of six pruning strategies as in the previous section, followed by
# one mean_diff filtering step per round; repeated until a round no longer
# changes the number of records.
l = 1
while l!=len(realty_data_2):
    l = len(realty_data_2)
    print "by x norm max"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_x_norm, True)
    print "by y max"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y, True)
    print "by y to x norm max"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y_to_x_norm, True)
    print "by x norm min"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_x_norm, False)
    print "by y min"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y, False)
    print "by y to x norm min"
    realty_data_2 = cut_data_by(realty_data_2, get_ind_by_y_to_x_norm, False)
    print "mean_diff"
    realty_data_2 = mean_diff(realty_data_2, normalize = True)

## Saving and loading coefs

In [None]:
# Coefficients of the current `regr`, rounded to 4 decimals, plus their
# (coefficient, label) pairs.
# NOTE(review): `regr` is the model fitted in "First models"; the pruning
# cells above do not refit it — confirm these are the coefficients to save.
coefs = map(lambda x: round(x, 4), regr.coef_)
f = zip(coefs, FEATURES)
print coefs

In [None]:
import pickle
# NOTE(review): the file is named *.json but its content is a pickle, written
# in text mode; pickle files are normally opened in binary mode ('wb').
with open('filtered_mixed_coefs_district_square_required.json', 'w') as outfile:
    pickle.dump(coefs, outfile)

In [None]:
import pickle
# Load the coefficients saved above into `c`.
# NOTE(review): pickle.load can execute arbitrary code — only load files this
# notebook wrote itself.
with open('filtered_mixed_coefs_district_square_required.json', 'r') as infile:
    c = pickle.load(infile)

## Save data

In [None]:
# Write the cleaned metrovka records (price, square and metro present) to disk,
# overwriting any existing file of that name.
with open('metrovka_flats_metro.json', 'w') as outfile:
    json.dump(metrovka_data, outfile)
#json.dumps(data_2[0:1])