In [135]:
from util import *
from sklearn.preprocessing import LabelEncoder,LabelBinarizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingCVClassifier,StackingClassifier
from sklearn.multiclass import  OneVsOneClassifier,OneVsRestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.preprocessing import StandardScaler,Normalizer
from sklearn.ensemble import VotingClassifier
pd.set_option('display.max_colwidth',1000)

In [2]:
train_all = load_train()
test_all = load_testA()
preprocess_basic_time(train_all)
preprocess_basic_wifi(train_all)
shop_info = load_shop_info()

In [4]:
class CVEstimator(object):
    def __expansion(trainx, trainy, cv):
        # 对样本少的进行复制扩充
        bin = np.bincount(trainy)
        labels = np.unique(trainy)
        l = np.asarray(range(np.max(labels) + 1))[bin < cv]
        l = np.intersect1d(l, labels)
        for _l in l:
            n = (trainy == _l).sum()
            n = int(np.ceil(float(cv) / n - 1))
            trainx = np.concatenate([trainx, np.tile(trainx[trainy == _l], (n, 1))], axis=0)
            trainy = np.concatenate([trainy, np.tile(trainy[trainy == _l], (n,))], axis=0)
        return trainx, trainy
    
    def __init__(self, estimator, cv = 3, use_proba = True):
        self.estimator = estimator
        self.cv = cv
        self.kf = StratifiedKFold(cv,shuffle=True)
        from sklearn.base import clone
        self.clfs_ = [clone(self.estimator) for _ in range(self.cv)]
        self.use_proba = use_proba
        self.classes_ = None
        
        
    def fit(self, X,y):
#         if np.unique(y) > 2:
#             self.multiclass = True
#         else:
#             self.multiclass = False
        origin_size = y.shape[0]
        _x,_y = expansion(X,y,self.cv)
        self.indexs = []
        self.ys = []
        self.train_predicts=[]
        for _i,(_train_index,_test_index) in enumerate(self.kf.split(_x,_y)):
            _train_x = _x[_train_index]
            _train_y = _y[_train_index]
            _test_x = _x[_test_index]
            _test_y = _y[_test_index]
            self.clfs_[_i].fit(_train_x, _train_y)
            self.indexs.append(_test_index)
            self.ys.append(_test_y)
            if self.use_proba:
                self.train_predicts.append(self.clfs_[_i].predict_proba(_test_x))
            else:
                self.train_predicts.append(self.clfs_[_i].predict(_test_x))
            if self.classes_  is None:
                self.classes_ = self.clfs_[_i].classes_
            else:
                assert((self.classes_ != self.clfs_[_i].classes_).sum()==0)
        self.indexs = np.concatenate(self.indexs, axis=0)
        self.train_predicts = np.concatenate(self.train_predicts, axis=0)
        self.ys = np.concatenate(self.ys,axis=0)
        all_train_predicts = zip(self.indexs,self.ys,self.train_predicts)
        all_train_predicts = sorted(all_train_predicts, key=lambda x: x[0])
        all_train_predicts = all_train_predicts[:origin_size]
        self.train_ys = np.asarray([_l[1] for _l in all_train_predicts])
        self.train_predicts = np.asarray([_l[2] for _l in all_train_predicts])
        
        
    def get_all_train_predicts(self):
        return self.train_predicts, self.train_ys
        
    def predict(self,X):
        px = self.predict_proba(X)
        p = np.argmax(px,axis=1)
        return self.classes_.take(p)
        
        
    def predict_proba(self,X):
        _y = []
        for _cls in self.clfs_:
            _y.append(_cls.predict_proba(X))
        p = np.max(_y,axis=0)
        return p
        

In [6]:
def get_indexs(df, sorted_wifi, sp = 0.5):
    w= sorted_wifi
    s = gt.shape[0]
#     print s
#     print w
    strong_sig_worst = s * sp
    for _index in range(len(w)):
        if w[_index][1] < strong_sig_worst:
            break
    strong_sig_choose = _index 
    choose_strong_wifi_index = [_wi[0] for _wi in w[:strong_sig_choose]]
    assert(choose_strong_wifi_index > 5)
    choose_strong_wifi_index = np.array(choose_strong_wifi_index)

    indexs = df[df.wifi_name.isin(choose_strong_wifi_index)].wifi_rank.values
    return list(indexs)

def get_indexs2(df, sorted_wifi, topn=20):
    w= sorted_wifi
    s = gt.shape[0]
#     print s
#     print w
    strong_sig_choose = topn 
    choose_strong_wifi_index = [_wi[0] for _wi in w[:strong_sig_choose]]
    assert(choose_strong_wifi_index > 5)
    choose_strong_wifi_index = np.array(choose_strong_wifi_index)

    indexs = df[df.wifi_name.isin(choose_strong_wifi_index)].wifi_rank.values
    return list(indexs)

def get_range_sorted_wifi(sorted_wifi, rank0 = 1, rankmin = 10):
    choose = []
    if rank0 < 1:
        rank0 = 1
    min_sig = 0
    s = len(sorted_wifi)
    while(rank0 <= rankmin and rank0-1 < s):
        choose.append(sorted_wifi[rank0 - 1][0])
        min_sig = sorted_wifi[rank0 - 1][1]
        rank0+=1
    while( rank0-1 < s and sorted_wifi[rank0-1][1] == min_sig ):
        choose.append(sorted_wifi[rank0-1][0])
        rank0+=1
    return choose


def get_intersection_size(sorted_wifi, x, top = 3):
    wi = []
    for _x in x[1]:
        wi.append(_x)
    for _x in x[2]:
        wi.append(_x)    
    wi = sorted(wi,key=lambda x: -x[1])
    wis = get_range_sorted_wifi(wi,rankmin=3)
    return np.intersect1d(sorted_wifi,wis).shape[0]




In [41]:
cvrf = CVEstimator(RandomForestClassifier(n_jobs=-1,n_estimators=388,class_weight="balanced"), use_proba=True, cv=3)
cvrf.fit(_train_x3,train_y)
proba_train = cvrf.predict_proba(_train_x3)
proba_valid = cvrf.predict_proba(_valid_x3)

In [65]:
train.columns

Index([u'user_id', u'shop_id', u'time_stamp', u'longitude', u'latitude',
       u'wifi_infos', u'category_id', u'shop_longitude', u'shop_latitude',
       u'price', u'mall_id', u'dt', u'weekday', u'hour', u'is_weekend',
       u'basic_wifi_info', u'wifi_size', u'use_wifi_size', u'no_use_wifi_size',
       u'use_wifi_freq', u'no_use_wifi_freq', u'i_loc'],
      dtype='object')

In [254]:
train.columns

Index([u'user_id', u'shop_id', u'time_stamp', u'longitude', u'latitude',
       u'wifi_infos', u'category_id', u'shop_longitude', u'shop_latitude',
       u'price', u'mall_id', u'dt', u'weekday', u'hour', u'is_weekend',
       u'basic_wifi_info', u'wifi_size', u'use_wifi_size', u'no_use_wifi_size',
       u'use_wifi_freq', u'no_use_wifi_freq', u'i_loc', u'dayofyear'],
      dtype='object')

In [252]:
valid.columns

Index([u'user_id', u'shop_id', u'time_stamp', u'longitude', u'latitude',
       u'wifi_infos', u'category_id', u'shop_longitude', u'shop_latitude',
       u'price', u'mall_id', u'dt', u'weekday', u'hour', u'is_weekend',
       u'basic_wifi_info', u'wifi_size', u'use_wifi_size', u'no_use_wifi_size',
       u'use_wifi_freq', u'no_use_wifi_freq', u'i_loc', u'dayofyear'],
      dtype='object')

In [9]:
def modify_wifi(tx,vx,train,valid,modify_size=0):
    _tx = tx.copy()
    _vx = vx.copy()
    assert _tx.shape[0] == train.shape[0]
    assert _vx.shape[0] == valid.shape[0]
    assert _tx.shape[1] == _vx.shape[1]
    train_day_of_year = train.dt.dt.dayofyear
    valid_day_of_year = valid.dt.dt.dayofyear
    if modify_size >= 0:
        _tx = _tx[:,:modify_size]
        _vx = _vx[:,:modify_size]
    for _index in range(_tx.shape[1]):
        l = []
        for _wifi_sig in _tx[:,_index]:
            if _wifi_sig != -115:
                l.append(_wifi_sig)
        for _wifi_sig in _vx[:,_index]:
            if _wifi_sig != -115:
                l.append(_wifi_sig)
        modify_median = np.median(l)
#         print modify_median
        for _d in sorted(np.unique(train_day_of_year)):
            d_index = (train_day_of_year==_d) & (_tx[:,_index] != -115)
#             print _d
#             print np.median(_tx[d_index ,_index])
            _tx[d_index ,_index] = _tx[d_index ,_index] - (np.median(_tx[d_index ,_index]) - modify_median)
#             print np.median(_tx[d_index ,_index])
        for _d in sorted(np.unique(valid_day_of_year)):
            d_index = (valid_day_of_year==_d) & (_vx[:,_index] != -115)
#             print _d
#             print np.median(_vx[d_index ,_index])
            _vx[d_index ,_index] = _vx[d_index ,_index] - (np.median(_vx[d_index ,_index]) - modify_median)
#             print np.median(_vx[d_index ,_index])
        
    return _tx, _vx
modify_wifi(train_wifi_all_x[:,[0]],valid_wifi_all_x[:,[0]],train,valid)

(array([], shape=(12023, 0), dtype=float64),
 array([], shape=(4460, 0), dtype=float64))

In [10]:
def exp_wifi(tx,vx):
    _tx = tx.copy()
    _vx = vx.copy()
    __tx = (_tx==-115).astype(int)
    __vx = (_vx==-115).astype(int)
    _tx = np.concatenate([_tx,__tx],axis=1)
    _vx = np.concatenate([_vx,__vx],axis=1)
    return _tx,_vx



In [11]:
def get_specific_wifi_size(x,wifi_names):
    s = 0
    for _x in x[1]:
        if _x[0] in wifi_names:
            s += 1
    for _x in x[2]:
        if _x[0] in wifi_names:
            s += 1
    return s


In [606]:
mall_id = "m_4422" # 6587 ,9 21 36 72 77 86 
train = train_all[train_all.mall_id == mall_id]
# label
y = train.shop_id.values
le = LabelEncoder().fit(y)
y = le.transform(y)
#split
_train_index, _valid_index = get_last_one_week_index(train)
valid = train.iloc[_valid_index]
train = train.iloc[_train_index]
#wifi info
df, (train_index, train_use_wifi, train_matrix), (test_index, test_use_wifi, test_matrix) = get_wifi_cache2(mall_id)
train_wifi_all_x = train_matrix[_train_index]
valid_wifi_all_x = train_matrix[_valid_index]
valid_y = y[_valid_index]
train_y = y[_train_index]
train_lonlats = train[["longitude","latitude"]].values
valid_lonlats = valid[["longitude","latitude"]].values
train_wh = train[["weekday","hour"]].values
valid_wh = valid[["weekday","hour"]].values
train_w = train[["weekday"]].values
valid_w = valid[["weekday"]].values
train_h = train[["hour"]].values
valid_h = valid[["hour"]].values



# rf3 string wifi + lonlat + wh
indexs = choose_strong_wifi_index(-90,6,train_wifi_all_x)
_train_x3 = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh],axis=1)
_valid_x3 = np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh],axis=1)
lb = LabelBinarizer().fit(y)
_train_b_y = lb.transform(train_y)
_valid_b_y = lb.transform(valid_y)

ss = StandardScaler().fit(_train_x3)
_train_x3_ss = ss.transform(_train_x3)
_valid_x3_ss = ss.transform(_valid_x3)
norm = Normalizer().fit(_train_x3)
_train_x3_norm = norm.transform(_train_x3)
_valid_x3_norm = norm.transform(_valid_x3)

_train_x33 = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_w],axis=1)
_valid_x33 = np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_w],axis=1)
valid_index = np.asarray(range(valid.shape[0]))
train_index = np.asarray(range(train.shape[0]))

# 一天内的交易次数
train["dayofyear"] = train.dt.dt.dayofyear
train["isweekend"] = (train.dt.dt.weekday >=5).astype(int)
oneday_count = train.groupby(["user_id","dayofyear"])["shop_id"].count().reset_index()
oneday_count.rename(columns={"shop_id":"oneday_count"},inplace=True)
train_one_day_count = pd.merge(train,oneday_count,on=["user_id","dayofyear"])["oneday_count"].values
train_one_day_count = train_one_day_count.reshape((-1,1))

valid["dayofyear"] = valid.dt.dt.dayofyear
valid["isweekend"] = (valid.dt.dt.weekday >=5).astype(int)
oneday_count = valid.groupby(["user_id","dayofyear"])["shop_id"].count().reset_index()
oneday_count.rename(columns={"shop_id":"oneday_count"},inplace=True)
valid_one_day_count = pd.merge(valid,oneday_count,on=["user_id","dayofyear"])["oneday_count"].values
valid_one_day_count = valid_one_day_count.reshape((-1,1))

#是否连接wifi
train_connect_wifi = (train.basic_wifi_info.map(lambda x: len(x[1])).values > 0).astype(int).reshape(-1,1)
valid_connect_wifi = (valid.basic_wifi_info.map(lambda x: len(x[1])).values > 0).astype(int).reshape(-1,1)

In [915]:
#  多分类
rf_all = RandomForestClassifier(n_estimators=500,n_jobs=-1,class_weight="balanced")
train_all_x = np.concatenate([train_wifi_all_x[:, indexs], train_lonlats, train_wh],axis=1)
valid_all_x = np.concatenate([valid_wifi_all_x[:, indexs], valid_lonlats, valid_wh],axis=1)
rf_all.fit(train_all_x,train_y)
rf_all_pba = rf_all.predict_proba(valid_all_x)
acc(rf_all.predict(valid_all_x),valid_y)

0.7704034207101692

In [918]:
train[train.shop_id=="s_3899912"]

Unnamed: 0,user_id,shop_id,time_stamp,longitude,latitude,wifi_infos,category_id,shop_longitude,shop_latitude,price,...,wifi_size,use_wifi_size,no_use_wifi_size,use_wifi_freq,no_use_wifi_freq,i_loc,dayofyear,isweekend,minute,hour_minute
216651,u_60207157,s_3899912,2017-08-24 19:50,121.365462,32.31647,b_22564361|-54|false;b_7875936|-64|false;b_18355877|-65|false;b_22564128|-53|true;b_1085662|-85|false;b_22564113|-45|false;b_31239911|-59|false;b_1085661|-84|false;b_7874902|-57|false;b_22564400|-54|false,c_38,121.364908,32.3169,42,...,10,1,9,0.1,0.9,3914,236,0,50,1190
411064,u_50471249,s_3899912,2017-08-24 15:40,121.365402,32.316492,b_22564176|-80|false;b_22563989|-78|false;b_22564180|-68|false;b_22564382|-60|false;b_22564075|-70|false;b_22564126|-69|false;b_22561956|-82|false;b_22564083|-80|false;b_22564354|-71|false;b_22564361|-67|false,c_38,121.364908,32.3169,42,...,10,0,10,0.0,1.0,7573,236,0,40,940


In [922]:
valid[valid.shop_id=="s_3899912"]

Unnamed: 0,user_id,shop_id,time_stamp,longitude,latitude,wifi_infos,category_id,shop_longitude,shop_latitude,price,...,use_wifi_size,no_use_wifi_size,use_wifi_freq,no_use_wifi_freq,i_loc,dayofyear,isweekend,minute,hour_minute,sample_index
45848,u_10753315,s_3899912,2017-08-26 14:00,121.365487,32.316457,b_22564112|-46|false;b_7259563|-62|false;b_7875936|-63|false;b_30465620|-72|false;b_38265962|-82|false;b_22564172|-52|false;b_22564128|-46|false;b_18355877|-70|false;b_31239911|-58|false;b_30465621|-71|false,c_38,121.364908,32.3169,42,...,0,10,0.0,1.0,913,238,1,0,840,217
72382,u_18011519,s_3899912,2017-08-28 20:30,121.365457,32.316466,b_22564361|-58|false;b_22564126|-65|false;b_22564112|-46|false;b_22562099|-66|false;b_22564399|-58|false;b_22564160|-67|false;b_22564184|-60|false;b_22564180|-69|false;b_22562123|-68|false;b_22564302|-62|false,c_38,121.364908,32.3169,42,...,0,10,0.0,1.0,1494,240,0,30,1230,337
346074,u_31240761,s_3899912,2017-08-27 11:20,121.365404,32.316491,b_22564399|-51|false;b_31239911|-52|false;b_22564113|-55|false;b_30465620|-72|false;b_22564393|-53|false;b_22564112|-42|false;b_22564400|-61|false;b_22564128|-44|false;b_40865719|-74|false;b_22564172|-49|false,c_38,121.364908,32.3169,42,...,0,10,0.0,1.0,6416,239,1,20,680,1614
437071,u_58469715,s_3899912,2017-08-27 15:40,121.365444,32.316471,b_22564321|-68|false;b_22564172|-56|false;b_22564393|-60|false;b_22564399|-52|false;b_22564382|-65|false;b_22564160|-74|false;b_22564128|-51|false;b_22564302|-64|false;b_22564112|-49|false;b_22529300|-76|false,c_38,121.364908,32.3169,42,...,0,10,0.0,1.0,8113,239,1,40,940,2090
474216,u_3377920,s_3899912,2017-08-31 13:40,121.365485,32.316465,b_22564321|-64|true;b_7875936|-71|false;b_48321683|-62|false;b_40865719|-60|false;b_31239911|-65|false;b_50022461|-64|false;b_20565143|-65|false;b_202743|-73|false;b_7874902|-59|false;b_14811282|-68|false,c_38,121.364908,32.3169,42,...,1,9,0.1,0.9,8763,243,0,40,820,2257
503596,u_11476755,s_3899912,2017-08-26 19:00,121.365448,32.316469,b_22564302|-67|false;b_31239911|-63|false;b_22564160|-79|false;b_22529440|-88|false;b_39332839|-86|false;b_22529374|-92|false;b_22529300|-82|false;b_39332838|-87|false;b_22564340|-85|false;b_22564356|-77|false,c_38,121.364908,32.3169,42,...,0,10,0.0,1.0,9400,238,1,0,1140,2472
560331,u_27430923,s_3899912,2017-08-30 11:00,121.365452,32.316477,b_7962419|-70|false;b_22564113|-43|false;b_7875936|-57|false;b_22564173|-58|false;b_7259563|-54|false;b_22564129|-48|false;b_22564394|-53|false;b_57081913|-63|false;b_2169598|-57|false;b_18355877|-61|false,c_38,121.364908,32.3169,42,...,0,10,0.0,1.0,10475,242,0,0,660,2751
603691,u_40201447,s_3899912,2017-08-28 14:00,121.365418,32.316485,b_7874902|-75|false;b_22564173|-71|false;b_22564362|-72|false;b_22564113|-44|false;b_22564185|-63|false;b_22564129|-58|false;b_22564383|-60|false;b_22564400|-57|false;b_22564394|-58|false;b_7875936|-69|false,c_38,121.364908,32.3169,42,...,0,10,0.0,1.0,11347,240,0,0,840,2974
955612,u_10716663,s_3899912,2017-08-30 13:10,121.365423,32.316475,b_22564085|-78|false;b_22564298|-82|false;b_22564112|-50|false;b_22564393|-55|false;b_22564160|-73|false;b_22564321|-74|false;b_22529374|-77|false;b_22529300|-76|false;b_22564356|-70|false;b_22564158|-80|false,c_38,121.364908,32.3169,42,...,0,10,0.0,1.0,17677,242,0,10,790,4614


In [921]:
df[df.wifi_name=="b_18355877"]

Unnamed: 0,wifi_rank,wifi_name,wifi_num
1772,1772,b_18355877,4


In [917]:
train.groupby("shop_id").count()["user_id"].reset_index().sort_values("user_id")

Unnamed: 0,shop_id,user_id
37,s_3899912,2
60,s_591798,2
13,s_282583,5
26,s_3570746,8
48,s_422407,11
68,s_647740,13
40,s_3939061,16
42,s_3984488,18
77,s_662040,26
61,s_594743,26


In [889]:
def get_top_k(pba,k = 3, min_proba = 0):
    _pba = pba.copy()
    def top_k(x,k):
        rs = []
        for _ in range(k):
            ind = np.argmax(x)
            if x[ind] < min_proba:
                continue
            rs.append(ind)
            x[ind] = 0
        return rs
    r = map(lambda x: top_k(x,k), _pba)
    return r

def acc_top_k(candidate, y):
    all_size = len(candidate)
    cor = 0
    for _can,_true in zip(candidate,y):
        if _true in _can:
            cor += 1
    return float(cor) / all_size



In [914]:
# 候选10 个, 最小概率0.02
candidate = get_top_k(rf_all_pba, 2, 0.01)
candidate = [rf_all.classes_.take(_can) for _can in candidate]
acc_top_k(candidate, valid_y)

0.8549916341327384

In [895]:
train["minute"] = train.dt.dt.minute
train["hour_minute"] = train.hour * 60 + train.minute
valid["minute"] = valid.dt.dt.minute
valid["hour_minute"] = valid.hour * 60 + valid.minute
train["dayofyear"] = train.dt.dt.dayofyear
valid["dayofyear"] = valid.dt.dt.dayofyear
valid["sample_index"] = range(valid.shape[0])

In [896]:
tud = train.groupby(["user_id", "dayofyear"])["shop_id"].count().reset_index()
tud = tud[tud.shop_id >=2]
vud = valid.groupby(["user_id", "dayofyear"])["longitude"].count().reset_index()
vud = vud[vud.longitude >= 2]

In [904]:
modify_candidate_u = copy_candidate(candidate)
def remove_candidate_by_user(candidate,valid, train, deresase_to, le):
    u = train.groupby("user_id")["shop_id"].count().reset_index().sort_values("shop_id",ascending=False)
    u.rename(columns={"shop_id":"count"},inplace=True)
    uids = u[u["count"] >= (train.dayofyear.unique().shape[0] / 2)].user_id.values
    for _ind,_can in enumerate(candidate):
        if len(_can) > deresase_to:
            _uid = valid.iloc[_ind].user_id
            if _uid in uids:
                new_set = []
#                 print le.transform(valid.iloc[[_ind]]["shop_id"].values)
#                 print _can
                _max = np.bincount(le.transform(train[train.user_id == _uid].shop_id.values))
                
                while len(new_set) < deresase_to:
                    _m = np.argmax(_max)
                    if _max[_m] != 0:
                        if _m in _can:
                            new_set.append(_m)
                    else:
                        break
                    _max[_m] = 0
                
                if len(new_set) == deresase_to:
#                     print new_set
                    candidate[_ind] = np.asarray(new_set)
#                 print "=================="
                
remove_candidate_by_user(modify_candidate_u, valid,train,deresase_to=1, le=le)
acc_top_k(modify_candidate_u, valid_y)

0.8542480014872653

In [910]:
print_statistic_candidate(candidate,valid_y)
print_statistic_candidate(modify_candidate_u,valid_y)
print_statistic_candidate(modify_candidate,valid_y)

all num [   0  695 4684]
cor num [   0  695 3904]
all num [   0  738 4641]
cor num [   0  733 3862]
all num [   0  738 4641]
cor num [   0  733 3862]


all num [   0  738 4641]
cor num [   0  733 3862]


In [907]:
shops_cor_60time = {}
shops_cor_1000time = {}
shops_cor_30time = {}
_ = map(lambda x:corr_analyse_in_time(x,train, shops_cor_30time,time=30),tud[["user_id", "dayofyear"]].values)
_ = map(lambda x:corr_analyse_in_time(x,train, shops_cor_60time,time=60),tud[["user_id", "dayofyear"]].values)
_ = map(lambda x:corr_analyse_in_time(x,train, shops_cor_1000time,time=1000),tud[["user_id", "dayofyear"]].values)

In [908]:
shops_cor_10time = {}
_ = map(lambda x:corr_analyse_in_time(x,train, shops_cor_10time,time=10),tud[["user_id", "dayofyear"]].values)

In [909]:
modify_candidate = copy_candidate(modify_candidate_u)        
#一天内
_ = map(lambda x:remove_candidate(x,valid,shops_cor_10time,modify_candidate,le, stay=1, time = 10),vud[["user_id","dayofyear"]].values)
acc_top_k(modify_candidate, valid_y)

0.8542480014872653

In [47]:
# 按hour 对最可能去的商店进行排序
ht = train.groupby(["shop_id","hour"])["longitude"].count().reset_index().sort_values(["hour","longitude"],ascending=False)

In [52]:

modify_candidate4 = copy_candidate(modify_candidate3)
def remove_candidate_by_hour(candidate,valid,ht, deresase_to, le):
    for _ind,_can in enumerate(candidate):
        if len(_can) > deresase_to:
            _can = le.inverse_transform(_can)
            _h = valid.iloc[[_ind]]["hour"].values[0]
            _temp = ht[ht.hour==_h]["shop_id"].values
            new_set = []
            for _sn in _temp:
                if _sn in _can:
                    new_set.append(_sn)
                if len(new_set) >= deresase_to:
                    break
            if len(new_set) >=deresase_to:
                candidate[_ind] = le.transform(new_set)
                
remove_candidate_by_hour(modify_candidate4, valid,ht,deresase_to=7, le=le)
acc_top_k(modify_candidate4, valid_y)

0.9800448430493274

In [53]:
print statistic_candidate(candidate)
print statistic_candidate(modify_candidate)
print statistic_candidate(modify_candidate2)
print statistic_candidate(modify_candidate3)
print statistic_candidate(modify_candidate4)

[   0 2265 1097  382  188  155  104   60   57   30  122]
[   0 2267 1098  391  186  151  129   53   49   26  110]
[   0 2272 1100  392  182  171  106   52   49   26  110]
[   0 2273 1100  408  178  158  106   52   49   26  110]
[   0 2273 1100  408  178  158  106  223    4    4    6]


In [None]:
many_dict = {}
for _ind,_s in enumerate(many_set):
    if _ind % 20 == 0:
        print "{}/{}".format(_ind,len(many_set))
    _ss = _s.split(",")
    s1 = int(_ss[0])
    _indexs = set(all_choose[s1])
    _train_bool_index =  (_train_b_y[:,s1] == 1)
    for _k in range(1,len(_ss)):
        s2 = int(_ss[_k])
        _indexs = _indexs.union(set(all_choose[s2]))
        _train_bool_index =  _train_bool_index | (_train_b_y[:,s2] == 1)
   
    _indexs = list(_indexs) 
    _rf = RandomForestClassifier(n_jobs=-1,n_estimators=288,class_weight="balanced")
    ptrain_x = train_wifi_all_x[_train_bool_index][:,_indexs] 
    ptrain_y = train_y[_train_bool_index] 
    ptrain_x = np.concatenate([ptrain_x, train_lonlats[_train_bool_index]],axis=1)
    _rf.fit(ptrain_x,ptrain_y)
    many_dict[_s] = _rf

In [32]:
ap = np.concatenate([all_predict,last_p.reshape(-1,1)],axis=1)

In [33]:
lp = np.asarray(map(lambda x:np.argmax(np.bincount(x)),ap))

In [34]:
acc(lp,valid_y)

0.9426008968609866

In [101]:
k = 0
scores = np.zeros((len(valid_y),_train_b_y.shape[1]))
base_score = 0.5
for _i in range(len(sids)): 
    for _j in range(_i+1, len(sids)): 
        s1 = sids[_i] 
        s2 = sids[_j] 
        _p = all_predict[:,k]
        for _k,_v in enumerate(_p):
            if _v == _train_b_y.shape[1]:
                scores[_k,:] += base_score
                scores[_k,s1] -= base_score
                scores[_k,s2] -= base_score
            else:
                scores[_k,_v]+=1
        k += 1

In [102]:
acc(np.argmax(scores,axis=1),valid_y)

0.9181614349775785

In [91]:
all_predict[all_predict==-1] =_train_b_y.shape[1]

In [96]:
def amax(x):
    l = np.bincount(x)
    l[-1] = 0
    return np.argmax(l)

In [100]:
acc(np.asarray(map(lambda x:amax(x), all_predict)),valid_y)

0.9367713004484305

In [187]:
sids = sorted(all_choose.keys())
for _i in range(len(sids)):
    for _j in range(_i+1, len(sids)):
        s1 = sids[_i]
        s2 = sids[_j]
        _indexs = list(set(all_choose[s1]).union(set(all_choose[s2])))
        _key = str(s1) + "," + str(s2)
        print _key
        
        
        break
    break

0,1
[1 0 0 ..., 0 0 0]


In [649]:
import matplotlib.pyplot as plt
import seaborn as ses

In [911]:
train_times = train[["weekday","hour","is_weekend"]].values
valid_times = valid[["weekday","hour","is_weekend"]].values
train_connect_wifi = (train.basic_wifi_info.map(lambda x: len(x[1])).values > 0).astype(int).reshape(-1,1)
valid_connect_wifi = (valid.basic_wifi_info.map(lambda x: len(x[1])).values > 0).astype(int).reshape(-1,1)
train_search_wifi_size = train.basic_wifi_info.map(lambda x: x[0]).values.reshape(-1,1)
valid_search_wifi_size = valid.basic_wifi_info.map(lambda x: x[0]).values.reshape(-1,1)

In [586]:
def forward_search(estimetor, trainx,trainy,fix_trainx,validx,validy,fix_validx):
    choose = []
    best_acc=0
    cc = 0
    while cc != -1:
        cc = -1
        for _i in range(trainx.shape[1]):
            if _i not in choose:
                curr = choose + [_i]
                ptrainx = np.concatenate([trainx[:,curr], fix_trainx],axis=1)
                pvalidx = np.concatenate([validx[:,curr], fix_validx],axis=1)
                m.fit(ptrainx,trainy)
                _acc = acc(m.predict(pvalidx),validy)
                if _acc > best_acc:
                    best_acc = _acc
                    cc = _i
        if cc != -1:
            print "choose", cc
            print "best acc", best_acc
            choose.append(cc)
        if best_acc == 1:
            break
    return choose, best_acc
                
            

In [711]:
def forward_search_cv(estimator, trainx,trainy,fix_trainx,cv=3):
    choose = []
    best_acc=0
    cc = 0
    while cc != -1:
        cc = -1
        for _i in range(trainx.shape[1]):
            if _i not in choose:
                curr = choose + [_i]
                ptrainx = np.concatenate([trainx[:,curr], fix_trainx],axis=1)
                _acc = np.mean(cross_val_score(estimator,ptrainx,trainy,cv=cv))
                if _acc > best_acc:
                    best_acc = _acc
                    cc = _i
        if cc != -1:
            print "choose", cc
            print "best acc", best_acc
            choose.append(cc)
        if best_acc == 1:
            break
    return choose, best_acc

In [757]:
# 周末与非周末 2个模型

for _s1,_s2 in [(66,74),(68,70),(70,75),(67,72),(44,53),(36,50),(67,75)]:

    s1 = _s1
    print "s1", le.inverse_transform([s1])
    s2 = _s2
    print "s2", le.inverse_transform([s2])
    print
    s1_train = train[_train_b_y[:,s1] == 1]
    # s1_train = s1_train[s1_train.dayofyear >= 231]
    s2_train = train[_train_b_y[:,s2] == 1]
    s1_valid = valid[_valid_b_y[:,s1] == 1]
    s2_valid = valid[_valid_b_y[:,s2] == 1]
    print "s1 train shape",s1_train.shape
    print "s2 train shape",s2_train.shape
    print "s1 valid shape",s1_valid.shape
    print "s2 valid shape",s2_valid.shape

    s1_weekday_wifi_all_x = train_wifi_all_x[(_train_b_y[:,s1] == 1) & (train.is_weekend == 0)]
    s1_weekend_wifi_all_x = train_wifi_all_x[(_train_b_y[:,s1] == 1) & (train.is_weekend == 1)]

    s2_weekday_wifi_all_x = train_wifi_all_x[(_train_b_y[:,s2] == 1) & (train.is_weekend == 0)]
    s2_weekend_wifi_all_x = train_wifi_all_x[(_train_b_y[:,s2] == 1) & (train.is_weekend == 1)]

    ratio = 0.1

    # weekday
    is_weekend=0
    sp1 = int(s1_weekday_wifi_all_x.shape[0] * ratio)
    sp2 = int(s2_weekday_wifi_all_x.shape[0] * ratio)
    s1_indexs = choose_strong_wifi_index(-115,sp1,s1_weekday_wifi_all_x)
    s2_indexs = choose_strong_wifi_index(-115,sp2,s2_weekday_wifi_all_x)
    _indexs = list(set(s1_indexs).union(set(s2_indexs)))

    _train_weekday_bool_index = ((_train_b_y[:,s1] == 1) | (_train_b_y[:,s2]==1)) & (train.is_weekend == is_weekend)
    _valid_weekday_bool_index = ((_valid_b_y[:,s1] == 1) | (_valid_b_y[:,s2]==1)) & (valid.is_weekend == is_weekend)
    ptrain_x = train_wifi_all_x[_train_weekday_bool_index][:,_indexs]
    ptrain_y = train_y[_train_weekday_bool_index]
    pvalid_x = valid_wifi_all_x[_valid_weekday_bool_index][:,_indexs]
    pvalid_y = valid_y[_valid_weekday_bool_index]
    pvalid = valid[_valid_weekday_bool_index]

    ptrain_x = np.concatenate([ptrain_x, 
                               train_lonlats[_train_weekday_bool_index],
    #                            train_times[_train_bool_index][:,[2]]
                              ],axis=1)

    pvalid_x = np.concatenate([pvalid_x,
                               valid_lonlats[_valid_weekday_bool_index],
    #                            valid_times[_valid_bool_index][:,[2]]
                              ], axis=1)

    m = RandomForestClassifier(n_jobs=-1,
                               n_estimators=188,
    #                            min_samples_leaf=2
    #                            min_samples_split=5,
    #                            class_weight="balanced",
                               random_state=2017
                              )

    print "train cross_val_score acc",np.mean(cross_val_score(m,ptrain_x,ptrain_y))
    m.fit(ptrain_x,ptrain_y)
    p2 = m.predict(pvalid_x)
    print p2.shape
    print "valid acc", acc(p2,pvalid_y)

    p_weekday = p2
    r_weekday = pvalid_y

    # weekday
    is_weekend = 1
    sp1 = int(s1_weekday_wifi_all_x.shape[0] * ratio)
    sp2 = int(s2_weekday_wifi_all_x.shape[0] * ratio)
    s1_indexs = choose_strong_wifi_index(-115,sp1,s1_weekday_wifi_all_x)
    s2_indexs = choose_strong_wifi_index(-115,sp2,s2_weekday_wifi_all_x)
    _indexs = list(set(s1_indexs).union(set(s2_indexs)))

    _train_weekday_bool_index = ((_train_b_y[:,s1] == 1) | (_train_b_y[:,s2]==1)) & (train.is_weekend == is_weekend)
    _valid_weekday_bool_index = ((_valid_b_y[:,s1] == 1) | (_valid_b_y[:,s2]==1)) & (valid.is_weekend == is_weekend)
    ptrain_x = train_wifi_all_x[_train_weekday_bool_index][:,_indexs]
    ptrain_y = train_y[_train_weekday_bool_index]
    pvalid_x = valid_wifi_all_x[_valid_weekday_bool_index][:,_indexs]
    pvalid_y = valid_y[_valid_weekday_bool_index]
    pvalid = valid[_valid_weekday_bool_index]

    ptrain_x = np.concatenate([ptrain_x, 
                               train_lonlats[_train_weekday_bool_index],
    #                            train_times[_train_bool_index][:,[2]]
                              ],axis=1)

    pvalid_x = np.concatenate([pvalid_x,
                               valid_lonlats[_valid_weekday_bool_index],
    #                            valid_times[_valid_bool_index][:,[2]]
                              ], axis=1)

    m = RandomForestClassifier(n_jobs=-1,
                               n_estimators=188,
    #                            min_samples_leaf=2
    #                            min_samples_split=5,
    #                            class_weight="balanced",
                               random_state=2017
                              )

    print "train cross_val_score acc",np.mean(cross_val_score(m,ptrain_x,ptrain_y))
    m.fit(ptrain_x,ptrain_y)
    p2 = m.predict(pvalid_x)
    print p2.shape
    print "valid acc", acc(p2,pvalid_y)

    print "all acc", acc(np.hstack([p_weekday,p2]), np.hstack([r_weekday, pvalid_y]))


s1 ['s_648391']
s2 ['s_649379']

s1 train shape (475, 24)
s2 train shape (422, 24)
s1 valid shape (137, 24)
s2 valid shape (142, 24)
train cross_val_score acc 0.742387287509
(158,)
valid acc 0.740506329114
train cross_val_score acc 0.75259730758
(121,)
valid acc 0.768595041322
all acc 0.752688172043


In [912]:
def rank1(x): # 不行啊
        _x = x.copy()
        _x[:] = -999
        for _index, _i in enumerate(x):
            l = zip(_i,range(len(_i)))
            l = sorted(l,key=lambda x: -x[0])
            for _rank,(_sig,_ind) in enumerate(l):
                if _sig == -115:
                    break
                _x[_index,_ind] = _rank
        return _x
def rank2(x,data,wifi_names): # 依然不行啊
    d = dict(zip(wifi_names,range(len(wifi_names))))
    _x = x.copy()
    _x[:] = -999
    for _index, _i in enumerate(x):
        wifi_info = data.iloc[_index].basic_wifi_info
        wifis = []
        for _w in wifi_info[1]:
            wifis.append(_w)
        for _w in wifi_info[2]:
            wifis.append(_w)

        l = sorted(wifis,key=lambda x: -x[1])
        for _rank,(_wn,_sig) in enumerate(l):
            if _wn in d:
                _x[_index,d[_wn]] = _rank
    return _x

In [913]:
def get_one_sample_sorted_wifis(x):
    wifis = []
    for xx in [x[1],x[2]]:
        for _x in xx:
            wifis.append(_x)
    l = sorted(wifis,key=lambda x:-x[1])
    return l

def wifi_rank_mean(train,test):
    wifi_rank_means = {}
    def rank_statistic(x, wifi_rank_means):
        l = get_one_sample_sorted_wifis(x)
        for _rank,(_wn,_sig) in enumerate(l):
            if _wn not in wifi_rank_means:
                wifi_rank_means[_wn] = []
            wifi_rank_means[_wn].append(_rank)
    train.basic_wifi_info.map(lambda x: rank_statistic(x,wifi_rank_means))
    wifi_rank_mean = {}
    for _wn,_means in wifi_rank_means.items():
        wifi_rank_mean[_wn] = np.mean(_means)
    return wifi_rank_mean
def rank3(data,wifi_rank_mean):
    ranks = np.zeros((data.shape[0],10))
#     ranks[:] = 999
    for _index in range(data.shape[0]):
        l = get_one_sample_sorted_wifis(data.iloc[_index].basic_wifi_info)
        for _rank,(_wn,_sig) in enumerate(l):
            if _wn in wifi_rank_mean:
                ranks[_index,_rank] = _rank - wifi_rank_mean[_wn]
    return ranks

In [964]:
def ac(data):
    def _ac(x):
        l = get_one_sample_sorted_wifis(x)
        l = [_l[1] for _l in l]
        s =set()
        for _i in range(len(l)):
            for _j in range(_i+1, len(l)):
                s.add(l[_i] - l[_j])
        return len(s) - (len(l) - 1)
    return data.basic_wifi_info.map(lambda x: _ac(x)).values.reshape(-1,1)


In [965]:
from sklearn.model_selection import  cross_val_score
ps = []
rs = []
ratio = 0.05
for _s1,_s2 in [(66,74),(68,70),(70,75),(67,72),(44,53),(36,50),(67,75),(25,76),(34,67),
               (34,66), (91,92), (66,67), (65,70), (11,33), (72,75), (70,76)]:
    
    
    
    s1 = _s1
    print "s1", le.inverse_transform([s1])
    s2 = _s2
    print "s2", le.inverse_transform([s2])
    print
    s1_train = train[_train_b_y[:,s1] == 1]
    # s1_train = s1_train[s1_train.dayofyear >= 231]
    s2_train = train[_train_b_y[:,s2] == 1]
    s1_valid = valid[_valid_b_y[:,s1] == 1]
    s2_valid = valid[_valid_b_y[:,s2] == 1]
    print "s1 train shape",s1_train.shape
    print "s2 train shape",s2_train.shape
    print "s1 valid shape",s1_valid.shape
    print "s2 valid shape",s2_valid.shape


    s1_wifi_all_x = np.concatenate([train_wifi_all_x[_train_b_y[:,s1] == 1], valid_wifi_all_x[_valid_b_y[:,s1]==1]])
    s2_wifi_all_x = np.concatenate([train_wifi_all_x[_train_b_y[:,s2] == 1], valid_wifi_all_x[_valid_b_y[:,s2]==1]])

    
    
    sp1 = int(s1_wifi_all_x.shape[0] * ratio)
    sp2 = int(s2_wifi_all_x.shape[0] * ratio)
    s1_indexs = choose_strong_wifi_index(-115,sp1,s1_wifi_all_x)
    s2_indexs = choose_strong_wifi_index(-115,sp2,s2_wifi_all_x)
    _indexs = list(set(s1_indexs).union(set(s2_indexs)))
    # _indexs = list(set(all_choose[s1]).union(set(all_choose[s2])))
    # _indexs = [0]
#     _indexs = s1_indexs
    print "wifi indexs",_indexs
    print "wifi size:",len(_indexs)
    _train_bool_index = (_train_b_y[:,s1] == 1) | (_train_b_y[:,s2] == 1)
    _valid_bool_index = (_valid_b_y[:,s1] == 1) | (_valid_b_y[:,s2] == 1)
    
    ptrain_x = train_wifi_all_x[_train_bool_index][:,_indexs]
    ptrain_y = train_y[_train_bool_index]
    pvalid_x = valid_wifi_all_x[_valid_bool_index][:,_indexs]
    pvalid_y = valid_y[_valid_bool_index]
    pvalid = valid[_valid_bool_index]
    ptrain = train[_train_bool_index]
    
    
    
        
    
    ptrain_s1 = train[(_train_b_y[:,s1] == 1)]
    pvalid_s1 = valid[(_valid_b_y[:,s1] == 1)]
    wrm = wifi_rank_mean(ptrain_s1,pvalid_s1)
    ptrain_x_rank = rank3(ptrain, wrm)
    pvalid_x_rank = rank3(pvalid, wrm)
    
    train_ac = ac(ptrain)
    valid_ac = ac(pvalid)
    
    ptrain_x = np.concatenate([ptrain_x,
                               train_ac,
#                                ptrain_x_rank,
                               train_lonlats[_train_bool_index],
                               train_times[_train_bool_index][:,[0,1,2]]
                              ],axis=1)

    pvalid_x = np.concatenate([pvalid_x,
                               valid_ac,
#                                pvalid_x_rank,
                               valid_lonlats[_valid_bool_index],
                               valid_times[_valid_bool_index][:,[0,1,2]]
                              ], axis=1)

    m = RandomForestClassifier(n_jobs=-1,
                               n_estimators=188,
                               random_state=2017
                              )

    print "train cross_val_score:",np.mean(cross_val_score(m,ptrain_x,ptrain_y))
    m.fit(ptrain_x,ptrain_y)
    p2 = m.predict(pvalid_x)
    print p2.shape
    print "valid acc", acc(p2,pvalid_y)
    ps.append(p2)
    rs.append(pvalid_y)
#     break
print "all acc", acc(np.hstack(ps),np.hstack(rs))


s1 ['s_647507']
s2 ['s_649372']

s1 train shape (272, 26)
s2 train shape (213, 26)
s1 valid shape (125, 27)
s2 valid shape (84, 27)
wifi indexs [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 151, 24, 25, 26, 27, 28, 32, 33, 290, 36, 39, 41, 43, 44, 173, 46, 181, 59, 325, 454, 141, 208, 82, 143, 130, 148]
wifi size: 48
train cross_val_score: 0.733992791964
(209,)
valid acc 0.746411483254
s1 ['s_647739']
s2 ['s_648391']

s1 train shape (291, 26)
s2 train shape (475, 26)
s1 valid shape (95, 27)
s2 valid shape (137, 27)
wifi indexs [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 32, 33, 36, 39, 41, 59, 256, 103, 106, 250]
wifi size: 35
train cross_val_score: 0.667141544118
(232,)
valid acc 0.616379310345
s1 ['s_648391']
s2 ['s_649379']

s1 train shape (475, 26)
s2 train shape (422, 26)
s1 valid shape (137, 27)
s2 valid shape (142, 27)
wifi indexs [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,

array([[-1.21965,  0.41262, ...,  0.     ,  2.67949],
       [-1.24545, -2.5963 , ...,  1.0146 ,  1.07224],
       ..., 
       [-1.24545, -1.90612, ...,  0.05556,  2.75758],
       [-1.24545, -1.33333, ...,  4.4037 ,  1.07224]])

array([[   3.,    2., ..., -999., -999.],
       [   1.,    4., ..., -999., -999.],
       ..., 
       [   6.,    0., ..., -999., -999.],
       [   5.,    1., ..., -999., -999.]])

In [None]:
ptrain.basic_wifi_info

In [618]:
def wifi_rank(data, wifis):
    def rank_wifi_one(x,wifis):
        xs = []
        for _x in x[1]:
            xs.append(_x)
        for _x in x[2]:
            xs.append(_x)
        xs = sorted(xs,key=lambda x:-x[1])
        r = []
        if wifis is not None and len(wifis) != 0:
            for _x in xs:
                if _x[0] in wifis:
                    r.append(_x[0])
        else:
            for _x in xs:
                r.append(_x[0])
        return r
    return data.basic_wifi_info.map(lambda x: rank_wifi_one(x,wifis))

In [696]:
def mean_wifi(data1,data2,wifi_num = 30):
    sw = get_sorted_wifi([data1,data2])
    sw = sw[:wifi_num]
    mean_num1 = {}
    mean_num2 = {}
    mean_diff = {}
    def st(x,bsn):
        for xx in [x[1],x[2]]:
            for _x in xx:
                if _x[0] == bsn:
                    return _x[1]

    for _sn,_num in sw:
        sig = data1.basic_wifi_info.map(lambda x:st(x,_sn))
        sig =  sig[~sig.isnull()]
        mean_s = np.mean(sig)
        num_s = len(sig)
        mean_num1[_sn] = (mean_s,num_s)
        sig = data2.basic_wifi_info.map(lambda x:st(x,_sn))
        sig =  sig[~sig.isnull()]
        mean_s = np.mean(sig)
        num_s = len(sig)
        mean_num2[_sn] = (mean_s,num_s)
        mean_diff[_sn] = (mean_num1[_sn][0] - mean_num2[_sn][0],mean_num1[_sn][1],mean_num2[_sn][1])
    return mean_num1,mean_num2,mean_diff
mean_wifi(s1_train,s2_train)
        

({'b_14811282': (-75.628787878787875, 132),
  'b_1918897': (-79.532258064516128, 62),
  'b_20405383': (-74.765625, 64),
  'b_22562099': (-69.832089552238813, 268),
  'b_22562100': (-69.050632911392398, 79),
  'b_22562123': (-79.218181818181819, 165),
  'b_22564160': (-78.605769230769226, 104),
  'b_22564172': (-85.645161290322577, 62),
  'b_22564180': (-79.017142857142858, 175),
  'b_22564184': (-62.262798634812285, 293),
  'b_22564185': (-64.923913043478265, 184),
  'b_22564302': (-67.592982456140348, 285),
  'b_22564303': (-70.238805970149258, 67),
  'b_22564340': (-84.326086956521735, 46),
  'b_22564354': (-84.389830508474574, 59),
  'b_22564361': (-84.509433962264154, 53),
  'b_22564399': (-84.346666666666664, 75),
  'b_22564413': (-83.328571428571422, 70),
  'b_30146810': (-64.635514018691595, 107),
  'b_30465620': (-82.222222222222229, 216),
  'b_30465621': (-77.655913978494624, 93),
  'b_38265961': (-77.703125, 64),
  'b_395093': (-78.0, 65),
  'b_40865719': (-75.904040404040401

In [700]:
df[df.wifi_name=="b_22564302"]

Unnamed: 0,wifi_rank,wifi_name,wifi_num
0,0,b_22564302,5104


In [716]:
l = ["b_22564184","b_22564302","b_22562099",
                     "b_22561769","b_22564180","b_22564354",
                     "b_30465620","b_40865719","b_48770758"]
l = ["b_22564302","b_30146810"]
l=[]
pd.concat([wifi_rank(s1_valid_error, l), s1_valid_error.hour,s1_valid_error.is_weekend],axis=1)

Unnamed: 0,basic_wifi_info,hour,is_weekend
51465,"[b_18579108, b_57081913, b_22564184, b_49272814, b_22562099, b_40865719, b_41997665, b_3486047, b_30373182, b_30465620]",17,1
66846,"[b_13549167, b_22564303, b_22564185, b_49272814, b_53299477, b_20405383, b_40865719, b_22564383, b_30373182]",14,1
93049,"[b_39783023, b_18579108, b_22562100, b_11329893]",12,1
176502,"[b_22564184, b_22562099, b_36209576, b_22562100, b_22564302, b_18579108, b_22564399, b_22564160, b_30465620, b_22564172]",11,0
213743,"[b_22564185, b_22562100, b_22562099, b_22564302, b_38265961, b_40865719, b_7962419, b_22564160, b_22564180, b_22562123]",19,0
236586,"[b_22564185, b_22564184, b_22562099, b_22562123, b_22564302, b_40865719, b_42538226, b_22564180, b_30465621, b_30465620]",12,0
295725,"[b_22562100, b_18579108, b_22564303, b_22564184, b_22564185, b_40865719, b_55968864, b_46277623, b_20405383, b_30465620]",18,0
297117,"[b_24680413, b_18579108, b_22562099, b_22562100, b_22564302, b_55962405, b_22564361, b_22564413, b_22564180, b_22564340]",18,1
322834,"[b_22529356, b_22529357, b_22529422, b_22529438, b_22529391, b_22564187, b_22564174, b_22529069, b_22564162, b_22530875]",20,0
402549,"[b_35006331, b_22564185, b_40865719, b_42307981, b_22564184, b_46277623, b_22562099, b_22564302, b_22564180, b_30465620]",17,0


In [717]:
pd.concat([wifi_rank(s2_valid_error, l), s2_valid_error.hour,s2_valid_error.is_weekend],axis=1)

Unnamed: 0,basic_wifi_info,hour,is_weekend
12974,"[b_22564302, b_22562099, b_22564180, b_26055286, b_20405383, b_22564361, b_22564172, b_22564413, b_8745165, b_22563989]",17,0
13512,"[b_22564184, b_22562099, b_31611365, b_22564185, b_22562100, b_40865719, b_57081913, b_42271229, b_51893017, b_32174303]",20,0
39632,"[b_22564161, b_30465621, b_46277623, b_33408329, b_10569950, b_49272814, b_48770758, b_17609383, b_2169774, b_48321683]",12,0
50013,"[b_57081913, b_22564184, b_18579108, b_41659513, b_7962419, b_41997665, b_47796213, b_49272814, b_14811282, b_30465620]",18,0
76174,"[b_22564184, b_22562099, b_22562123, b_22562100, b_18579108, b_14811282, b_3486047, b_49272814, b_7259424, b_48770758]",17,1
115454,"[b_22562100, b_22564303, b_22564184, b_22562099, b_22564302, b_14811282, b_22562123, b_22564180, b_22564172, b_22564413]",19,0
164603,"[b_22564184, b_22564185, b_42271229, b_22562099, b_22564303, b_49497048, b_7167752, b_22564160, b_40865719, b_22562123]",20,0
252078,"[b_22564185, b_36438395, b_42271229, b_40865719, b_22564303, b_7962419, b_38265961, b_22564383, b_22564414, b_7167752]",20,0
256103,"[b_22564185, b_30465621, b_38753195, b_7962419, b_10569950, b_57081913, b_40865719, b_7259424, b_17609386, b_17609383]",13,0
279796,"[b_22564184, b_29016456, b_22564302, b_22562099, b_22564180, b_22564298, b_40865719, b_22564382, b_22563989, b_22564340]",19,1


In [698]:
pd.concat([wifi_rank(s1_valid_correct, l), s1_valid_correct.hour,s1_valid_correct.is_weekend],axis=1)

Unnamed: 0,basic_wifi_info,hour,is_weekend
11305,[],10,0
23585,[b_22564302],12,0
36642,[],11,1
39630,[],20,0
45086,[b_22564302],17,0
45087,[],18,0
53589,[b_22564302],12,0
53590,[b_22564302],12,0
65010,[b_22564302],12,1
76175,[],17,1


In [699]:
pd.concat([wifi_rank(s2_valid_correct, l), s2_valid_correct.hour,s2_valid_correct.is_weekend],axis=1)

Unnamed: 0,basic_wifi_info,hour,is_weekend
20078,[b_22564302],13,1
55073,[],19,0
65009,[b_22564302],12,1
70625,[],17,0
73825,[b_22564302],19,0
80424,[b_22564302],15,1
89211,[],17,0
98093,[b_22564302],14,1
100267,[b_22564302],13,1
139563,[],12,1


In [694]:
pd.concat([wifi_rank(s2_train, l), s2_train.hour,s2_train.is_weekend],axis=1)

Unnamed: 0,basic_wifi_info,hour,is_weekend
3594,[b_22564302],10,1
8461,[b_22564302],13,1
8471,[],18,0
10832,[],18,0
13953,[b_22564302],12,1
15200,[],18,0
15780,[b_22564302],10,1
19699,[b_30146810],12,1
20819,[],18,1
22922,"[b_30146810, b_22564302]",11,0


In [593]:
get_sorted_wifi([s1_train])

[('b_22564184', 293),
 ('b_22564302', 285),
 ('b_22562099', 268),
 ('b_30465620', 216),
 ('b_40865719', 198),
 ('b_48770758', 188),
 ('b_22564185', 184),
 ('b_7962419', 181),
 ('b_22564180', 175),
 ('b_49272814', 172),
 ('b_22562123', 165),
 ('b_14811282', 132),
 ('b_30146810', 107),
 ('b_22564160', 104),
 ('b_30465621', 93),
 ('b_22562100', 79),
 ('b_57081913', 78),
 ('b_22564399', 75),
 ('b_46277623', 73),
 ('b_22564413', 70),
 ('b_22564303', 67),
 ('b_395093', 65),
 ('b_38265961', 64),
 ('b_20405383', 64),
 ('b_22564172', 62),
 ('b_1918897', 62),
 ('b_22564354', 59),
 ('b_22564361', 53),
 ('b_22564340', 46),
 ('b_46165431', 40),
 ('b_41997665', 34),
 ('b_30373182', 30),
 ('b_22563989', 30),
 ('b_22564176', 26),
 ('b_50249290', 21),
 ('b_32174303', 20),
 ('b_22564181', 20),
 ('b_22561769', 19),
 ('b_22564298', 14),
 ('b_17609386', 13),
 ('b_49497048', 13),
 ('b_7259424', 12),
 ('b_18579108', 12),
 ('b_2169453', 12),
 ('b_22564414', 11),
 ('b_22564170', 10),
 ('b_22561956', 10),
 ('b_

In [712]:
s1 = 70
print "s1", le.inverse_transform([s1])
s2 = 75
print "s2", le.inverse_transform([s2])
print
s1_train = train[_train_b_y[:,s1] == 1]
# s1_train = s1_train[s1_train.dayofyear >= 231]
s2_train = train[_train_b_y[:,s2] == 1]
s1_valid = valid[_valid_b_y[:,s1] == 1]
s2_valid = valid[_valid_b_y[:,s2] == 1]
print "s1 train shape",s1_train.shape
print "s2 train shape",s2_train.shape
print "s1 valid shape",s1_valid.shape
print "s2 valid shape",s2_valid.shape

s1_wifi_all_x = train_wifi_all_x[_train_b_y[:,s1] == 1]
s2_wifi_all_x = train_wifi_all_x[_train_b_y[:,s2] == 1]
s1_indexs = choose_strong_wifi_index(-115,6,s1_wifi_all_x)
s2_indexs = choose_strong_wifi_index(-115,6,s2_wifi_all_x)
_indexs = list(set(s1_indexs).union(set(s2_indexs)))
# _indexs = list(set(all_choose[s1]).union(set(all_choose[s2])))
# _indexs = [10,8,74]
print "wifi indexs",_indexs
print "wifi size:",len(_indexs)
_train_bool_index = (_train_b_y[:,s1] == 1) | (_train_b_y[:,s2]==1)
_valid_bool_index = (_valid_b_y[:,s1] == 1) | (_valid_b_y[:,s2]==1)
ptrain_x = train_wifi_all_x[_train_bool_index][:,_indexs]
ptrain_y = train_y[_train_bool_index]
pvalid_x = valid_wifi_all_x[_valid_bool_index][:,_indexs]
pvalid_y = valid_y[_valid_bool_index]
pvalid = valid[_valid_bool_index]

ptrain_x = np.concatenate([ptrain_x, 
                           train_lonlats[_train_bool_index],
                          ],axis=1)

pvalid_x = np.concatenate([pvalid_x,
                           valid_lonlats[_valid_bool_index],
                          ], axis=1)

_indexs.append("lon")
_indexs.append("lat")
m = RandomForestClassifier(n_jobs=-1,
                           n_estimators=188,
#                            min_samples_leaf=2
#                            min_samples_split=5,
#                            class_weight="balanced",
                           random_state=2017
                          )


m.fit(ptrain_x,ptrain_y)
p1 = m.predict(ptrain_x)
print "train acc", acc(p1,ptrain_y)
p2 = m.predict(pvalid_x)
print p2.shape
print "valid acc", acc(p2,pvalid_y)

if hasattr(m,"feature_importances_"):
    fi = zip(m.feature_importances_, _indexs)
    fi = sorted(fi,key=lambda x:-x[0])
    find_indexs = []
    for _f in fi[:30]:
        if isinstance(_f[1],int):
            find_indexs.append(_f[1])
    choose,best = forward_search_cv(m,
                   train_wifi_all_x[_train_bool_index][:,find_indexs],
                   train_y[_train_bool_index],
                   train_lonlats[_train_bool_index],
                  )
    if best > acc(p2,pvalid_y):
        choose = [find_indexs[_c] for _c in choose]
        ptrain_x = np.concatenate([train_wifi_all_x[_train_bool_index][:,choose], 
                           train_lonlats[_train_bool_index],
                          ],axis=1)
        pvalid_x = np.concatenate([valid_wifi_all_x[_valid_bool_index][:,choose], 
                                   valid_lonlats[_valid_bool_index],
                                  ],axis=1)
        m.fit(ptrain_x,ptrain_y)
        print choose
    


m.fit(ptrain_x,ptrain_y)
p1 = m.predict(ptrain_x)
print "train acc", acc(p1,ptrain_y)
p2 = m.predict(pvalid_x)
print p2.shape
print "valid acc", acc(p2,pvalid_y)
# pvalid

s1 ['s_648391']
s2 ['s_649379']

s1 train shape (475, 24)
s2 train shape (422, 24)
s1 valid shape (137, 24)
s2 valid shape (142, 24)
wifi indexs [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 25, 26, 27, 28, 32, 33, 290, 36, 1024, 39, 41, 43, 44, 303, 313, 59, 319, 256, 78, 82, 95, 96, 101, 102, 103, 361, 106, 373, 121, 381, 640, 130, 134, 395, 141, 195, 148, 151, 154, 683, 181, 698, 451, 238, 250, 1019]
wifi size: 69
train acc 0.998885172798
(279,)
valid acc 0.756272401434
choose 0
best acc 0.623170673311
choose 2
best acc 0.688946026901
choose 4
best acc 0.729053993052
choose 18
best acc 0.74130970997
choose 5
best acc 0.746947169162
choose 22
best acc 0.749184330567
train acc 0.998885172798
(279,)
valid acc 0.756272401434


In [None]:
# lon lat
# ptrain_lonlat = train_lonlats[(_train_b_y[:,33] == 1) | (_train_b_y[:,61]==1)]
# pvalid_lonlat = valid_lonlats[(_valid_b_y[:,33] == 1) | (_valid_b_y[:,61]==1)]
# ptrain_x = np.concatenate([ptrain_x,ptrain_lonlat],axis=1)
# pvalid_x = np.concatenate([pvalid_x,pvalid_lonlat],axis=1)


# def rank_wifi(x):
#     def _rank(x,m):
#         sig_site = zip(list(x),range(m))
#         sig_site = sorted(sig_site,key=lambda x: -x[0])
#         rs = []
#         for _r, (_sig,_site) in enumerate(sig_site):
#             if _sig == -115:
#                 rs.append((_site,m))
#             else:
#                 rs.append((_site,_r))
#         rs = sorted(rs,key=lambda x:x[0])
#         rs = [_r[1] for _r in rs]
#         return rs
            
#     m = x.shape[1]
#     return np.vstack(map( lambda a:_rank(a,m), x))
    
# ptrain_x_rank = rank_wifi(ptrain_x)
# pvalid_x_rank = rank_wifi(pvalid_x)
# ptrain_x = np.concatenate([ptrain_x,ptrain_x_rank],axis=1)
# pvalid_x = np.concatenate([pvalid_x,pvalid_x_rank],axis=1)





In [177]:


r = {}
probas = []
for i in range(_train_b_y.shape[1]):
#     i=26
    print i
    gt = train.iloc[train_y==lb.classes_[i]]
    ngt = train.iloc[train_y!=lb.classes_[i]]
    if len(gt) != 0:
        
        sorted_wifi = get_sorted_wifi([gt])
        _indexs = get_indexs(df,sorted_wifi,0.1)
       
        cv = 3
        n_sorted_wifi = get_sorted_wifi([ngt])
        _nindexs = get_indexs2(df,n_sorted_wifi,50)
#         _indexs = list(set(_indexs).union(set(_nindexs)))
        prf = get_model(cv = cv)
        otxs= []
        ovxs = []
#         _indexs = _indexs[:1]
        print _indexs
        modify_size =  (np.asarray(_indexs) < 0).sum()
        _tx = train_wifi_all_x[:,_indexs]
        _vx = valid_wifi_all_x[:,_indexs]
        
        __tx,__vx = modify_wifi(_tx,_vx,train,valid,modify_size)
        _tx = np.concatenate([_tx,__tx],axis=1)
        _vx = np.concatenate([_vx,__vx],axis=1)
        
        
        _tx = np.concatenate([_tx,train_lonlats,train_wh],axis=1)
        _vx = np.concatenate([_vx,valid_lonlats,valid_wh],axis=1)

        if cv is not None and cv != 0:
            _tx,_ty = expansion(_tx,_train_b_y[:,i],cv)
        else:
            _ty = _train_b_y[:,i]
            
        prf.fit(_tx,_ty)
        p = prf.predict(_vx)
        proba = prf.predict_proba(_vx)
        probas.append(proba[:,1])
        _acc = acc(p,_valid_b_y[:,i])
        print "origin", _acc
        r[i] = _acc
        print "error shape,", (p != _valid_b_y[:,i]).sum()
        print _valid_b_y[:,i][(p != _valid_b_y[:,i])]
        print "error pos shape", (_valid_b_y[:,i][(p != _valid_b_y[:,i])]==1).sum()
        print "error index"
        print valid_index[(p != _valid_b_y[:,i])]
#         print "correct index"
#         print valid_index[(p == _valid_b_y[:,i]) & (_valid_b_y[:,i]==1)]
        print le.classes_[i]
        print
    else:
        probas.append(np.zeros((valid_y.shape[0],)))
        r[i] = 0
#     break
print r
acc(lb.classes_.take(np.argmax(np.vstack(probas).T,axis=1)), valid_y)

0
[2, 3, 4, 5, 6, 7, 9, 13, 21, 27, 29, 37, 48, 49, 60, 61, 71, 79, 94]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999551569507
error shape, 2
[1 1]
error pos shape 2
error index
[1931 1932]
s_1346456

1
[24, 32, 41, 50, 53, 64, 66, 88, 92, 98, 101, 223, 236, 280, 292, 300, 334, 337, 387, 484, 487, 488, 523]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999551569507
error shape, 2
[1 1]
error pos shape 2
error index
[1193 1551]
s_1392063

2
[23, 26, 34, 65, 123, 133, 169, 171, 230, 255, 302, 373, 405, 408, 540, 774, 776, 783, 800, 801, 810, 837, 924, 932, 974, 1304, 1584, 1969]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training 

Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 1.0
error shape, 0
[]
error pos shape 0
error index
[]
s_3643138

24
[59, 67, 72, 78, 80, 136, 154, 155, 275, 319, 356, 357, 374, 397, 406, 415, 451, 505, 507, 513, 547, 564, 720, 882]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999775784753
error shape, 1
[1]
error pos shape 1
error index
[906]
s_3644057

25
[58, 96, 106, 128, 130, 150, 167, 184, 192, 206, 212, 310, 419, 426, 476]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 1.0
error shape, 0
[]
error pos shape 0
error index
[]
s_3658709

26
[8, 10, 20, 23, 26, 40, 65, 112, 123, 132, 133, 147, 168, 169, 171, 175, 180, 195, 202, 230, 302, 436]
Fitting 1 classifiers...
F

Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.99730941704
error shape, 12
[0 1 1 1 0 0 0 1 0 0 1 1]
error pos shape 6
error index
[ 311 1173 1174 1176 1279 2131 2646 2933 3200 3600 4054 4061]
s_491277

46
[88, 92, 98, 101, 103, 116, 122, 233, 236, 254, 256, 295, 300, 330, 345, 349, 352, 383, 404, 422, 500, 556, 643, 696, 738]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 1.0
error shape, 0
[]
error pos shape 0
error index
[]
s_493201

47
[59, 67, 72, 80, 143, 155, 179, 235, 239, 253, 277, 283, 301, 336, 358, 429, 458, 874, 2263, 2547]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 1.0
error shape, 0
[]
error pos shape 0
error index
[]
s_496413

48
[2, 3, 4, 5, 7, 11, 13

Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999551569507
error shape, 2
[1 1]
error pos shape 2
error index
[627 629]
s_517795

66
[93, 97, 100, 124, 127, 141, 142, 144, 149, 162, 170, 190, 210, 313, 370, 389, 428, 443]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999103139013
error shape, 4
[0 1 1 1]
error pos shape 3
error index
[ 860 2616 3778 3779]
s_522897

67
[15, 24, 32, 41, 45, 47, 50, 52, 64, 66, 70, 131, 146, 151, 153, 164, 186, 187, 209, 216, 243, 247, 307, 348]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.99865470852
error shape, 6
[1 1 1 1 1 1]
error pos shape 6
error index
[ 723 1841 1842 2846 3011 3262]
s_525418

68
[30, 39, 44, 51, 68, 75, 76, 8

0.9482062780269058

In [237]:
r = {}
probas = []
for i in range(_train_b_y.shape[1]):
#     i=26
    print i
    gt = train.iloc[train_y==lb.classes_[i]]
    ngt = train.iloc[train_y!=lb.classes_[i]]
    if len(gt) != 0:
        
        sorted_wifi = get_sorted_wifi([gt])
        _indexs = get_indexs(df,sorted_wifi,0.1)
       
        cv = 3
        n_sorted_wifi = get_sorted_wifi([ngt])
        _nindexs = get_indexs2(df,n_sorted_wifi,50)
#         _indexs = list(set(_indexs).union(set(_nindexs)))
        prf = get_model(cv = cv)
        otxs= []
        ovxs = []
#         _indexs = _indexs[:1]
        print _indexs
        modify_size =  (np.asarray(_indexs) < 6).sum()
        _tx = train_wifi_all_x[:,_indexs]
        _vx = valid_wifi_all_x[:,_indexs]
        __tx,__vx = modify_wifi(_tx,_vx,train,valid,modify_size)
        _tx = np.concatenate([_tx,__tx],axis=1)
        _vx = np.concatenate([_vx,__vx],axis=1)
        
        _tx = np.concatenate([_tx,train_lonlats,train_wh],axis=1)
        _vx = np.concatenate([_vx,valid_lonlats,valid_wh],axis=1)

        if cv is not None and cv != 0:
            _tx,_ty = expansion(_tx,_train_b_y[:,i],cv)
        else:
            _ty = _train_b_y[:,i]
            
        prf.fit(_tx,_ty)
        p = prf.predict(_vx)
        proba = prf.predict_proba(_vx)
        probas.append(proba[:,1])
        _acc = acc(p,_valid_b_y[:,i])
        print "origin", _acc
        r[i] = _acc
        print "error shape,", (p != _valid_b_y[:,i]).sum()
        print _valid_b_y[:,i][(p != _valid_b_y[:,i])]
        print "error pos shape", (_valid_b_y[:,i][(p != _valid_b_y[:,i])]==1).sum()
        print "error index"
        print valid_index[(p != _valid_b_y[:,i])]
#         print "correct index"
#         print valid_index[(p == _valid_b_y[:,i]) & (_valid_b_y[:,i]==1)]
        print le.classes_[i]
        print
    else:
        probas.append(np.zeros((valid_y.shape[0],)))
        r[i] = 0
#     break
print r
acc(lb.classes_.take(np.argmax(np.vstack(probas).T,axis=1)), valid_y)

0
[2, 3, 4, 5, 6, 7, 9, 13, 21, 27, 29, 37, 48, 49, 60, 61, 71, 79, 94]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.99932735426
error shape, 3
[1 1 1]
error pos shape 3
error index
[1930 1931 1932]
s_1346456

1
[24, 32, 41, 50, 53, 64, 66, 88, 92, 98, 101, 223, 236, 280, 292, 300, 334, 337, 387, 484, 487, 488, 523]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999551569507
error shape, 2
[1 1]
error pos shape 2
error index
[1193 1551]
s_1392063

2
[23, 26, 34, 65, 123, 133, 169, 171, 230, 255, 302, 373, 405, 408, 540, 774, 776, 783, 800, 801, 810, 837, 924, 932, 974, 1304, 1584, 1969]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Tra

Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 1.0
error shape, 0
[]
error pos shape 0
error index
[]
s_3643138

24
[59, 67, 72, 78, 80, 136, 154, 155, 275, 319, 356, 357, 374, 397, 406, 415, 451, 505, 507, 513, 547, 564, 720, 882]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999775784753
error shape, 1
[1]
error pos shape 1
error index
[906]
s_3644057

25
[58, 96, 106, 128, 130, 150, 167, 184, 192, 206, 212, 310, 419, 426, 476]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 1.0
error shape, 0
[]
error pos shape 0
error index
[]
s_3658709

26
[8, 10, 20, 23, 26, 40, 65, 112, 123, 132, 133, 147, 168, 169, 171, 175, 180, 195, 202, 230, 302, 436]
Fitting 1 classifiers...
F

Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.996860986547
error shape, 14
[0 1 1 1 0 0 0 1 0 0 0 1 1 0]
error pos shape 6
error index
[ 311 1173 1174 1175 1279 2131 2442 2933 3034 3200 3600 4054 4061 4299]
s_491277

46
[88, 92, 98, 101, 103, 116, 122, 233, 236, 254, 256, 295, 300, 330, 345, 349, 352, 383, 404, 422, 500, 556, 643, 696, 738]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 1.0
error shape, 0
[]
error pos shape 0
error index
[]
s_493201

47
[59, 67, 72, 80, 143, 155, 179, 235, 239, 253, 277, 283, 301, 336, 358, 429, 458, 874, 2263, 2547]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 1.0
error shape, 0
[]
error pos shape 0
error index
[]
s_496413

48
[2, 3, 

origin 0.99798206278
error shape, 9
[1 1 1 1 1 1 0 0 1]
error pos shape 7
error index
[ 692  693 2633 3158 3971 3972 4128 4129 4397]
s_517764

65
[59, 67, 72, 78, 80, 136, 143, 154, 155, 239, 241, 249, 253, 270, 281, 283, 336, 354, 429]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999551569507
error shape, 2
[1 1]
error pos shape 2
error index
[627 629]
s_517795

66
[93, 97, 100, 124, 127, 141, 142, 144, 149, 162, 170, 190, 210, 313, 370, 389, 428, 443]
Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
origin 0.999103139013
error shape, 4
[0 1 1 1]
error pos shape 3
error index
[ 860 2616 3778 3779]
s_522897

67
[15, 24, 32, 41, 45, 47, 50, 52, 64, 66, 70, 131, 146, 151, 153, 164, 186, 187, 209, 216, 243, 247, 

0.9468609865470852

In [152]:
train.columns

Index([u'user_id', u'shop_id', u'time_stamp', u'longitude', u'latitude',
       u'wifi_infos', u'category_id', u'shop_longitude', u'shop_latitude',
       u'price', u'mall_id', u'dt', u'weekday', u'hour', u'is_weekend',
       u'basic_wifi_info', u'wifi_size', u'use_wifi_size', u'no_use_wifi_size',
       u'use_wifi_freq', u'no_use_wifi_freq', u'i_loc', u'day', u'dayofyear'],
      dtype='object')

In [343]:
rf1 = RandomForestClassifier(n_estimators=388,n_jobs=-1,class_weight="balanced",min_weight_fraction_leaf=0.00002)
rf1.fit(_train_x3,train_y)
acc(rf1.predict(_valid_x3),valid_y)

0.8947368421052632

In [665]:
rf2 = RandomForestClassifier(n_estimators=388,n_jobs=-1,class_weight="balanced")
rf2.fit(_train_x3,train_y)
acc(rf2.predict(_valid_x3),valid_y)

0.69780977178741

In [679]:
ovr = OneVsRestClassifier(estimator=RandomForestClassifier(n_estimators=188,n_jobs=-1,class_weight="balanced"),n_jobs=-1)
ovr.fit(_train_x3, train_y)
acc(ovr.predict(_valid_x3),valid_y)

0.9017543859649123

In [6]:
#stack
cv = 3
_x,_y = expansion(_train_x33,train_y,cv)
stack = StackingCVClassifier([
#                                 RandomForestClassifier(n_jobs=-1,n_estimators=388,class_weight="balanced"),
#                               RandomForestClassifier(n_jobs=-1,n_estimators=388,class_weight="balanced",min_weight_fraction_leaf=0.00002),
                              OneVsRestClassifier(estimator=RandomForestClassifier(n_estimators=188,n_jobs=-1,class_weight="balanced"))
                              ],
               
                             OneVsRestClassifier(estimator=RandomForestClassifier(n_estimators=188,n_jobs=-1,class_weight="balanced")),
                             use_probas=True,
                             verbose=1,
                             use_features_in_secondary = True,
                             cv = cv) 
stack.fit(_x,_y)
p = stack.predict(_valid_x33)
acc(p, valid_y)
# OneVsRestClassifier(estimator=RandomForestClassifier(n_estimators=188,n_jobs=-1,class_weight="balanced"))

Fitting 1 classifiers...
Fitting classifier1: onevsrestclassifier (1/1)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...


0.9413919413919414

In [12]:
#stack
cv = 3
_x,_y = expansion(_train_x3,train_y,cv)
stack = StackingCVClassifier([RandomForestClassifier(n_jobs=-1,
                                                     n_estimators=388,
                                                     class_weight="balanced"),
                              OneVsRestClassifier(estimator=RandomForestClassifier(n_estimators=188,
                                                                                   n_jobs=-1,
                                                                                   class_weight="balanced")),
                              ],
               
                             RandomForestClassifier(n_estimators=666,n_jobs=-1,class_weight="balanced"),
                             use_probas=True,
                             verbose=1,
                             use_features_in_secondary = True,
                             cv = cv) 
stack.fit(_x,_y)
p = stack.predict(_valid_x3)
acc(p, valid_y)

Fitting 2 classifiers...
Fitting classifier1: randomforestclassifier (1/2)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
Fitting classifier2: onevsrestclassifier (2/2)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...


0.713126052994333

In [None]:
#stack
cv = 3
_x,_y = expansion(_train_x3,train_y,cv * 2)
stack = StackingCVClassifier([StackingCVClassifier([RandomForestClassifier(n_estimators=500,n_jobs=-1,class_weight="balanced")],
                                                     RandomForestClassifier(n_estimators=800,n_jobs=-1,class_weight="balanced"),
                                                     use_probas=True,
                                                     verbose=0,
                                                     use_features_in_secondary = False,
                                                     cv = cv),
                              OneVsRestClassifier(estimator=StackingCVClassifier([RandomForestClassifier(n_estimators=188,n_jobs=-1,class_weight="balanced")],
                                                         RandomForestClassifier(n_estimators=288,n_jobs=-1,class_weight="balanced"),
                                                         cv=cv,
                                                         verbose=0,
                                                         use_probas=True,
                                                         use_features_in_secondary=True
                                                        )),
                              ],
                             RandomForestClassifier(n_estimators=800,n_jobs=-1,class_weight="balanced"),
                             use_probas=True,
                             use_features_in_secondary = True,
                             verbose=1,
                             cv = cv) 
stack.fit(_x,_y)
p = stack.predict(_valid_x3)
acc(p, valid_y)

In [18]:
rfs = []
for _i in range(_train_b_y.shape[1]):
    rf = RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced")
    rf.fit(_train_x3, _train_b_y[:,_i])
    rfs.append(rf)

In [19]:
# predict
ps = []
for _rf in rfs:
    p = _rf.predict_proba(_valid_x3)[:,1].reshape(-1,1)
    ps.append(p)
p = np.hstack(ps)
p = np.argmax(p,axis=1)
acc(lb.classes_.take(p),valid_y)

0.7097564711288099

In [30]:
negetive_num = 3
rfs2 = []
kf = KFold(n_splits=negetive_num)
for _i in range(_train_b_y.shape[1]):
    _y = _train_b_y[:,_i]
    _y_pos = _y[_y==1]
    _x_pos = _train_x3[_y==1]
    _y_neg = _y[_y!=1]
    _x_neg = _train_x3[_y!=1]
    _rfs = []
    for _, _test_index in kf.split(_x_neg):
        rf = RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced")
        rf.fit(np.concatenate([_x_pos,_x_neg[_test_index]],axis=0),
              np.concatenate([_y_pos,_y_neg[_test_index]],axis=0))
        _rfs.append(rf)
    rfs2.append(_rfs)


In [32]:
# predict
ps = []
for _rfs in rfs2:
    _ps = []
    for _rf in _rfs:
        p = _rf.predict_proba(_valid_x3)[:,1].reshape(-1,1)
        _ps.append(p)
    p = np.mean(_ps,axis=0)
    ps.append(p)
p = np.hstack(ps)
p = np.argmax(p,axis=1)
acc(lb.classes_.take(p),valid_y)

0.69980088834431

In [None]:
rfs3 = []
validxs = []
for _i in range(_train_b_y.shape[1]):
    print _i
    _v =[]
    _rfs = []
    for _sp in [2,5,10]:
        indexs = choose_strong_wifi_index(-115,_sp,train_wifi_all_x)
        _train_x_sp = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh],axis=1)
        _v.append(np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh],axis=1))
        rf = RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced")
        rf.fit(_train_x_sp,_train_b_y[:, _i])
        _rfs.append(rf)
    validxs.append(_v)
    rfs3.append(_rfs)

In [37]:
#predict 
ps = []
for _rfs,_vx in zip(rfs3,validxs):
    _ps = []
    for _rf,_x in zip(_rfs,_vx):
        p = _rf.predict_proba(_x)[:,1].reshape(-1,1)
        _ps.append(p)
    p = np.mean(_ps,axis=0)
    ps.append(p)
p = np.hstack(ps)
p = np.argmax(p,axis=1)
acc(lb.classes_.take(p),valid_y)

0.710522285189156

In [None]:
rfs3 = []
validxs = []
for _i in range(_train_b_y.shape[1]):
    print _i
    _v =[]
    _rfs = []
    for _sp in [2,5,10]:
        indexs = choose_strong_wifi_index(-115,_sp,train_wifi_all_x)
        _train_x_sp = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh],axis=1)
        _v.append(np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh],axis=1))
        rf = RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced")
        rf.fit(_train_x_sp,_train_b_y[:, _i])
        _rfs.append(rf)
    validxs.append(_v)
    rfs3.append(_rfs)

In [199]:
rfs4 = []
validxs =[]
for _sp in [2,6,10]:
    indexs = choose_strong_wifi_index(-90,_sp,train_wifi_all_x)
    _train_x_sp = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh],axis=1)
    validxs.append(np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh],axis=1))
    rf = RandomForestClassifier(n_jobs=-1,n_estimators=500,class_weight="balanced")
    rf.fit(_train_x_sp,train_y)
    rfs4.append(rf)
ps = []
for _rfs,_vx in zip(rfs4, validxs):
    p = _rfs.predict_proba(_vx)
    ps.append(p)
p = np.mean(ps,axis=0)
acc(rfs4[0].classes_.take(np.argmax(p,axis=1)),valid_y)

0.8731856378915203

In [136]:
class CVEstimator(object):
    def __expansion(trainx, trainy, cv):
        # 对样本少的进行复制扩充
        bin = np.bincount(trainy)
        labels = np.unique(trainy)
        l = np.asarray(range(np.max(labels) + 1))[bin < cv]
        l = np.intersect1d(l, labels)
        for _l in l:
            n = (trainy == _l).sum()
            n = int(np.ceil(float(cv) / n - 1))
            trainx = np.concatenate([trainx, np.tile(trainx[trainy == _l], (n, 1))], axis=0)
            trainy = np.concatenate([trainy, np.tile(trainy[trainy == _l], (n,))], axis=0)
        return trainx, trainy
    
    def __init__(self, estimator, cv = 3, use_proba = True):
        self.estimator = estimator
        self.cv = cv
        self.kf = StratifiedKFold(cv,shuffle=True)
        from sklearn.base import clone
        self.clfs_ = [clone(self.estimator) for _ in range(self.cv)]
        self.use_proba = use_proba
        self.classes_ = None
        
        
    def fit(self, X,y):
#         if np.unique(y) > 2:
#             self.multiclass = True
#         else:
#             self.multiclass = False
        origin_size = y.shape[0]
        _x,_y = expansion(X,y,self.cv)
        self.indexs = []
        self.ys = []
        self.train_predicts=[]
        for _i,(_train_index,_test_index) in enumerate(self.kf.split(_x,_y)):
            _train_x = _x[_train_index]
            _train_y = _y[_train_index]
            _test_x = _x[_test_index]
            _test_y = _y[_test_index]
            self.clfs_[_i].fit(_train_x, _train_y)
            self.indexs.append(_test_index)
            self.ys.append(_test_y)
            if self.use_proba:
                self.train_predicts.append(self.clfs_[_i].predict_proba(_test_x))
            else:
                self.train_predicts.append(self.clfs_[_i].predict(_test_x))
            if self.classes_  is None:
                self.classes_ = self.clfs_[_i].classes_
            else:
                assert((self.classes_ != self.clfs_[_i].classes_).sum()==0)
        self.indexs = np.concatenate(self.indexs, axis=0)
        self.train_predicts = np.concatenate(self.train_predicts, axis=0)
        self.ys = np.concatenate(self.ys,axis=0)
        all_train_predicts = zip(self.indexs,self.ys,self.train_predicts)
        all_train_predicts = sorted(all_train_predicts, key=lambda x: x[0])
        all_train_predicts = all_train_predicts[:origin_size]
        self.train_ys = np.asarray([_l[1] for _l in all_train_predicts])
        self.train_predicts = np.asarray([_l[2] for _l in all_train_predicts])
        
        
    def get_all_train_predicts(self):
        return self.train_predicts, self.train_ys
        
    def predict(self,X):
        px = self.predict_proba(X)
        p = np.argmax(px,axis=1)
        return self.classes_.take(p)
        
        
    def predict_proba(self,X):
        _y = []
        for _cls in self.clfs_:
            _y.append(_cls.predict_proba(X))
        p = np.max(_y,axis=0)
        return p
        

In [228]:
cvrf = CVEstimator(RandomForestClassifier(n_jobs=-1,n_estimators=500,class_weight="balanced"), cv=3, use_proba=True)
cvrf.fit(_train_x3, train_y)
cvrf.predict(_valid_x3)
acc(cvrf.predict(_valid_x3),valid_y)

0.8708938120702827

In [379]:
class StackingCVEstimator(object):
    def __init__(self, base_cv_estimators, meta_estimator,use_proba = True,use_feature_in_secondary=False):
        self.base_cv_estimators = base_cv_estimators
        for _cls in self.base_cv_estimators:
            _cls.use_proba = use_proba
        self.meta_estimator = meta_estimator
        self.use_proba = use_proba
        self.use_feature_in_secondary = use_feature_in_secondary
        self.x_num = 1
    
    
    def __check(self,xs):
        if isinstance(xs, np.ndarray):
            xs = [xs]
        if isinstance(xs,list):
            if(len(xs) == 1):
                _xs =xs[0]
                xs = [_xs for _ in range(len(self.base_cv_estimators))]
                self.x_num = 1
            else:
                self.x_num = len(xs)
        return xs
        
    def fit(self,xs,y):
        xs = self.__check(xs)
        if not self.use_feature_in_secondary:
            self.x_num = 1
        if self.x_num == 1:
            self.meta_estimetors = None
        else:
            from sklearn.base import clone
            self.meta_estimetors = [clone(self.meta_estimator) for _ in range(len(xs))]
                
        
        
        for _xs,_cve in zip(xs, self.base_cv_estimators):
            _cve.fit(_xs,y)
        
        # meta predict
        newxs = []
        for _cve in self.base_cv_estimators:
            _newx = _cve.get_all_train_predicts()[0]
            if self.use_proba and _newx.shape[1] == 2:
                _newx = _newx[:,1].reshape(-1,1)
            newxs.append(_newx)
        _newx = np.concatenate(newxs,axis=1)
        if self.meta_estimetors is None:
            if self.use_feature_in_secondary:
                newx = np.concatenate([_newx,xs[0]],axis=1)
            else:
                newx = _newx
            self.meta_estimator.fit(newx,y)
            self.classes_ = self.meta_estimator.classes_
        else:
            for _xs,_meta_cls in zip(xs,self.meta_estimetors):
                newx = np.concatenate([_newx,_xs],axis=1)
                _meta_cls.fit(newx,y)
            self.classes_ = self.meta_estimators[0].classes_
    
        
    def predict(self,Xs):
        return self.classes_.take(np.argmax(self.predict_proba(Xs),axis=1))
    
    def predict_proba(self, Xs):
        ps = []
        Xs = self.__check(Xs)
        for _cls,_xs in zip(self.base_cv_estimators,Xs):
            p = _cls.predict_proba(_xs)
            if self.use_proba and p.shape[1] == 2:
                p = p[:,1].reshape(-1,1)
            ps.append(p)
        ps = np.concatenate(ps, axis=1)
        if self.meta_estimetors is None:
            if self.use_feature_in_secondary:
                newx = np.concatenate([ps,Xs[0]],axis=1)
            else:
                newx = ps
            p = self.meta_estimator.predict_proba(newx)
        else:
            pass
        
        return p
            

In [382]:
i = 11
rf = RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced")
rf.fit(_train_x3, _train_b_y[:,i])
acc(rf.predict(_valid_x3),_valid_b_y[:,i])

0.9955582784499923

In [369]:
trainxs = []
validxs = []
for _sp in [2,6,10]:
    indexs = choose_strong_wifi_index(-90,_sp,train_wifi_all_x)
    _train_x_sp = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh],axis=1)
    trainxs.append(_train_x_sp)
    validxs.append(np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh],axis=1))

In [383]:
scve = StackingCVEstimator([CVEstimator(RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced")),
                            CVEstimator(ExtraTreesClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced")),
                           ],
                           LogisticRegression(C=10,class_weight="balanced"),
                           use_feature_in_secondary=False)
scve.fit(trainxs,_train_b_y[:,i])
acc(scve.predict(validxs),_valid_b_y[:,i])

0.9834584162965232

In [232]:
rfs4 = []
validxs =[]
for _sp in [2,6,10]:
    indexs = choose_strong_wifi_index(-90,_sp,train_wifi_all_x)
    _train_x_sp = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh],axis=1)
    validxs.append(np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh],axis=1))
    rf = CVEstimator(RandomForestClassifier(n_jobs=-1,n_estimators=500,class_weight="balanced"))
    rf.fit(_train_x_sp,train_y)
    rfs4.append(rf)
ps = []
for _rfs,_vx in zip(rfs4, validxs):
    p = _rfs.predict_proba(_vx)
    ps.append(p)
p = np.max(ps,axis=0)
acc(rfs4[0].classes_.take(np.argmax(p,axis=1)),valid_y)

0.8724216959511077

In [342]:
#stack
cv = 3
_x,_y = expansion(_train_x3,train_y,cv)
stack = StackingCVClassifier([RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced"),
                              RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced",min_weight_fraction_leaf=0.00002),
                              ],
                               RandomForestClassifier(n_jobs=-1,n_estimators=888,class_weight="balanced"),
                             use_probas=True,
                             verbose=1,
                             use_features_in_secondary = True,
                             cv = cv) 
stack.fit(_x,_y)
p = stack.predict(_valid_x3)
acc(p, valid_y)
# OneVsRestClassifier(estimator=RandomForestClassifier(n_estimators=188,n_jobs=-1,class_weight="balanced"))

Fitting 4 classifiers...
Fitting classifier1: randomforestclassifier (1/4)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
Fitting classifier2: randomforestclassifier (2/4)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
Fitting classifier3: randomforestclassifier (3/4)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...
Fitting classifier4: randomforestclassifier (4/4)
Training and fitting fold 1 of 3...
Training and fitting fold 2 of 3...
Training and fitting fold 3 of 3...


0.7106754480012253

In [330]:
rf1 = RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced",min_weight_fraction_leaf=0.00002)
rf1.fit(_train_x3,train_y)
p = rf1.predict(_valid_x3)
acc(p, valid_y)

0.7030173073977638

In [331]:
cv1 = CVEstimator(RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced",min_weight_fraction_leaf=0.00002))
cv1.fit(_train_x3,train_y)
p = cv1.predict(_valid_x3)
acc(p, valid_y)

0.7057742380150099

In [335]:
cv2 = CVEstimator(RandomForestClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced",min_weight_fraction_leaf=0.00002))
cv2.fit(_train_x3,train_y)
p = cv2.predict(_valid_x3)
acc(p, valid_y)

0.7025578189615561

In [336]:
cv3 = CVEstimator(ExtraTreesClassifier(n_jobs=-1,n_estimators=188,class_weight="balanced",min_weight_fraction_leaf=0.00002))
cv3.fit(_train_x3,train_y)
p = cv3.predict(_valid_x3)
acc(p, valid_y)

0.6639607903201102

In [337]:
newx = np.concatenate([cv1.get_all_train_predicts()[0],
                       cv2.get_all_train_predicts()[0]],axis=1)

In [338]:
l = LogisticRegression(class_weight="balanced", n_jobs=-1,C=30)
l.fit(newx,train_y)
p1 = cv1.predict_proba(_valid_x3)
p2 = cv2.predict_proba(_valid_x3)
newvalidx = np.concatenate([p1,p2],axis=1)
p = l.predict(newvalidx)
acc(p,valid_y)

0.7073058661357022

In [None]:
l = RandomForestClassifier(class_weight="balanced", n_jobs=-1,n_estimators=500)
l.fit(newx,train_y)
p1 = cv1.predict_proba(_valid_x3)
p2 = cv2.predict_proba(_valid_x3)
newvalidx = np.concatenate([p1,p2],axis=1)
p = l.predict(newvalidx)
acc(p,valid_y)

In [7]:
52,55': 30class Stacking(object):

    def __init__(self, base_clses,
                 meta_cls,
                 use_prob=False,
                 kfold=-1,
                 stratify=True,
                 Kfold_shuffle=False,
                 num_class = None,
                 use_features_in_secondary = False):
        """

        :param base_clses: sklearn model 或者lightgbm lgb的话存入tuple("lgb", params)
        :param meta_cls:
        :param objective:
        """
        self.base_clses = base_clses
        self.meta_cls = meta_cls
        self.spetial_base_clses = {}
        self.use_prob = use_prob
        self.kfold = kfold
        self.Kfold_shuffle = Kfold_shuffle
        self.stratify = stratify
        self.num_class = num_class
        self.use_feature_in_secondary = use_features_in_secondary
        if self.kfold > 0:
            self.cv = True
            self.sklearn_models = {}
            self.spetial_base_clses_params = {}
            for _cls in base_clses:
                if _cls[0].startswith("lgb"):
                    for _i in range(self.kfold):
                        lgb_name = _cls[0] + "_" + str(_i)
                        self.spetial_base_clses_params[lgb_name] = (lgb_name, _cls[1].copy())
                elif _cls[0].startswith("xgb"):
                    for _i in range(self.kfold):
                        xgb_name = _cls[0] + "_" + str(_i)
                        self.spetial_base_clses_params[xgb_name] = (xgb_name, _cls[1].copy())
                else:
                    base_name = _cls[0]
                    for _i in range(self.kfold):
                        self.sklearn_models[base_name + "_{}".format(_i)] = clone(_cls[1])


        else:
            self.cv = False


    def _lgb_train(self, _cls, train_x, train_y, valid_x=None, valid_y=None):
        _train = lgb.Dataset(train_x, label=train_y)
        _valid = None
        if valid_x is not None:
            _valid = lgb.Dataset(valid_x, label=valid_y, reference=_train)
        if _valid is None:
            bst = lgb.train(_cls[1],
                            _train)
        else:
            bst = lgb.train(_cls[1],
                            _train,
                            valid_sets=_valid)
        assert _cls[0] not in self.spetial_base_clses.keys()
        self.spetial_base_clses[_cls[0]] = bst
        return self._lgb_predict(_cls[0], train_x, self.use_prob)

    def _lgb_predict(self, _cls_name, x, use_proba=False):
        bst = self.spetial_base_clses[_cls_name]
        if use_proba:
            p = bst.predict(x, bst.best_iteration, raw_score = True)
        else:
            p = np.argmax(bst.predict(x, bst.best_iteration), axis=1).astype(int).reshape((-1, 1))
        return p

    def _sklearn_predict(self, model, x, use_proba=False):
        if use_proba:
            p = model.predict_proba(x)
        else:
            p = model.predict(x).reshape((-1, 1))

        if use_proba and self.num_class is not None:#针对没有出现的类别导致的feature不一样
            sample_size = p.shape[0]
            clss = model.classes_
            p = dict(zip(clss, p.transpose()))
            for _i in range(self.num_class):
                if _i not in p.keys():
                    p[_i] = np.zeros((sample_size,))
            p = np.vstack(p.values()).transpose()
        return p

    def fit(self, train_x, train_y):
        self.split = check_cv(self.kfold, train_y, self.stratify)
        self.split.shuffle = self.Kfold_shuffle
        p_rs = []
        v_rs = []
        if not self.cv:
            for _cls in self.base_clses:
                if _cls[0].startswith("lgb"):
                    p = self._lgb_train(_cls, train_x, train_y)
                    p_rs.append(p)
                else:
                    _cls[1].fit(train_x, train_y)
                    p = self._sklearn_predict(_cls[1], train_x, self.use_prob)
                    p_rs.append(p)

        else:
            for _cls in self.base_clses:
                p_part = []
                y_index = []
                v_part = []
                if _cls[0].startswith("lgb"):  # lgb or xgb
                    for _i, (_train_index, _test_index) in enumerate(self.split.split(train_x, train_y)):
                        _train_x = train_x[_train_index]
                        _train_y = train_y[_train_index]
                        _test_x = train_x[_test_index]
                        _test_y = train_y[_test_index]
                        lgb_name = _cls[0] + "_" + str(_i)
                        self._lgb_train(self.spetial_base_clses_params[lgb_name],
                                        _train_x,
                                        _train_y,
                                        _test_x,
                                        _test_y)
                        p_part.append(self._lgb_predict(lgb_name, _test_x, self.use_prob))
                        y_index.append(_test_index.reshape((-1, 1)))

                else:  # sklearn
                    for _i, (_train_index, _test_index) in enumerate(self.split.split(train_x, train_y)):
                        _train_x = train_x[_train_index]
                        _train_y = train_y[_train_index]
                        _test_x = train_x[_test_index]
                        name = _cls[0]
                        model = self.sklearn_models[name + "_" + str(_i)]
                        model.fit(_train_x, _train_y)
                        p = self._sklearn_predict(model, _test_x, self.use_prob)
                        p_part.append(p)
                        y_index.append(_test_index.reshape((-1, 1)))
                p_part = np.vstack(p_part)
                y_index = np.vstack(y_index)
                p = dict(zip(list(y_index.reshape((-1,))), list(p_part)))
                p_rs.append(np.vstack(p.values()))

        new_train = np.hstack(p_rs)

        if self.use_feature_in_secondary:
            new_train = np.concatenate([new_train, train_x],axis=1)


        assert new_train.shape[0] == train_y.shape[0]

        # 次学习其
        # if self.meta_cls[0].startswith("lgb"):
        #     self._lgb_train(self.meta_cls, new_train, train_y)
        # else:
        self.meta_cls[1].fit(new_train, train_y)

    def predict(self, test_x):
        p_rs = []
        if not self.cv:
            for _cls in self.base_clses:
                if _cls[0].startswith("lgb"):
                    p = self._lgb_predict(_cls[0], test_x, self.use_prob)
                    p_rs.append(p)
                else:
                    p = self._sklearn_predict(_cls[1], test_x, self.use_prob)
                    p_rs.append(p)

        else:
            for _cls in self.base_clses:
                p_part = []
                if _cls[0].startswith("lgb"):  # lgb or xgb
                    for _i in range(self.kfold):
                        lgb_name = _cls[0] + "_" + str(_i)
                        p_part.append(self._lgb_predict(lgb_name, test_x, self.use_prob))

                else:  # sklearn
                    for _i in range(self.kfold):
                        name = _cls[0]
                        model = self.sklearn_models[name + "_" + str(_i)]
                        p = self._sklearn_predict(model, test_x, self.use_prob)
                        p_part.append(p)
                p_rs.append(np.mean(np.stack(p_part, axis=2), axis=2))

        new_train = np.hstack(p_rs)
        if self.use_feature_in_secondary:
            new_train = np.concatenate([new_train, test_x],axis=1)
        # 次学习器
        if self.meta_cls[0].startswith("lgb"):
            p = self._lgb_predict(self.meta_cls[0], new_train)
        else:
            p = self._sklearn_predict(self.meta_cls[1], new_train)
        return p.reshape((-1,))

In [9]:
#stack
import lightgbm as lgb
num_class = np.unique(y).shape[0]
params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'metric': ['multi_error'],
            'num_leaves': 31,
            'learning_rate': 0.02,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': 0,
            'num_class': num_class,
            'num_iterations':400,
            'early_stopping_round':15
        }
from sklearn.base import clone
from sklearn.model_selection import check_cv
cv = 3
_x,_y = expansion(_train_x3,train_y,cv)
stack = Stacking([("rf1",RandomForestClassifier(n_jobs=-1,
                                                     n_estimators=388,
                                                     class_weight="balanced")),
                  ("lgb",params),
                  ("ovr",OneVsRestClassifier(estimator=RandomForestClassifier(n_estimators=188,n_jobs=-1,class_weight="balanced")))
                              ],
                             ("rf3",RandomForestClassifier(n_estimators=888,n_jobs=-1,class_weight="balanced")),
                             use_prob=True,
                             use_features_in_secondary = True,
                             kfold = cv,
                num_class =num_class) 
stack.fit(_x,_y)
p = stack.predict(_valid_x3)
acc(p, valid_y)
#

[1]	valid_0's multi_error: 0.369015
Training until validation scores don't improve for 15 rounds.
[2]	valid_0's multi_error: 0.34598
[3]	valid_0's multi_error: 0.331828
[4]	valid_0's multi_error: 0.325053
[5]	valid_0's multi_error: 0.321138
[6]	valid_0's multi_error: 0.317977
[7]	valid_0's multi_error: 0.317675
[8]	valid_0's multi_error: 0.314965
[9]	valid_0's multi_error: 0.315266
[10]	valid_0's multi_error: 0.312858
[11]	valid_0's multi_error: 0.310599
[12]	valid_0's multi_error: 0.309244
[13]	valid_0's multi_error: 0.309395
[14]	valid_0's multi_error: 0.308491
[15]	valid_0's multi_error: 0.306534
[16]	valid_0's multi_error: 0.30804
[17]	valid_0's multi_error: 0.307136
[18]	valid_0's multi_error: 0.30533
[19]	valid_0's multi_error: 0.30533
[20]	valid_0's multi_error: 0.305781
[21]	valid_0's multi_error: 0.304878
[22]	valid_0's multi_error: 0.304276
[23]	valid_0's multi_error: 0.304426
[24]	valid_0's multi_error: 0.303674
[25]	valid_0's multi_error: 0.304426
[26]	valid_0's multi_error

[47]	valid_0's multi_error: 0.293059
[48]	valid_0's multi_error: 0.292152
[49]	valid_0's multi_error: 0.292454
[50]	valid_0's multi_error: 0.293059
[51]	valid_0's multi_error: 0.292757
[52]	valid_0's multi_error: 0.292605
[53]	valid_0's multi_error: 0.292605
[54]	valid_0's multi_error: 0.292757
[55]	valid_0's multi_error: 0.293059
[56]	valid_0's multi_error: 0.292908
[57]	valid_0's multi_error: 0.292605
[58]	valid_0's multi_error: 0.291698
[59]	valid_0's multi_error: 0.291093
[60]	valid_0's multi_error: 0.291396
[61]	valid_0's multi_error: 0.292303
[62]	valid_0's multi_error: 0.292152
[63]	valid_0's multi_error: 0.292303
[64]	valid_0's multi_error: 0.292152
[65]	valid_0's multi_error: 0.292454
[66]	valid_0's multi_error: 0.291698
[67]	valid_0's multi_error: 0.291396
[68]	valid_0's multi_error: 0.290186
[69]	valid_0's multi_error: 0.291093
[70]	valid_0's multi_error: 0.289884
[71]	valid_0's multi_error: 0.290337
[72]	valid_0's multi_error: 0.288523
[73]	valid_0's multi_error: 0.288825
[

[75]	valid_0's multi_error: 0.287494
[76]	valid_0's multi_error: 0.286886
[77]	valid_0's multi_error: 0.286583
[78]	valid_0's multi_error: 0.285367
[79]	valid_0's multi_error: 0.286279
[80]	valid_0's multi_error: 0.285671
[81]	valid_0's multi_error: 0.284759
[82]	valid_0's multi_error: 0.284911
[83]	valid_0's multi_error: 0.284607
[84]	valid_0's multi_error: 0.283544
[85]	valid_0's multi_error: 0.283847
[86]	valid_0's multi_error: 0.285215
[87]	valid_0's multi_error: 0.284607
[88]	valid_0's multi_error: 0.284607
[89]	valid_0's multi_error: 0.284759
[90]	valid_0's multi_error: 0.285519
[91]	valid_0's multi_error: 0.284455
[92]	valid_0's multi_error: 0.283847
[93]	valid_0's multi_error: 0.284151
[94]	valid_0's multi_error: 0.283544
[95]	valid_0's multi_error: 0.283695
[96]	valid_0's multi_error: 0.28324
[97]	valid_0's multi_error: 0.283544
[98]	valid_0's multi_error: 0.282936
[99]	valid_0's multi_error: 0.282632
[100]	valid_0's multi_error: 0.28248
[101]	valid_0's multi_error: 0.282024
[

0.7157298207995099

In [105]:
valid_index = np.arange(_valid_x3.shape[0])
train_x1 = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats],axis=1)
valid_x1= np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats],axis=1)
train_x0 = np.concatenate([train_wifi_all_x[:,indexs]],axis=1)
valid_x0= np.concatenate([valid_wifi_all_x[:,indexs]],axis=1)
train_x2 = np.concatenate([train_wifi_all_x[:,indexs],train_wh],axis=1)
valid_x2= np.concatenate([valid_wifi_all_x[:,indexs],valid_wh],axis=1)
train_x3 = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh],axis=1)
valid_x3= np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh],axis=1)

In [106]:
other_train_wifi_features=[]
other_test_wifi_features= []
sorted_wifi_all = get_sorted_wifi_just_train(train, valid)
for _index in range(len(sorted_wifi_all), 0, -1):
    if sorted_wifi_all[_index - 1][1] >= 10:
        break
sorted_wifi = sorted_wifi_all[:_index]
d = rank_sorted_wifi(sorted_wifi)

# use
test_use_wifi_in_wifi_rank, train_use_wifi_in_wifi_rank = use_wifi_in_wifi_rank2(valid, train, d)
other_train_wifi_features.append(train_use_wifi_in_wifi_rank.values.reshape((-1, 1)))
other_test_wifi_features.append(test_use_wifi_in_wifi_rank.values.reshape((-1, 1)))
other_train_wifi_feature = np.concatenate(other_train_wifi_features, axis=1)
other_test_wifi_feature = np.concatenate(other_test_wifi_features, axis=1)

train_x4 = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh,other_train_wifi_feature],axis=1)
valid_x4= np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh, other_test_wifi_feature],axis=1)

In [206]:
sorted_wifis = {}
for i in range(len(le.classes_)):
    t_ = train[ (train.shop_id == le.classes_[i])]
    s_ = t_.shape[0]
    sorted_wifi = get_sorted_wifi([t_])
    sp = 0.5
    for _index in range(len(sorted_wifi), 0, -1):
        if sorted_wifi[_index - 1][1] / float(s_) >= sp:
            break
    sorted_wifi = sorted_wifi[:_index]
    d = rank_sorted_wifi(sorted_wifi)
    sorted_wifis[i] = d.keys()
def intersect_size(x,sorted_wifis):
    l=[]
    m = []
    for _x in x[1]:
        m.append(_x[0])
    for _x in x[2]:
        m.append(_x[0])
    for _i in range(len(sorted_wifis)):
        s = np.intersect1d(sorted_wifis[_i],m).shape[0]
        l.append(s)
    return np.array(l)
train_f = np.vstack(train.basic_wifi_info.map(lambda x: intersect_size(x,sorted_wifis)).values)
valid_f = np.vstack(valid.basic_wifi_info.map(lambda x: intersect_size(x,sorted_wifis)).values)

# pca 降维
from sklearn.decomposition import PCA
pca = PCA(20).fit(train_f)
train_f = pca.transform(train_f)
valid_f = pca.transform(valid_f)

train_x5 = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh,train_f],axis=1)
valid_x5= np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh, valid_f],axis=1)

In [246]:
sorted_wifis = {}
for i in range(len(le.classes_)):
    t_ = train[ (train.shop_id == le.classes_[i])]
    s_ = t_.shape[0]
    sorted_wifi = get_sorted_wifi([t_])
    sp = 0.9
    for _index in range(len(sorted_wifi), 0, -1):
        if sorted_wifi[_index - 1][1] / float(s_) >= sp:
            break
    sorted_wifi = sorted_wifi[:_index]
    d = rank_sorted_wifi(sorted_wifi)
    sorted_wifis[i] = d.keys()
def intersect_size(x,sorted_wifis):
    l=[]
    m = []
    for _x in x[1]:
        m.append(_x[0])
    for _x in x[2]:
        m.append(_x[0])
    for _i in range(len(sorted_wifis)):
        s = np.intersect1d(sorted_wifis[_i],m).shape[0]
        l.append(s)
    return np.array(l)
train_f = np.vstack(train.basic_wifi_info.map(lambda x: intersect_size(x,sorted_wifis)).values)
valid_f = np.vstack(valid.basic_wifi_info.map(lambda x: intersect_size(x,sorted_wifis)).values)
train_x5 = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh,train_f],axis=1)
valid_x5= np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh, valid_f],axis=1)

In [281]:
def get_median(d,t):
    for _k in d:
        d[_k] = []
    def get_one_sample(x,d):
        for _x in x[1]:
            if _x[0] in d:
                d[_x[0]].append(_x[1])
        for _x in x[2]:
            if _x[0] in d:
                d[_x[0]].append(_x[1])   
    
    t.basic_wifi_info.map(lambda x: get_one_sample(x,d))
    
    for _k in d:
        d[_k] = np.median(d[_k])
    return d
sorted_wifis = {}
for i in range(len(le.classes_)):
    t_ = train[ (train.shop_id == le.classes_[i])]
    s_ = t_.shape[0]
    sorted_wifi = get_sorted_wifi([t_])
    sp = 0.3
    for _index in range(len(sorted_wifi), 0, -1):
        if sorted_wifi[_index - 1][1] / float(s_) >= sp:
            break
    sorted_wifi = sorted_wifi[:_index]
    d = rank_sorted_wifi(sorted_wifi)
    d = get_median(d, t_)
    sorted_wifis[i] = d

def dis(x1,x2):
    return np.sqrt((x1-x2)*(x1-x2))
def dis_to_wifi(x,sorted_wifis):
    l=[]
    for _i in range(len(sorted_wifis)):
        s = 0
        c = 0
        wifis = sorted_wifis[_i]
        for _x in x[1]:
            if _x[0] in wifis:
                c += 1
                s+= dis(_x[1],wifis[_x[0]])
        for _x in x[2]:
            if _x[0] in wifis:
                c += 1
                s+= dis(_x[1],wifis[_x[0]])
        if c!=0:
            l.append(float(s) / float(c))
        else:
            l.append(10000)
    return np.array(l)
train_f = np.vstack(train.basic_wifi_info.map(lambda x: dis_to_wifi(x,sorted_wifis)).values)
valid_f = np.vstack(valid.basic_wifi_info.map(lambda x: dis_to_wifi(x,sorted_wifis)).values)
train_x6 = np.concatenate([train_wifi_all_x[:,indexs],train_lonlats,train_wh,train_f],axis=1)
valid_x6= np.concatenate([valid_wifi_all_x[:,indexs],valid_lonlats,valid_wh, valid_f],axis=1)

In [320]:
cv = CVEstimator(RandomForestClassifier(n_jobs=-1,n_estimators=288,class_weight="balanced"))
cv.fit(train_x3,train_y)


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.00694444,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.01041667, ...,  0.        ,
         0.        ,  0.        ]])

In [324]:
indexs = choose_strong_wifi_index(-90,6,train_wifi_all_x)
train_x7 = np.concatenate([train_wifi_all_x[:,indexs],cv.predict_proba(train_x3)],axis=1)
valid_x7= np.concatenate([valid_wifi_all_x[:,indexs],cv.predict_proba(valid_x3)],axis=1)

In [331]:

_train_b_y.shape[0]
i = 3
c=7

if c == 3:
    tx = train_x3
    vx = valid_x3
elif c == 2:
    tx = train_x2
    vx = valid_x2
elif c == 4:
    tx = train_x4
    vx = valid_x4
elif c == 5:
    tx = train_x5
    vx = valid_x5
elif c == 6:
    tx = train_x6
    vx = valid_x6
elif c == 7:
    tx = train_x7
    vx = valid_x7

def get_model():
    rf2 = RandomForestClassifier(n_estimators=188,n_jobs=-1,class_weight="balanced",min_samples_leaf=1)
    return rf2
    
prf = get_model()
prf.fit(tx,_train_b_y[:,i])
p = prf.predict(vx)
proba = prf.predict_proba(vx)
print "origin",acc(p,_valid_b_y[:,i])
print "error shape,", (p != _valid_b_y[:,i]).sum()
print _valid_b_y[:,i][(p != _valid_b_y[:,i])]
print valid_index[(p != _valid_b_y[:,i])]
print le.classes_[i]

origin 0.999387348752
error shape, 4
[1 1 0 0]
[1587 1588 3174 3819]
s_136016


In [71]:

print train[train.shop_id=="s_193954"]["wifi_infos"]
print 
print valid[valid.shop_id=="s_193954"]["wifi_infos"]

689680    b_55640889|-58|false;b_52574361|-73|false;b_62470|-73|false;b_55570664|-57|false;b_55640412|-48|false;b_26730374|-67|false;b_62516|-79|false;b_62704|-48|false;b_62705|-66|false;b_42231552|-72|false
Name: wifi_infos, dtype: object

803695    b_55570664|-60|false;b_32358751|-61|false;b_52574362|-66|false;b_62516|-78|false;b_55640889|-59|false;b_62550|-80|false;b_52574361|-81|false;b_62705|-71|false;b_55640412|-48|false;b_20616910|-77|false
Name: wifi_infos, dtype: object
