In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import log_loss
from sklearn import pipeline, grid_search

In [2]:
app_ev = pd.read_csv(".\\raw_data\\app_events.csv", dtype={'device_id': np.str})
app_ev.head()

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,1
1,2,-5720078949152207372,1,0
2,2,-1633887856876571208,1,0
3,2,-653184325010919369,1,1
4,2,8693964245073640147,1,1


In [3]:
label_cat = pd.read_csv('.\\raw_data\\label_categories.csv')
print(label_cat.shape)
label_cat.head()

(930, 2)


Unnamed: 0,label_id,category
0,1,
1,2,game-game type
2,3,game-Game themes
3,4,game-Art Style
4,5,game-Leisure time


In [4]:
#for cat in label_cat.category.unique():
#    print(cat)

In [5]:
#each app can have severla labels
#app_labels[app_labels.app_id == -5720078949152207372]['label_id']

In [6]:
app_labels = pd.read_csv('.\\raw_data\\app_labels.csv')
print(app_labels.shape)
app_labels.head()

(459943, 2)


Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [7]:
app_labels = pd.merge(app_labels, label_cat, how='inner',on='label_id')
print(app_labels.shape)
app_labels.head()

(459943, 3)


Unnamed: 0,app_id,label_id,category
0,7324884708820027918,251,Finance
1,-4494216993218550286,251,Finance
2,8756705988821000489,251,Finance
3,1061207043315821111,251,Finance
4,-1491198667294647703,251,Finance


In [8]:
app_ev = app_ev.groupby("event_id")["app_id"].apply(lambda x: " ".join(set("app_id:" + str(s) for s in x)))

In [9]:
events = pd.read_csv(".\\raw_data\\events.csv", dtype={'device_id': np.str})

In [10]:
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [11]:
events["app_id"] = events["event_id"].map(app_ev)

In [12]:
print(events.shape)
events.isnull().sum()

(3252950, 6)


event_id           0
device_id          0
timestamp          0
longitude          0
latitude           0
app_id       1764854
dtype: int64

In [13]:
events = events.dropna()
print(events.shape)
print(events.isnull().sum())
del app_ev

(1488096, 6)
event_id     0
device_id    0
timestamp    0
longitude    0
latitude     0
app_id       0
dtype: int64


In [14]:
print(events.shape)
events.head()

(1488096, 6)


Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_id
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,app_id:-8022267440849930066 app_id:-3725672010...
5,6,1476664663289716375,2016-05-01 00:27:21,0.0,0.0,app_id:-4361453417043092299 app_id:59890386732...
6,7,5990807147117726237,2016-05-01 00:15:13,113.73,23.0,app_id:-3923363630716707823 app_id:-5372083417...
8,9,-2073340001552902943,2016-05-01 00:15:33,0.0,0.0,app_id:5099453940784075687 app_id:161151331274...
15,16,9070651185984875886,2016-05-01 00:06:06,0.0,0.0,app_id:4775896950989639373 app_id:628020936226...


In [15]:
#events.event_id.value_counts()

In [14]:
#events = events.iloc[:100]

In [16]:
'''
import itertools
import operator

def most_common(L):
    # get an iterable of (item, iterable) pairs
    SL = sorted((x, i) for i, x in enumerate(L))
    #print ('SL:', SL)
    groups = itertools.groupby(SL, key=operator.itemgetter(0))
    # auxiliary function to get "quality" for an item
    def _auxfun(g):
        item, iterable = g
        count = 0
        min_index = len(L)
        for _, where in iterable:
            count += 1
            min_index = min(min_index, where)
        #print ('item %r, count %r, minind %r' % (item, count, min_index))
        return count, -min_index
    # pick the highest-count/earliest item
    #print(list(groups))
    return max(groups, key=_auxfun)[0]
'''

'\nimport itertools\nimport operator\n\ndef most_common(L):\n    # get an iterable of (item, iterable) pairs\n    SL = sorted((x, i) for i, x in enumerate(L))\n    #print (\'SL:\', SL)\n    groups = itertools.groupby(SL, key=operator.itemgetter(0))\n    # auxiliary function to get "quality" for an item\n    def _auxfun(g):\n        item, iterable = g\n        count = 0\n        min_index = len(L)\n        for _, where in iterable:\n            count += 1\n            min_index = min(min_index, where)\n        #print (\'item %r, count %r, minind %r\' % (item, count, min_index))\n        return count, -min_index\n    # pick the highest-count/earliest item\n    #print(list(groups))\n    return max(groups, key=_auxfun)[0]\n'

In [None]:
'''
def most_used_type_app1(x):
    all_cat=[]
    apps = x.split(" ")
    #print(apps)
    for app in apps:
        app_int = int(app[7:])
        app_category =app_labels.loc[app_labels['app_id'] == app_int,'category']
        for a in app_category:
            all_cat.append(a)
    
    mc = most_common(all_cat)
    return mc

def most_used_type_app2(x):
    all_cat=[]
    apps = x.split(" ")
    #print(apps)
    for app in apps:
        app_int = int(app[7:])
        app_category =app_labels.loc[app_labels['app_id'] == app_int,'category']
        for a in app_category:
            all_cat.append(a)
    
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    return mc

def most_used_type_app3(x):
    all_cat=[]
    apps = x.split(" ")
    #print(apps)
    for app in apps:
        app_int = int(app[7:])
        app_category =app_labels.loc[app_labels['app_id'] == app_int,'category']
        for a in app_category:
            all_cat.append(a)
    
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    return mc

def most_used_type_app4(x):
    all_cat=[]
    apps = x.split(" ")
    #print(apps)
    for app in apps:
        app_int = int(app[7:])
        app_category =app_labels.loc[app_labels['app_id'] == app_int,'category']
        for a in app_category:
            all_cat.append(a)
    
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    return mc

def most_used_type_app5(x):
    all_cat=[]
    apps = x.split(" ")
    #print(apps)
    for app in apps:
        app_int = int(app[7:])
        app_category =app_labels.loc[app_labels['app_id'] == app_int,'category']
        for a in app_category:
            all_cat.append(a)
    
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    all_cat = [x for x in all_cat if x != mc]
    mc = most_common(all_cat)
    return mc

print('1st...')
events['1st_used_app'] = events['app_id'].map(most_used_type_app1)
events.to_csv('.\\transformed_data\\events.csv',index=False)
print('2nd...')
events['2nd_used_app'] = events['app_id'].map(most_used_type_app2)
events.to_csv('.\\transformed_data\\events.csv',index=False)
print('3rd...')
events['3rd_used_app'] = events['app_id'].map(most_used_type_app3)
events.to_csv('.\\transformed_data\\events.csv',index=False)
print('4th...')
events['4th_used_app'] = events['app_id'].map(most_used_type_app4)
events.to_csv('.\\transformed_data\\events.csv',index=False)
print('5th...')
events['5th_used_app'] = events['app_id'].map(most_used_type_app5)
events.to_csv('.\\transformed_data\\events.csv',index=False)
'''

1st...


In [None]:
events

In [17]:
events['timestamp'] = pd.to_datetime(events['timestamp'])
events['hour'] = events['timestamp'].map(lambda x : x.hour)
events['min'] = events['timestamp'].map(lambda x : x.minute)

In [18]:
len(events['device_id'].unique())

60822

In [19]:
gd = events.groupby('device_id')
longitude = gd['longitude'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
longitude = longitude.rename(columns={'amax':'long_max','amin':'long_min','mean':'long_mean','median':'long_median','std':'long_std'})
longitude['long_std'].fillna(0,inplace=True)

In [20]:
latitude = gd['longitude'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
latitude = latitude.rename(columns={'amax':'lat_max','amin':'lat_min','mean':'lat_mean','median':'lat_median','std':'lat_std'})
latitude['lat_std'].fillna(0,inplace=True)

In [21]:
hour = gd['hour'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
hour = hour.rename(columns={'amax':'hour_max','amin':'hour_min','mean':'hour_mean','median':'hour_median','std':'hour_std'})
hour['hour_std'].fillna(0,inplace=True)

In [22]:
minute = gd['min'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
minute = minute.rename(columns={'amax':'min_max','amin':'min_min','mean':'min_mean','median':'min_median','std':'min_std'})
minute['min_std'].fillna(0,inplace=True)

In [23]:
events = events[["device_id", "app_id"]]
# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

In [24]:
print("# Read Phone Brand")
pbd = pd.read_csv(".\\raw_data\phone_brand_device_model.csv",dtype={'device_id': np.str})
pbd.drop_duplicates('device_id', keep='first', inplace=True)

# Read Phone Brand


In [25]:
train = pd.read_csv(".\\raw_data\gender_age_train.csv",dtype={'device_id': np.str})
train.drop(["age", "gender"], axis=1, inplace=True)

test = pd.read_csv(".\\raw_data\gender_age_test.csv",dtype={'device_id': np.str})
test["group"] = np.nan

In [26]:
events.head()

Unnamed: 0,device_id,app_id
0,-100015673884079572,app_id:3123327274723331806 app_id:415869968065...
1,-1000458529741848912,app_id:-7687224006159751793 app_id:57369864780...
2,-1000667340060427374,app_id:-145658454112781034 app_id:683821366869...
3,-100098646088222553,app_id:6423939170234284215 app_id:179467654644...
4,-100101996136889832,app_id:-3864848260969172656 app_id:-4532036554...


In [27]:
longitude.head()

Unnamed: 0,device_id,long_max,long_min,long_mean,long_std,long_median
0,-100015673884079572,0.0,0.0,0.0,0.0,0.0
1,-1000458529741848912,0.0,0.0,0.0,0.0,0.0
2,-1000667340060427374,116.69,0.0,19.851809,44.066648,0.0
3,-100098646088222553,103.52,103.52,103.52,0.0,103.52
4,-100101996136889832,114.4,114.4,114.4,0.0,114.4


In [28]:
print(latitude.shape)
latitude.head()

(60822, 6)


Unnamed: 0,device_id,lat_max,lat_min,lat_mean,lat_std,lat_median
0,-100015673884079572,0.0,0.0,0.0,0.0,0.0
1,-1000458529741848912,0.0,0.0,0.0,0.0,0.0
2,-1000667340060427374,116.69,0.0,19.851809,44.066648,0.0
3,-100098646088222553,103.52,103.52,103.52,0.0,103.52
4,-100101996136889832,114.4,114.4,114.4,0.0,114.4


In [29]:
print(hour.shape)
hour.head()

(60822, 6)


Unnamed: 0,device_id,hour_max,hour_min,hour_mean,hour_std,hour_median
0,-100015673884079572,23,0,11.38,7.233398,10.0
1,-1000458529741848912,2,2,2.0,0.0,2.0
2,-1000667340060427374,23,0,14.297872,5.397789,13.5
3,-100098646088222553,9,9,9.0,0.0,9.0
4,-100101996136889832,22,22,22.0,0.0,22.0


In [30]:
minute.head()

Unnamed: 0,device_id,min_max,min_min,min_mean,min_std,min_median
0,-100015673884079572,57,0,27.34,17.214861,30.0
1,-1000458529741848912,5,5,5.0,0.0,5.0
2,-1000667340060427374,59,0,18.191489,19.30473,6.5
3,-100098646088222553,28,28,28.0,0.0,28.0
4,-100101996136889832,50,50,50.0,0.0,50.0


In [31]:
print(train.shape)
print(test.shape)
split_len = len(train)
Df = pd.concat((train, test), axis=0, ignore_index=True)

(74645, 2)
(112071, 2)


In [32]:
m = pd.merge(events, hour, how='inner',on='device_id')
m1 = pd.merge(m, minute, how='inner',on='device_id')
m2 = pd.merge(m1, latitude, how='inner',on='device_id')
device_info = pd.merge(m2, longitude, how='inner',on='device_id')
del m
del m1
del m2

In [33]:
print(device_info.shape)
device_info.columns

(60822, 22)


Index(['device_id', 'app_id', 'hour_max', 'hour_min', 'hour_mean', 'hour_std',
       'hour_median', 'min_max', 'min_min', 'min_mean', 'min_std',
       'min_median', 'lat_max', 'lat_min', 'lat_mean', 'lat_std', 'lat_median',
       'long_max', 'long_min', 'long_mean', 'long_std', 'long_median'],
      dtype='object')

In [None]:
device_info["hour_max"] = device_info["hour_max"].apply(lambda x: "hour_max:" + str(x))
device_info["hour_min"] = device_info["hour_min"].apply(lambda x: "hour_min:" + str(x))
device_info["hour_mean"] = device_info["hour_mean"].apply(lambda x: "hour_mean:" + str(x))
device_info["hour_std"] = device_info["hour_std"].apply(lambda x: "hour_std:" + str(x))
device_info["hour_median"] = device_info["hour_median"].apply(lambda x: "hour_median:" + str(x))
device_info["min_max"] = device_info["min_max"].apply(lambda x: "min_max:" + str(x))
device_info["min_min"] = device_info["min_min"].apply(lambda x: "min_min:" + str(x))
device_info["min_std"] = device_info["min_std"].apply(lambda x: "min_std:" + str(x))
device_info["min_median"] = device_info["min_median"].apply(lambda x: "min_median:" + str(x))
device_info["min_mean"] = device_info["min_mean"].apply(lambda x: "min_mean:" + str(x))
device_info["lat_max"] = device_info["lat_max"].apply(lambda x: "min_lat_maxmax:" + str(x))
device_info["lat_min"] = device_info["lat_min"].apply(lambda x: "lat_min:" + str(x))
device_info["lat_std"] = device_info["lat_std"].apply(lambda x: "lat_std:" + str(x))
device_info["lat_median"] = device_info["lat_median"].apply(lambda x: "lat_median:" + str(x))
device_info["lat_mean"] = device_info["lat_mean"].apply(lambda x: "lat_mean:" + str(x))
device_info["long_max"] = device_info["long_max"].apply(lambda x: "long_max:" + str(x))
device_info["long_min"] = device_info["long_min"].apply(lambda x: "long_min:" + str(x))
device_info["long_mean"] = device_info["long_mean"].apply(lambda x: "long_mean:" + str(x))
device_info["long_std"] = device_info["long_std"].apply(lambda x: "long_std:" + str(x))
device_info["long_median"] = device_info["long_median"].apply(lambda x: "long_median:" + str(x))

In [None]:
# Group Labels
Y_train = train["group"]
lable_group = LabelEncoder()
Y_train = lable_group.fit_transform(Y_train)
device_id = test["device_id"]

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(lambda x: "device_model:" + str(x))

In [None]:
###################
#  Concat Feature
###################

apps = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))for _, row in device_info.iterrows()]).reset_index()
apps.columns = ['app_id', 'device_id']


f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = apps[["device_id", "app_id"]]    # app_id
f4 = device_info[["device_id","hour_max"]]
f5 = device_info[["device_id","hour_min"]]
f6 = device_info[["device_id","hour_mean"]]
f7 = device_info[["device_id","hour_std"]]
f8 = device_info[["device_id","hour_median"]]
f9 = device_info[["device_id","min_max"]]
f10 = device_info[["device_id","min_min"]]
f11 = device_info[["device_id","min_mean"]]
f12 = device_info[["device_id","min_median"]]
f13 = device_info[["device_id","lat_max"]]
f14 = device_info[["device_id","lat_min"]]
f15 = device_info[["device_id","lat_mean"]]
f16 = device_info[["device_id","lat_std"]]
f17 = device_info[["device_id","lat_median"]]
f18 = device_info[["device_id","long_max"]]
f19 = device_info[["device_id","long_min"]]
f20 = device_info[["device_id","long_mean"]]
f21 = device_info[["device_id","long_std"]]
f22 = device_info[["device_id","long_median"]]

del apps
del Df
del device_info

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"
f4.columns.values[1] = "feature"
f5.columns.values[1] = "feature"
f6.columns.values[1] = "feature"
f7.columns.values[1] = "feature"
f8.columns.values[1] = "feature"
f9.columns.values[1] = "feature"
f10.columns.values[1] = "feature"
f11.columns.values[1] = "feature"
f12.columns.values[1] = "feature"
f13.columns.values[1] = "feature"
f14.columns.values[1] = "feature"
f15.columns.values[1] = "feature"
f16.columns.values[1] = "feature"
f17.columns.values[1] = "feature"
f18.columns.values[1] = "feature"
f19.columns.values[1] = "feature"
f20.columns.values[1] = "feature"
f21.columns.values[1] = "feature"
f22.columns.values[1] = "feature"

FLS = pd.concat((f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19,
                 f20, f21, f22), axis=0, ignore_index=True)

In [None]:
#FLS[FLS.device_id == '-8260683887967679142']

In [None]:
###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
print(len(row))
print(len(col))
print(len(device_ids))
print(len(feature_cs))
print(data.shape)
sparse_matrix = sparse.csr_matrix((data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

In [None]:
sparse_matrix

In [None]:
train_row = dec.transform(train["device_id"])
X_train = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
X_test = sparse_matrix[test_row, :]

In [None]:
##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=3)

selector.fit(X_train, Y_train)

X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

print("# Num of Features: ", X_train.shape[1])

In [None]:
#('f_classif','chi2','SelectKBest'):
for i in (7,10,20):
    train_row = dec.transform(train["device_id"])
    X_train = sparse_matrix[train_row, :]

    test_row = dec.transform(test["device_id"])
    X_test = sparse_matrix[test_row, :]
        
    selector = SelectPercentile(chi2, percentile=i)
    selector.fit(X_train, Y_train)

    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    lg = LogisticRegression(random_state=23, fit_intercept=True, class_weight= None)
    param_grid = {'C': [0.005,0.1],'multi_class':['ovr', 'multinomial']}
    lg_gs = grid_search.GridSearchCV(estimator = lg, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
    lg_gs.fit(X_train, Y_train)
    print("Feature sel %",i)
    print("# Num of Features: ", X_train.shape[1])
    print("Best parameters found by grid search:", lg_gs.best_params_)
    print("Best CV score:", lg_gs.best_score_)
    print("Best CV score:", lg_gs.grid_scores_)
#23%
#Best parameters found by grid search: {'class_weight': None, 'C': 0.1}
#Best CV score: -2.30189906196
#10%
#Best parameters found by grid search: {'class_weight': None, 'C': 0.1}
#mean: -2.29573, std: 0.03596 LB: 2.28530
#3%
#mean: -2.31534, std: 0.03358

In [None]:
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
for i in (7,30):
    train_row = dec.transform(train["device_id"])
    X_train = sparse_matrix[train_row, :]

    test_row = dec.transform(test["device_id"])
    X_test = sparse_matrix[test_row, :]
        
    selector = SelectPercentile(f_classif, percentile=i)
    selector.fit(X_train, Y_train)

    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    lg = LogisticRegression(random_state=23, class_weight= None)
    param_grid = {'C': [0.1],'fit_intercept':[True,False]}
    lg_gs = grid_search.GridSearchCV(estimator = lg, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
    lg_gs.fit(X_train, Y_train)
    print("Feature sel %",i)
    print("# Num of Features: ", X_train.shape[1])
    print("Best parameters found by grid search:", lg_gs.best_params_)
    print("Best CV score:", lg_gs.best_score_)
    print("Best CV score:", lg_gs.grid_scores_)

In [None]:
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest, SelectFwe, SelectFpr
for i in (7,10,20):
    train_row = dec.transform(train["device_id"])
    X_train = sparse_matrix[train_row, :]

    test_row = dec.transform(test["device_id"])
    X_test = sparse_matrix[test_row, :]
        
    selector = SelectPercentile(SelectFpr, percentile=i)
    selector.fit(X_train, Y_train)

    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    lg = LogisticRegression(random_state=23, fit_intercept=True, class_weight= None)
    param_grid = {'C': [0.005,0.1]}
    lg_gs = grid_search.GridSearchCV(estimator = lg, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
    lg_gs.fit(X_train, Y_train)
    print("Feature sel %",i)
    print("# Num of Features: ", X_train.shape[1])
    print("Best parameters found by grid search:", lg_gs.best_params_)
    print("Best CV score:", lg_gs.best_score_)
    print("Best CV score:", lg_gs.grid_scores_)

In [None]:
abc = AdaBoostClassifier(n_estimators=100, random_state=3)
param_grid = {'learning_rate': [0.01]}
abc_gs = grid_search.GridSearchCV(estimator = abc, param_grid = param_grid, cv = 6, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
abc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", abc_gs.best_params_)
print("Best CV score:", abc_gs.best_score_)
print("Best CV score:", abc_gs.grid_scores_)
# 23%
#Best parameters found by grid search: {'learning_rate': 0.1, n_estimators:100}
#Best CV score: -2.45524182137

In [None]:
print("\nTraining Ensamble Random Forest ....")
rfc = RandomForestClassifier(n_estimators = 300, n_jobs=-1, random_state=23)
param_grid = {'min_samples_leaf': [2,5]}
rfc_gs = grid_search.GridSearchCV(estimator = rfc, param_grid = param_grid, cv = 6, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
rfc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", rfc_gs.best_params_)
print("Best CV score:", rfc_gs.best_score_)
print("Best CV score:", rfc_gs.grid_scores_)
# 23%
# 5:-2.35487, std: 0.01740, CV=LB=2.34511
# 3% ('min_samples_leaf': 2)
# mean: -2.32991, std: 0.02355

In [None]:
rfc = RandomForestClassifier(n_estimators = 2, n_jobs=-1, random_state=23, min_samples_leaf= 5)
rfc.fit(X_train, Y_train)
features_lb = sorted(zip(map(lambda x: round(x, 4), rfc.feature_importances_), X_train.columns), reverse=True, key=lambda pair: pair[0])
features_lb

In [None]:
preds = lg_gs.predict_proba(X_test)

In [None]:
result = pd.DataFrame(preds, columns=lable_group.classes_)
result["device_id"] = device_id
#result = result.set_index("device_id")

In [None]:
result["device_id"] = test['device_id'].values
#order columns
cols = result.columns.tolist()
cols = cols[-1:] + cols[:-1]
result = result[cols]

In [None]:
result.head(10)

In [None]:
result.to_csv('.\\submissions\\sub_lr.csv',index=False)

In [None]:
import pandas as pd


In [None]:
print(label_categories.shape)
label_categories.category.value_counts()

In [None]:
label_categories.head()

In [None]:
label_categories.isnull().sum()

In [None]:
for cat in label_categories.category:
    print(cat)