In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import log_loss

In [2]:
app_ev = pd.read_csv(".\\raw_data\\app_events.csv", dtype={'device_id': np.str})

In [3]:
app_ev = app_ev.groupby("event_id")["app_id"].apply(lambda x: " ".join(set("app_id:" + str(s) for s in x)))

In [4]:
events = pd.read_csv(".\\raw_data\\events.csv", dtype={'device_id': np.str})

In [5]:
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [6]:
events["app_id"] = events["event_id"].map(app_ev)

In [7]:
print(events.shape)
events.isnull().sum()

(3252950, 6)


event_id           0
device_id          0
timestamp          0
longitude          0
latitude           0
app_id       1764854
dtype: int64

In [8]:
events = events.dropna()
print(events.shape)
events.isnull().sum()
del app_ev

(1488096, 6)


In [9]:
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_id
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,app_id:4348659952760821294 app_id:701031210314...
5,6,1476664663289716375,2016-05-01 00:27:21,0.0,0.0,app_id:-628177381309085483 app_id:434865995276...
6,7,5990807147117726237,2016-05-01 00:15:13,113.73,23.0,app_id:7608131261394141565 app_id:434865995276...
8,9,-2073340001552902943,2016-05-01 00:15:33,0.0,0.0,app_id:4348659952760821294 app_id:666657379157...
15,16,9070651185984875886,2016-05-01 00:06:06,0.0,0.0,app_id:-1442117565864503182 app_id:-1442117564...


In [10]:
events['timestamp'] = pd.to_datetime(events['timestamp'])
events['hour'] = events['timestamp'].map(lambda x : x.hour)
events['min'] = events['timestamp'].map(lambda x : x.minute)

In [11]:
len(events['device_id'].unique())

60822

In [12]:
gd = events.groupby('device_id')
longitude = gd['longitude'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
longitude = longitude.rename(columns={'amax':'long_max','amin':'long_min','mean':'long_mean','median':'long_median','std':'long_std'})
longitude['long_std'].fillna(0,inplace=True)

In [13]:
latitude = gd['longitude'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
latitude = latitude.rename(columns={'amax':'lat_max','amin':'lat_min','mean':'lat_mean','median':'lat_median','std':'lat_std'})
latitude['lat_std'].fillna(0,inplace=True)

In [14]:
hour = gd['hour'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
hour = hour.rename(columns={'amax':'hour_max','amin':'hour_min','mean':'hour_mean','median':'hour_median','std':'hour_std'})
hour['hour_std'].fillna(0,inplace=True)

In [15]:
minute = gd['min'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
minute = minute.rename(columns={'amax':'min_max','amin':'min_min','mean':'min_mean','median':'min_median','std':'min_std'})
minute['min_std'].fillna(0,inplace=True)

In [16]:
events = events[["device_id", "app_id"]]
# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

In [17]:
print("# Read Phone Brand")
pbd = pd.read_csv(".\\raw_data\phone_brand_device_model.csv",dtype={'device_id': np.str})
pbd.drop_duplicates('device_id', keep='first', inplace=True)

# Read Phone Brand


In [18]:
train = pd.read_csv(".\\raw_data\gender_age_train.csv",dtype={'device_id': np.str})
train.drop(["age", "gender"], axis=1, inplace=True)

test = pd.read_csv(".\\raw_data\gender_age_test.csv",dtype={'device_id': np.str})
test["group"] = np.nan

In [19]:
events.head()

Unnamed: 0,device_id,app_id
0,-100015673884079572,app_id:8052965594967667886 app_id:434865995276...
1,-1000458529741848912,app_id:5736986478023572714 app_id:-89671151722...
2,-1000667340060427374,app_id:4348659952760821294 app_id:-17715907062...
3,-100098646088222553,app_id:-5188663207849784426 app_id:-3892853652...
4,-100101996136889832,app_id:-1442117565864503182 app_id:-1771590706...


In [20]:
longitude.head()

Unnamed: 0,device_id,long_max,long_min,long_mean,long_std,long_median
0,-100015673884079572,0.0,0.0,0.0,0.0,0.0
1,-1000458529741848912,0.0,0.0,0.0,0.0,0.0
2,-1000667340060427374,116.69,0.0,19.851809,44.066648,0.0
3,-100098646088222553,103.52,103.52,103.52,0.0,103.52
4,-100101996136889832,114.4,114.4,114.4,0.0,114.4


In [21]:
print(latitude.shape)
latitude.head()

(60822, 6)


Unnamed: 0,device_id,lat_max,lat_min,lat_mean,lat_std,lat_median
0,-100015673884079572,0.0,0.0,0.0,0.0,0.0
1,-1000458529741848912,0.0,0.0,0.0,0.0,0.0
2,-1000667340060427374,116.69,0.0,19.851809,44.066648,0.0
3,-100098646088222553,103.52,103.52,103.52,0.0,103.52
4,-100101996136889832,114.4,114.4,114.4,0.0,114.4


In [22]:
print(hour.shape)
hour.head()

(60822, 6)


Unnamed: 0,device_id,hour_max,hour_min,hour_mean,hour_std,hour_median
0,-100015673884079572,23,0,11.38,7.233398,10.0
1,-1000458529741848912,2,2,2.0,0.0,2.0
2,-1000667340060427374,23,0,14.297872,5.397789,13.5
3,-100098646088222553,9,9,9.0,0.0,9.0
4,-100101996136889832,22,22,22.0,0.0,22.0


In [23]:
minute.head()

Unnamed: 0,device_id,min_max,min_min,min_mean,min_std,min_median
0,-100015673884079572,57,0,27.34,17.214861,30.0
1,-1000458529741848912,5,5,5.0,0.0,5.0
2,-1000667340060427374,59,0,18.191489,19.30473,6.5
3,-100098646088222553,28,28,28.0,0.0,28.0
4,-100101996136889832,50,50,50.0,0.0,50.0


In [24]:
print(train.shape)
print(test.shape)
split_len = len(train)
Df = pd.concat((train, test), axis=0, ignore_index=True)

(74645, 2)
(112071, 2)


In [25]:
m = pd.merge(events, hour, how='inner',on='device_id')
m1 = pd.merge(m, minute, how='inner',on='device_id')
m2 = pd.merge(m1, latitude, how='inner',on='device_id')
device_info = pd.merge(m2, longitude, how='inner',on='device_id')
del m
del m1
del m2
print(device_info.shape)
device_info.head()

(60822, 22)


Unnamed: 0,device_id,app_id,hour_max,hour_min,hour_mean,hour_std,hour_median,min_max,min_min,min_mean,...,lat_max,lat_min,lat_mean,lat_std,lat_median,long_max,long_min,long_mean,long_std,long_median
0,-100015673884079572,app_id:8052965594967667886 app_id:434865995276...,23,0,11.38,7.233398,10.0,57,0,27.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1000458529741848912,app_id:5736986478023572714 app_id:-89671151722...,2,2,2.0,0.0,2.0,5,5,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1000667340060427374,app_id:4348659952760821294 app_id:-17715907062...,23,0,14.297872,5.397789,13.5,59,0,18.191489,...,116.69,0.0,19.851809,44.066648,0.0,116.69,0.0,19.851809,44.066648,0.0
3,-100098646088222553,app_id:-5188663207849784426 app_id:-3892853652...,9,9,9.0,0.0,9.0,28,28,28.0,...,103.52,103.52,103.52,0.0,103.52,103.52,103.52,103.52,0.0,103.52
4,-100101996136889832,app_id:-1442117565864503182 app_id:-1771590706...,22,22,22.0,0.0,22.0,50,50,50.0,...,114.4,114.4,114.4,0.0,114.4,114.4,114.4,114.4,0.0,114.4


In [26]:
Df = pd.merge(Df, device_info, how='left',on='device_id')
Df['app_id'].fillna('-1',inplace=True)
Df.fillna(-1,inplace=True)

In [27]:
Df = pd.merge(Df, pbd, how="left", on="device_id")

In [28]:
Df.columns

Index(['device_id', 'group', 'app_id', 'hour_max', 'hour_min', 'hour_mean',
       'hour_std', 'hour_median', 'min_max', 'min_min', 'min_mean', 'min_std',
       'min_median', 'lat_max', 'lat_min', 'lat_mean', 'lat_std', 'lat_median',
       'long_max', 'long_min', 'long_mean', 'long_std', 'long_median',
       'phone_brand', 'device_model'],
      dtype='object')

In [29]:
Df.head()

Unnamed: 0,device_id,group,app_id,hour_max,hour_min,hour_mean,hour_std,hour_median,min_max,min_min,...,lat_mean,lat_std,lat_median,long_max,long_min,long_mean,long_std,long_median,phone_brand,device_model
0,-8076087639492063270,M32-38,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,小米,MI 2
1,-2897161552818060146,M32-38,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,小米,MI 2
2,-8260683887967679142,M32-38,app_id:-671205476771545838 app_id:287038913497...,14.0,14.0,14.0,0.0,14.0,23.0,23.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,小米,MI 2
3,-4938849341048082022,M29-31,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,小米,红米note
4,245133531816851882,M29-31,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,小米,MI 3


In [30]:
# Encode
le_brand = LabelEncoder()
le_brand.fit(pbd['phone_brand'])
le_model = LabelEncoder()
le_model.fit(pbd['device_model'])
le_app = LabelEncoder()
Df['app_id'] = le_app.fit_transform(Df['app_id'])


Df['phone_brand'] = le_brand.transform(Df['phone_brand'])
Df['device_model'] = le_model.transform(Df['device_model'])

In [31]:
train = Df[:split_len]
test = Df[split_len:]

print(train.shape)
print(test.shape)

(74645, 25)
(112071, 25)


In [32]:
X_train = train.drop(['device_id','group'],axis=1)
X_test = test.drop(['device_id','group'],axis=1)
Y_train = train['group']

In [None]:
from sklearn import pipeline, grid_search
print("\nTraining Ensamble Random Forest ....")
rfc = RandomForestClassifier(n_estimators = 200, n_jobs=-1, random_state=23)
param_grid = {'min_samples_leaf': [25,50,75]}
rfc_gs = grid_search.GridSearchCV(estimator = rfc, param_grid = param_grid, cv = 6, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
rfc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", rfc_gs.best_params_)
print("Best CV score:", rfc_gs.best_score_)

In [33]:
rfc = RandomForestClassifier(n_estimators = 300, n_jobs=-1, random_state=23, min_samples_leaf= 50)
rfc.fit(X_train, Y_train)
features_lb = sorted(zip(map(lambda x: round(x, 4), rfc.feature_importances_), X_train.columns), reverse=True, key=lambda pair: pair[0])

In [34]:
features_lb

[(0.19489999999999999, 'device_model'),
 (0.1694, 'phone_brand'),
 (0.078899999999999998, 'app_id'),
 (0.060400000000000002, 'hour_std'),
 (0.0487, 'min_std'),
 (0.047399999999999998, 'hour_mean'),
 (0.042099999999999999, 'hour_min'),
 (0.0419, 'min_mean'),
 (0.039100000000000003, 'min_max'),
 (0.0361, 'min_median'),
 (0.032000000000000001, 'hour_median'),
 (0.0263, 'min_min'),
 (0.022700000000000001, 'lat_std'),
 (0.0223, 'hour_max'),
 (0.0223, 'long_max'),
 (0.0218, 'long_std'),
 (0.021100000000000001, 'long_mean'),
 (0.020500000000000001, 'lat_mean'),
 (0.020299999999999999, 'lat_max'),
 (0.010999999999999999, 'lat_median'),
 (0.0106, 'long_median'),
 (0.0051999999999999998, 'long_min'),
 (0.0051000000000000004, 'lat_min')]

In [35]:
preds = rfc.predict_proba(X_test)

In [44]:
lable_group = LabelEncoder()
Y_train_le = lable_group.fit(Y_train)
result = pd.DataFrame(preds, columns=lable_group.classes_)

In [49]:
result["device_id"] = test['device_id'].values
#order columns
cols = result.columns.tolist()
cols = cols[-1:] + cols[:-1]
result = result[cols]

In [None]:
result.head(10)

In [59]:
result.to_csv('.\\submissions\sub_rf.csv',index=False)

In [61]:
X_train

Unnamed: 0,app_id,hour_max,hour_min,hour_mean,hour_std,hour_median,min_max,min_min,min_mean,min_std,...,lat_mean,lat_std,lat_median,long_max,long_min,long_mean,long_std,long_median,phone_brand,device_model
0,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,51,749
1,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,51,749
2,8232,14.0,14.0,14.000000,0.000000,14.0,23.0,23.0,23.000000,0.000000,...,0.000000,0.000000,0.000,0.00,0.00,0.000000,0.000000,0.000,51,749
3,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,51,1524
4,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,51,753
5,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,7,908
6,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,117,396
7,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,51,1524
8,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,13,1246
9,0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000,-1.00,-1.00,-1.000000,-1.000000,-1.000,15,560
