In [8]:
%matplotlib inline

import os

import pandas as pd
import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
try:
    # Current scikit-learn: the splitters live in model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy scikit-learn releases that still ship cross_validation.
    from sklearn.cross_validation import train_test_split

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

DATA_DIR = '../data'

In [2]:
# Load the app-event log; only `event_id` and `app_id` are used below.
# The builtin `str` replaces `np.str`, a deprecated alias of the same type
# that current NumPy releases no longer provide.
# NOTE(review): the dtype entry for `device_id` is kept from the original, but
# nothing in this cell reads such a column from app_events.csv — confirm it
# is needed.
app_ev = pd.read_csv(os.path.join(DATA_DIR, 'app_events.csv'), dtype={'device_id': str})
# Collapse each event into one space-separated string of its distinct
# "app_id:<id>" tokens. The result is a Series indexed by event_id.
app_ev = app_ev.groupby("event_id")["app_id"].apply(
    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

In [3]:
# Load the raw event log, keeping device ids as strings. The builtin `str`
# replaces `np.str`, which current NumPy releases no longer provide.
events = pd.read_csv(os.path.join(DATA_DIR, 'events.csv'), dtype={'device_id': str})
# Attach the per-event app token string built in `app_ev`; events with no
# entry there get NaN.
events["app_id"] = events["event_id"].map(app_ev)

# Drops every row that has a missing value in any column, which includes the
# events that received no app tokens above.
events = events.dropna()

# The lookup Series is no longer needed; release it to free memory.
del app_ev

In [4]:
# Turn the per-event token strings into one row per distinct
# (app token, device) pair, with columns ['app_id', 'device_id'].
events = events[["device_id", "app_id"]]

# Collect the pairs in a single pass. This avoids joining, splitting and
# re-joining the token strings, and building one pd.Series per device.
# Tokens are sorted so the row order is reproducible between runs (plain set
# iteration order is not).
device_app_pairs = []
for device, token_strings in events.groupby("device_id")["app_id"]:
    tokens = {token
              for token_string in token_strings
              for token in str(token_string).split(" ")}
    device_app_pairs.extend((token, device) for token in sorted(tokens))

events = pd.DataFrame(device_app_pairs, columns=['app_id', 'device_id'])


# Phone brand / device model lookup, one row per device after de-duplication.
# The builtin `str` replaces `np.str`, which current NumPy releases no longer
# provide.
pbd = pd.read_csv(os.path.join(DATA_DIR, 'phone_brand_device_model.csv'),
                  dtype={'device_id': str})
# Keep the first record for any device_id listed more than once, so the later
# left merge cannot multiply rows. Reassigned instead of mutated in place so
# re-running the cell is safe.
pbd = pbd.drop_duplicates('device_id', keep='first')


# Training set: keep only the device id and the `group` target. The builtin
# `str` replaces `np.str`, which current NumPy releases no longer provide.
train = pd.read_csv(os.path.join(DATA_DIR, 'gender_age_train.csv'),
                    dtype={'device_id': str})
# `age` and `gender` are not used by this notebook; only `group` is modelled.
train = train.drop(["age", "gender"], axis=1)

# Test set: add an empty `group` column so it has the same columns as `train`
# and the two frames can be concatenated.
test = pd.read_csv(os.path.join(DATA_DIR, 'gender_age_test.csv'),
                   dtype={'device_id': str})
test["group"] = np.nan


# Group labels: encode the `group` strings as integer class ids. The fitted
# encoder is kept so its `classes_` can name the prediction columns later.
lable_group = LabelEncoder()
Y = lable_group.fit_transform(train["group"])

# Number of training rows.
split_len = len(train)

# Test device ids, kept for the submission file.
device_id = test["device_id"]

# Concat: stack train and test, then attach brand/model to every device.
# Devices missing from `pbd` get NaN from the left merge, which becomes the
# literal token suffix "nan" below.
Df = pd.concat((train, test), axis=0, ignore_index=True)
Df = Df.merge(pbd, how="left", on="device_id")

# Prefix each value with its column name so tokens from different sources
# cannot collide once they share a single `feature` column.
Df["phone_brand"] = "phone_brand:" + Df["phone_brand"].map(str)
Df["device_model"] = "device_model:" + Df["device_model"].map(str)

In [5]:
###################
#  Concat Feature
###################

# Build three (device_id, feature) tables, one per feature source, and stack
# them into a single long table.
# The second column is renamed with `rename`. The previous approach wrote
# into `columns.values`, which mutates the buffer of an immutable Index in
# place; that is unsupported and fails where the buffer is read-only.
f1 = Df[["device_id", "phone_brand"]].rename(
    columns={"phone_brand": "feature"})    # phone_brand
f2 = Df[["device_id", "device_model"]].rename(
    columns={"device_model": "feature"})   # device_model
f3 = events[["device_id", "app_id"]].rename(
    columns={"app_id": "feature"})         # app_id

# The merged frame is no longer needed once the feature tables exist.
del Df

FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)

In [6]:
print("# User-Item-Feature")

# The distinct devices and features fix the matrix dimensions.
device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()

# Integer-encode both axes. The device encoder is kept as `dec` because later
# cells use it to map device ids to row numbers.
dec = LabelEncoder()
row = dec.fit_transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])

# One entry of 1.0 per (device, feature) row of FLS.
data = np.ones(len(FLS))
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)),
    shape=(len(device_ids), len(feature_cs)),
)

# Keep only the feature columns that hold at least one stored entry.
keep_cols = sparse_matrix.getnnz(axis=0) > 0
sparse_matrix = sparse_matrix[:, keep_cols]

# User-Item-Feature


In [7]:
#################
#      Data
##################

# Map train/test device ids to their row numbers in the device-feature
# matrix, then take those rows in the frames' own order.
train_row = dec.transform(train["device_id"])
test_row = dec.transform(test["device_id"])

train_sp = sparse_matrix[train_row, :]
test_sp = sparse_matrix[test_row, :]

# Hold out 10% of the labelled rows for validation; the seed is fixed so the
# split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_sp,
    Y,
    train_size=.90,
    random_state=10,
)

##################
#   Feature Sel
##################
print("# Feature Selection")

# Score features with the ANOVA F-test on the training split only, and keep
# the top 23 percent.
selector = SelectPercentile(f_classif, percentile=23).fit(X_train, y_train)

# Apply the same column mask to every matrix used downstream.
X_train, X_val = selector.transform(X_train), selector.transform(X_val)
train_sp, test_sp = selector.transform(train_sp), selector.transform(test_sp)

print("# Num of Features: ", X_train.shape[1])

# Feature Selection
# Num of Features:  4823


In [14]:
## Model

# Wrap the train/validation splits for xgboost.
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

# Parameters for a multiclass linear booster; reused by the final fit.
params = {
    "objective": "multi:softprob",
    # Taken from the fitted label encoder so the class count cannot drift
    # away from the labels actually present in `Y`.
    "num_class": len(lable_group.classes_),
    "booster": "gblinear",
    # NOTE(review): max_depth, colsample_bytree and subsample are documented
    # by xgboost as tree-booster parameters; with booster="gblinear" they
    # presumably have no effect — confirm before tuning them.
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3,
    "colsample_bytree": 0.8,
    "subsample": 0.9
}

# 40 boosting rounds, monitored on both splits. With a patience of 25 rounds
# inside a 40-round budget, early stopping can only trigger if the validation
# loss stops improving within the first 15 rounds.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

Will train until eval error hasn't decreased in 25 rounds.
[0]	train-mlogloss:2.413323	eval-mlogloss:2.423159
[1]	train-mlogloss:2.379340	eval-mlogloss:2.395015
[2]	train-mlogloss:2.354908	eval-mlogloss:2.375592
[3]	train-mlogloss:2.335667	eval-mlogloss:2.360797
[4]	train-mlogloss:2.319801	eval-mlogloss:2.348936
[5]	train-mlogloss:2.306331	eval-mlogloss:2.339183
[6]	train-mlogloss:2.294741	eval-mlogloss:2.331023
[7]	train-mlogloss:2.284633	eval-mlogloss:2.324107
[8]	train-mlogloss:2.275750	eval-mlogloss:2.318206
[9]	train-mlogloss:2.267856	eval-mlogloss:2.313138
[10]	train-mlogloss:2.260778	eval-mlogloss:2.308756
[11]	train-mlogloss:2.254479	eval-mlogloss:2.304960
[12]	train-mlogloss:2.248751	eval-mlogloss:2.301650
[13]	train-mlogloss:2.243585	eval-mlogloss:2.298773
[14]	train-mlogloss:2.238862	eval-mlogloss:2.296248
[15]	train-mlogloss:2.234560	eval-mlogloss:2.294042
[16]	train-mlogloss:2.230633	eval-mlogloss:2.292102
[17]	train-mlogloss:2.227014	eval-mlogloss:2.290406
[18]	train-mlog

In [15]:
print("# Train")

# Create the output directory before the fit, so a missing folder does not
# throw away the trained model at write time.
submission_path = '../submissions/fine_tune_xgboost.gz'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# Refit on all labelled rows (train + validation) with the same parameters
# and round count, then predict class probabilities for the test devices.
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 40, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results: one probability column per encoded group, keyed by device id.
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
# Assign by position: the rows of `y_pre` follow the order of `test`, so the
# raw values are used and index alignment cannot introduce NaN ids.
result["device_id"] = device_id.values
result = result.set_index("device_id")
result.to_csv(submission_path, index=True,
              index_label='device_id', compression="gzip")

# Train
