In [1]:
# coding=utf8
# Based on yibo's R script

import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
#from sklearn.feature_extraction import FeatureHasher
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
#from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
#from sklearn.linear_model import LogisticRegression, SGDClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.metrics import log_loss

# Create bag-of-apps in character string format
# first by event
# then merge to generate larger bags by device

##################
#   App Events
##################
print("# Read App Events")
app_ev = pd.read_csv("F:kaggle_data/talking_data_mobile/app_events.csv", usecols=['event_id', 'app_id'])
# remove duplicates(app_id)
app_ev = app_ev.groupby("event_id")["app_id"].apply(
    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

##################
#     Events
##################
print("# Read Events")
events = pd.read_csv("F:kaggle_data/talking_data_mobile/events.csv", usecols=['event_id','device_id'])
events["app_id"] = events["event_id"].map(app_ev)

events = events.dropna()

del app_ev

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("F:kaggle_data/talking_data_mobile/phone_brand_device_model.csv")
pbd.drop_duplicates('device_id', keep='first', inplace=True)


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_train.csv", usecols=['device_id','group'])

test = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_test.csv")
                  
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################


f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"

FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)



###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)

print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 40, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('fine_tune.gz', index=True,
              index_label='device_id', compression="gzip")

# Read App Events
# Read Events
# Read Phone Brand
# Generate Train and Test
# User-Item-Feature
# Feature Selection




# Num of Features:  4823
[0]	train-mlogloss:2.4114	eval-mlogloss:2.42106
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.37642	eval-mlogloss:2.39219
[2]	train-mlogloss:2.35135	eval-mlogloss:2.37236
[3]	train-mlogloss:2.33158	eval-mlogloss:2.35727
[4]	train-mlogloss:2.31525	eval-mlogloss:2.34518
[5]	train-mlogloss:2.30141	eval-mlogloss:2.33525
[6]	train-mlogloss:2.28949	eval-mlogloss:2.32694
[7]	train-mlogloss:2.27909	eval-mlogloss:2.31991
[8]	train-mlogloss:2.26994	eval-mlogloss:2.31392
[9]	train-mlogloss:2.26184	eval-mlogloss:2.30878
[10]	train-mlogloss:2.25461	eval-mlogloss:2.30434
[11]	train-mlogloss:2.24813	eval-mlogloss:2.30051
[12]	train-mlogloss:2.24229	eval-mlogloss:2.29718
[13]	train-mlogloss:2.23701	eval-mlogloss:2.29427
[14]	train-mlogloss:2.23221	eval-mlogloss:2.29173
[15]	train-mlogloss:2.22784	eval-mlogloss:2.28951
[16]	train-mlogloss:2.22384	eval-ml

In [35]:
app_ev = pd.read_pickle('F:/app_ev')
app_ev.head()

event_id
2     app_id:7460082553072507347 app_id:-17588575798...
6     app_id:-5839858269967688123 app_id:-1633912854...
7     app_id:-5839858269967688123 app_id:-5408952623...
9     app_id:-5839858269967688123 app_id:74600825530...
16    app_id:7460082553072507347 app_id:353941197706...
Name: app_id, dtype: object

## add time group (零次，少次，多次)

In [56]:



# coding=utf8
# Based on yibo's R script

import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
#from sklearn.feature_extraction import FeatureHasher
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
#from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
#from sklearn.linear_model import LogisticRegression, SGDClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.metrics import log_loss

# Create bag-of-apps in character string format
# first by event
# then merge to generate larger bags by device

##################
#   App Events
##################
#print("# Read App Events")
#app_ev = pd.read_csv("F:kaggle_data/talking_data_mobile/app_events.csv", usecols=['event_id', 'app_id'])
# remove duplicates(app_id)
#app_ev = app_ev.groupby("event_id")["app_id"].apply(
#    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

#app_ev.to_pickle('F:/app_ev')
app_ev = pd.read_pickle('F:/app_ev')
##################
#     Events
##################
print("# Read Events")
events = pd.read_csv("F:kaggle_data/talking_data_mobile/events.csv", usecols=['event_id','device_id','timestamp'])
events["app_id"] = events["event_id"].map(app_ev)

del app_ev

print(events.shape)
events = events.dropna()
print('after dropna:' , events.shape)

#extract time feature
events['hour'] = pd.to_datetime(events.timestamp).dt.hour

#when the event happen in the events dataset
#%matplotlib inline
#events.hour.plot(kind='hist',bins=24)

# 1 to 4 clock in the morning 
events.loc[events.hour <= 4,'count'] = 1.0
events.loc[events.hour > 4,'count'] = 0.0

# how many times a person play in the morning 
counts = events.groupby('device_id')['count'].sum()
#group them
ind1 = counts < 4
ind2 = (counts >3) & (counts <10)
ind3 = counts >9
counts[ind1] = '少次'
counts[ind2] = '中次'
counts[ind3] = '多次'

del ind1, ind2, ind3

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("F:kaggle_data/talking_data_mobile/phone_brand_device_model.csv")
pbd.drop_duplicates('device_id', keep='first', inplace=True)


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_train.csv", usecols=['device_id','group'])

test = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_test.csv")
                  
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################


f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id
f4 = counts.reset_index()               # time group

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"
f4.columns.values[1] = 'feature'

FLS = pd.concat((f1, f2, f3, f4), axis=0, ignore_index=True)

print('add time group FLS.shape:' , FLS.shape)


###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()
print('device_ids:', device_ids.shape)
print('feature_cs:', feature_cs.shape)

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

print('add time train_sp:', train_sp.shape)

##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)
print('after select time train_sp:', train_sp.shape)
print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.05,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 35, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

# Read Events
(3252950, 4)
after dropna: (1488096, 4)
# Read Phone Brand
# Generate Train and Test
add time group FLS.shape: (2803279, 2)
# User-Item-Feature
device_ids: (189076,)
feature_cs: (20970,)


TypeError: unorderable types: str() > int()

In [61]:
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

print('add time train_sp:', train_sp.shape)

##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)
print('after select time train_sp:', train_sp.shape)
print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.05,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 35, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

add time train_sp: (74645, 20970)
# Feature Selection




after select time train_sp: (74645, 4823)
# Num of Features:  4823
[0]	train-mlogloss:2.42447	eval-mlogloss:2.43216
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.39394	eval-mlogloss:2.40653
[2]	train-mlogloss:2.37181	eval-mlogloss:2.38859
[3]	train-mlogloss:2.35408	eval-mlogloss:2.3746
[4]	train-mlogloss:2.33916	eval-mlogloss:2.36309
[5]	train-mlogloss:2.32625	eval-mlogloss:2.35336
[6]	train-mlogloss:2.31491	eval-mlogloss:2.34499
[7]	train-mlogloss:2.30482	eval-mlogloss:2.33771
[8]	train-mlogloss:2.29577	eval-mlogloss:2.33131
[9]	train-mlogloss:2.28761	eval-mlogloss:2.32566
[10]	train-mlogloss:2.2802	eval-mlogloss:2.32064
[11]	train-mlogloss:2.27344	eval-mlogloss:2.31617
[12]	train-mlogloss:2.26725	eval-mlogloss:2.31217
[13]	train-mlogloss:2.26157	eval-mlogloss:2.30859
[14]	train-mlogloss:2.25633	eval-mlogloss:2.30535
[15]	train-mlogloss:2.25149	eval-mlogloss:2.

In [62]:
print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 30, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('fine_tune.gz', index=True,
              index_label='device_id', compression="gzip")

# Train


### 每次一个feature

In [68]:
# coding=utf8
# Based on yibo's R script

import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
#from sklearn.feature_extraction import FeatureHasher
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
#from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
#from sklearn.linear_model import LogisticRegression, SGDClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.metrics import log_loss

# Create bag-of-apps in character string format
# first by event
# then merge to generate larger bags by device

##################
#   App Events
##################
#print("# Read App Events")
#app_ev = pd.read_csv("F:kaggle_data/talking_data_mobile/app_events.csv", usecols=['event_id', 'app_id'])
# remove duplicates(app_id)
#app_ev = app_ev.groupby("event_id")["app_id"].apply(
#    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

#app_ev.to_pickle('F:/app_ev')
app_ev = pd.read_pickle('F:/app_ev')
##################
#     Events
##################
print("# Read Events")
events = pd.read_csv("F:kaggle_data/talking_data_mobile/events.csv", usecols=['event_id','device_id','timestamp'])
events["app_id"] = events["event_id"].map(app_ev)

del app_ev

print(events.shape)
events = events.dropna()
print('after dropna:' , events.shape)

#extract time feature
events['hour'] = pd.to_datetime(events.timestamp).dt.hour

#when the event happen in the events dataset
#%matplotlib inline
#events.hour.plot(kind='hist',bins=24)

# 1 to 4 clock in the morning 
events.loc[events.hour <= 4,'count'] = 1.0
events.loc[events.hour > 4,'count'] = 0.0

# how many times a person play in the morning 
counts = events.groupby('device_id')['count'].sum()
#group them
counts = counts.reset_index()
counts['count'] = counts['count'].astype(np.str) + '次'

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("F:kaggle_data/talking_data_mobile/phone_brand_device_model.csv")
pbd.drop_duplicates('device_id', keep='first', inplace=True)


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_train.csv", usecols=['device_id','group'])

test = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_test.csv")
                  
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################


f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id
         # time group

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"
counts.columns.values[1] = 'feature'

FLS = pd.concat((f1, f2, f3, counts), axis=0, ignore_index=True)

print('add time group FLS.shape:' , FLS.shape)


###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()
print('device_ids:', device_ids.shape)
print('feature_cs:', feature_cs.shape)

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

print('add time train_sp:', train_sp.shape)

##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)
print('after select time train_sp:', train_sp.shape)
print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.05,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 35, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

# Read Events
(3252950, 4)
after dropna: (1488096, 4)
# Read Phone Brand
# Generate Train and Test
add time group FLS.shape: (2803279, 2)
# User-Item-Feature
device_ids: (189076,)
feature_cs: (21084,)
add time train_sp: (74645, 21084)
# Feature Selection




after select time train_sp: (74645, 4850)
# Num of Features:  4850
[0]	train-mlogloss:2.42432	eval-mlogloss:2.43211
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.39381	eval-mlogloss:2.40653
[2]	train-mlogloss:2.37169	eval-mlogloss:2.38862
[3]	train-mlogloss:2.35395	eval-mlogloss:2.37463
[4]	train-mlogloss:2.33901	eval-mlogloss:2.36313
[5]	train-mlogloss:2.32609	eval-mlogloss:2.35339
[6]	train-mlogloss:2.31473	eval-mlogloss:2.34502
[7]	train-mlogloss:2.30463	eval-mlogloss:2.33774
[8]	train-mlogloss:2.29557	eval-mlogloss:2.33135
[9]	train-mlogloss:2.28739	eval-mlogloss:2.32571
[10]	train-mlogloss:2.27996	eval-mlogloss:2.32069
[11]	train-mlogloss:2.27319	eval-mlogloss:2.31621
[12]	train-mlogloss:2.26699	eval-mlogloss:2.31221
[13]	train-mlogloss:2.26129	eval-mlogloss:2.30862
[14]	train-mlogloss:2.25604	eval-mlogloss:2.30538
[15]	train-mlogloss:2.25119	eval-mlogloss:

In [70]:
params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.055,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 35, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

[0]	train-mlogloss:2.4208	eval-mlogloss:2.42912
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.38912	eval-mlogloss:2.4027
[2]	train-mlogloss:2.36616	eval-mlogloss:2.38423
[3]	train-mlogloss:2.34781	eval-mlogloss:2.36989
[4]	train-mlogloss:2.33243	eval-mlogloss:2.35818
[5]	train-mlogloss:2.3192	eval-mlogloss:2.34833
[6]	train-mlogloss:2.30761	eval-mlogloss:2.33993
[7]	train-mlogloss:2.29737	eval-mlogloss:2.33266
[8]	train-mlogloss:2.28822	eval-mlogloss:2.32633
[9]	train-mlogloss:2.28001	eval-mlogloss:2.32077
[10]	train-mlogloss:2.27258	eval-mlogloss:2.31587
[11]	train-mlogloss:2.26584	eval-mlogloss:2.31154
[12]	train-mlogloss:2.25969	eval-mlogloss:2.30768
[13]	train-mlogloss:2.25407	eval-mlogloss:2.30424
[14]	train-mlogloss:2.24891	eval-mlogloss:2.30117
[15]	train-mlogloss:2.24415	eval-mlogloss:2.29842
[16]	train-mlogloss:2.23976	eval-mlogloss:2.29596
[17]	train-m

In [75]:
print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 35, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('fine_tune.gz', index=True,
              index_label='device_id', compression="gzip")

# Train


In [80]:
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

[0]	train-mlogloss:2.41116	eval-mlogloss:2.42091
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.37617	eval-mlogloss:2.3921
[2]	train-mlogloss:2.35108	eval-mlogloss:2.37231
[3]	train-mlogloss:2.33127	eval-mlogloss:2.35722
[4]	train-mlogloss:2.31491	eval-mlogloss:2.34513
[5]	train-mlogloss:2.30103	eval-mlogloss:2.33518
[6]	train-mlogloss:2.28907	eval-mlogloss:2.32687
[7]	train-mlogloss:2.27863	eval-mlogloss:2.31982
[8]	train-mlogloss:2.26945	eval-mlogloss:2.31382
[9]	train-mlogloss:2.26132	eval-mlogloss:2.30866
[10]	train-mlogloss:2.25406	eval-mlogloss:2.30422
[11]	train-mlogloss:2.24755	eval-mlogloss:2.30036
[12]	train-mlogloss:2.24168	eval-mlogloss:2.29702
[13]	train-mlogloss:2.23637	eval-mlogloss:2.29409
[14]	train-mlogloss:2.23155	eval-mlogloss:2.29154
[15]	train-mlogloss:2.22716	eval-mlogloss:2.28931
[16]	train-mlogloss:2.22314	eval-mlogloss:2.28735
[17]	train

In [81]:
print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 35, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('fine_tune.gz', index=True,
              index_label='device_id', compression="gzip")

# Train


### without add time group

In [63]:

import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
#from sklearn.feature_extraction import FeatureHasher
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
#from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
#from sklearn.linear_model import LogisticRegression, SGDClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.metrics import log_loss

# Create bag-of-apps in character string format
# first by event
# then merge to generate larger bags by device

##################
#   App Events
##################
#print("# Read App Events")
#app_ev = pd.read_csv("F:kaggle_data/talking_data_mobile/app_events.csv", usecols=['event_id', 'app_id'])
# remove duplicates(app_id)
#app_ev = app_ev.groupby("event_id")["app_id"].apply(
#    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

#app_ev.to_pickle('F:/app_ev')
app_ev = pd.read_pickle('F:/app_ev')
##################
#     Events
##################
print("# Read Events")
events = pd.read_csv("F:kaggle_data/talking_data_mobile/events.csv", usecols=['event_id','device_id','timestamp'])
events["app_id"] = events["event_id"].map(app_ev)

del app_ev

print(events.shape)
events = events.dropna()
print(events.shape)

#extract time feature
#events['hour'] = pd.to_datetime(events.timestamp).dt.hour

#when the event happen in the events dataset
#%matplotlib inline
#events.hour.plot(kind='hist',bins=24)

# 1 to 5 clock in the morning 
#events.loc[events.hour <= 5,'count'] = 1.0
#events.loc[events.hour > 5,'count'] = 0.0

# how many times a person play in the morning 
#counts = events.groupby('device_id')['count'].sum()
#group them
#ind1 = counts == 0
#ind2 = (counts >0) & (counts <5)
#ind3 = counts >4
#counts[ind1] = '零次'
#counts[ind2] = '少次'
#counts[ind3] = '多次'

#del ind1, ind2, ind3

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("F:kaggle_data/talking_data_mobile/phone_brand_device_model.csv")
pbd.drop_duplicates('device_id', keep='first', inplace=True)


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_train.csv", usecols=['device_id','group'])

test = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_test.csv")
                  
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################


f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id
#f4 = counts.reset_index()               # time group

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"
#f4.columns.values[1] = 'feature'

FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)

print(' not add time group FLS.shape:' , FLS.shape)


###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()
print('device_ids:', device_ids.shape)
print('feature_cs:', feature_cs.shape)

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

print('not add time train_sp:', train_sp.shape)
##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)

print('after select time train_sp:', train_sp.shape)
print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

# Read Events
(3252950, 4)
(1488096, 4)
# Read Phone Brand
# Generate Train and Test
 not add time group FLS.shape: (2742457, 2)
# User-Item-Feature
device_ids: (189076,)
feature_cs: (20967,)
not add time train_sp: (74645, 20967)
# Feature Selection




after select time train_sp: (74645, 4823)
# Num of Features:  4823
[0]	train-mlogloss:2.41138	eval-mlogloss:2.42104
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.37627	eval-mlogloss:2.39207
[2]	train-mlogloss:2.3512	eval-mlogloss:2.37224
[3]	train-mlogloss:2.33145	eval-mlogloss:2.35717
[4]	train-mlogloss:2.31514	eval-mlogloss:2.3451
[5]	train-mlogloss:2.30131	eval-mlogloss:2.33518
[6]	train-mlogloss:2.2894	eval-mlogloss:2.32688
[7]	train-mlogloss:2.27901	eval-mlogloss:2.31986
[8]	train-mlogloss:2.26988	eval-mlogloss:2.31388
[9]	train-mlogloss:2.26178	eval-mlogloss:2.30874
[10]	train-mlogloss:2.25456	eval-mlogloss:2.30432
[11]	train-mlogloss:2.24808	eval-mlogloss:2.30048
[12]	train-mlogloss:2.24225	eval-mlogloss:2.29715
[13]	train-mlogloss:2.23697	eval-mlogloss:2.29424
[14]	train-mlogloss:2.23218	eval-mlogloss:2.29171
[15]	train-mlogloss:2.22781	eval-mlogloss:2.2

## add is_active feature

In [82]:
# coding=utf8
# Based on yibo's R script

import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
#from sklearn.feature_extraction import FeatureHasher
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
#from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
#from sklearn.linear_model import LogisticRegression, SGDClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.metrics import log_loss

# Create bag-of-apps in character string format
# first by event
# then merge to generate larger bags by device

##################
#   App Events
##################
print("# Read App Events")
app_ev = pd.read_csv("F:kaggle_data/talking_data_mobile/app_events.csv", usecols=['event_id', 'app_id', 'is_active'])
#generate is_active feature
is_active = app_ev.groupby('event_id').sum().is_active

#remove duplicates(app_id)
app_ev = app_ev.groupby("event_id")["app_id"].apply(
    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

#app_ev.to_pickle('F:/app_ev')

##################
#     Events
##################
print("# Read Events")
events = pd.read_csv("F:kaggle_data/talking_data_mobile/events.csv", usecols=['event_id','device_id','timestamp'])
events["app_id"] = events["event_id"].map(app_ev)
events['is_active'] = events['event_id'].map(is_active)

del app_ev
del is_active
events.to_pickle('F:/events')

print(events.shape)
events = events.dropna()
print('after dropna:' , events.shape)

#extract time feature
events['hour'] = pd.to_datetime(events.timestamp).dt.hour

#when the event happen in the events dataset
#%matplotlib inline
#events.hour.plot(kind='hist',bins=24)

# 1 to 4 clock in the morning 
events.loc[events.hour <= 4,'count'] = 1.0
events.loc[events.hour > 4,'count'] = 0.0

# how many times a person play in the morning 
counts = events.groupby('device_id')['count'].sum()
#group feature
counts = counts.reset_index()
counts['count'] = counts['count'].astype(np.str) + '次'
#is_active feature
is_active = events.groupby('device_id')['is_active'].sum()
is_active = is_active.fillna('unknow')
is_active = is_active.reset_index()
is_active['is_active'] = is_active['is_active'].astype(np.str) + '个'

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("F:kaggle_data/talking_data_mobile/phone_brand_device_model.csv")
pbd.drop_duplicates('device_id', keep='first', inplace=True)


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_train.csv", usecols=['device_id','group'])

test = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_test.csv")
                  
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################


f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id
         # time group

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"
counts.columns.values[1] = 'feature'
is_active.columns.values[1] = 'feature'

FLS = pd.concat((f1, f2, f3, counts, is_active), axis=0, ignore_index=True)

print('add time group FLS.shape:' , FLS.shape)


###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()
print('device_ids:', device_ids.shape)
print('feature_cs:', feature_cs.shape)

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

print('add time train_sp:', train_sp.shape)

##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)
print('after select time train_sp:', train_sp.shape)
print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.05,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 35, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

# Read App Events
# Read Events
(3252950, 5)
after dropna: (1488096, 5)


TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('<U32') dtype('<U32') dtype('<U32')

In [86]:
is_active.head()

Unnamed: 0,device_id,is_active
0,-9222956879900151005,710.0
1,-9222661944218806987,46.0
2,-9222399302879214035,20.0
3,-9221825537663503111,252.0
4,-9221767098072603291,79.0


In [87]:
is_active['is_active'] = is_active['is_active'].astype(np.str) + '个'

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("F:kaggle_data/talking_data_mobile/phone_brand_device_model.csv")
pbd.drop_duplicates('device_id', keep='first', inplace=True)


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_train.csv", usecols=['device_id','group'])

test = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_test.csv")
                  
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################


f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id
         # time group

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"
counts.columns.values[1] = 'feature'
is_active.columns.values[1] = 'feature'

FLS = pd.concat((f1, f2, f3, counts, is_active), axis=0, ignore_index=True)

print('add time group FLS.shape:' , FLS.shape)


###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()
print('device_ids:', device_ids.shape)
print('feature_cs:', feature_cs.shape)

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

print('add time train_sp:', train_sp.shape)

##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)
print('after select time train_sp:', train_sp.shape)
print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.05,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 35, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

# Read Phone Brand
# Generate Train and Test
add time group FLS.shape: (2864101, 2)
# User-Item-Feature
device_ids: (189076,)
feature_cs: (23627,)
add time train_sp: (74645, 23627)
# Feature Selection




after select time train_sp: (74645, 5434)
# Num of Features:  5434
[0]	train-mlogloss:2.42399	eval-mlogloss:2.43209
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.39328	eval-mlogloss:2.40646
[2]	train-mlogloss:2.371	eval-mlogloss:2.3885
[3]	train-mlogloss:2.35317	eval-mlogloss:2.3745
[4]	train-mlogloss:2.33815	eval-mlogloss:2.363
[5]	train-mlogloss:2.32516	eval-mlogloss:2.35327
[6]	train-mlogloss:2.31374	eval-mlogloss:2.34491
[7]	train-mlogloss:2.30358	eval-mlogloss:2.33762
[8]	train-mlogloss:2.29447	eval-mlogloss:2.33123
[9]	train-mlogloss:2.28624	eval-mlogloss:2.32558
[10]	train-mlogloss:2.27877	eval-mlogloss:2.32056
[11]	train-mlogloss:2.27195	eval-mlogloss:2.31607
[12]	train-mlogloss:2.26571	eval-mlogloss:2.31207
[13]	train-mlogloss:2.25997	eval-mlogloss:2.30848
[14]	train-mlogloss:2.25468	eval-mlogloss:2.30524
[15]	train-mlogloss:2.24979	eval-mlogloss:2.3023

In [90]:
params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 45, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

[0]	train-mlogloss:2.41068	eval-mlogloss:2.42089
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.37543	eval-mlogloss:2.39202
[2]	train-mlogloss:2.35017	eval-mlogloss:2.37218
[3]	train-mlogloss:2.33027	eval-mlogloss:2.35707
[4]	train-mlogloss:2.31382	eval-mlogloss:2.34498
[5]	train-mlogloss:2.29987	eval-mlogloss:2.33503
[6]	train-mlogloss:2.28784	eval-mlogloss:2.32672
[7]	train-mlogloss:2.27735	eval-mlogloss:2.31969
[8]	train-mlogloss:2.26811	eval-mlogloss:2.31368
[9]	train-mlogloss:2.25992	eval-mlogloss:2.30852
[10]	train-mlogloss:2.25261	eval-mlogloss:2.30408
[11]	train-mlogloss:2.24605	eval-mlogloss:2.30022
[12]	train-mlogloss:2.24014	eval-mlogloss:2.29687
[13]	train-mlogloss:2.23479	eval-mlogloss:2.29395
[14]	train-mlogloss:2.22993	eval-mlogloss:2.29139
[15]	train-mlogloss:2.2255	eval-mlogloss:2.28915
[16]	train-mlogloss:2.22145	eval-mlogloss:2.28719
[17]	train

In [91]:
print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 40, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('fine_tune.gz', index=True,
              index_label='device_id', compression="gzip")

# Train


In [34]:
pd.DataFrame(arr)

Unnamed: 0,0
0,-9223321966609553846
1,-9223067244542181226
2,-9223042152723782980
3,-9222956879900151005
4,-9222896629442493034
5,-9222894989445037972
6,-9222894319703307262
7,-9222754701995937853
8,-9222661944218806987
9,-9222399302879214035


In [27]:
# add time group
FLS.shape

(2803279, 2)

In [28]:
train_sp.shape

(74645, 4823)

In [29]:
##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.05,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 35, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)



[0]	train-mlogloss:2.42441	eval-mlogloss:2.43213
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.39393	eval-mlogloss:2.40658
[2]	train-mlogloss:2.37182	eval-mlogloss:2.38865
[3]	train-mlogloss:2.3541	eval-mlogloss:2.37466
[4]	train-mlogloss:2.33917	eval-mlogloss:2.36314
[5]	train-mlogloss:2.32625	eval-mlogloss:2.3534
[6]	train-mlogloss:2.3149	eval-mlogloss:2.34503
[7]	train-mlogloss:2.30481	eval-mlogloss:2.33774
[8]	train-mlogloss:2.29576	eval-mlogloss:2.33134
[9]	train-mlogloss:2.28759	eval-mlogloss:2.32569
[10]	train-mlogloss:2.28017	eval-mlogloss:2.32066
[11]	train-mlogloss:2.27341	eval-mlogloss:2.31619
[12]	train-mlogloss:2.26722	eval-mlogloss:2.31218
[13]	train-mlogloss:2.26153	eval-mlogloss:2.30858
[14]	train-mlogloss:2.25629	eval-mlogloss:2.30534
[15]	train-mlogloss:2.25144	eval-mlogloss:2.30242
[16]	train-mlogloss:2.24695	eval-mlogloss:2.29978
[17]	train-m

In [14]:
print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 30, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('fine_tune.gz', index=True,
              index_label='device_id', compression="gzip")

# Train


In [26]:
train_sp.shape

(74645, 4823)

In [None]:
FLS

In [22]:
FLS.feature.nunique()

20970

## parameter twick

In [1]:
# coding=utf8
# Based on yibo's R script

import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
#from sklearn.feature_extraction import FeatureHasher
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
#from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
#from sklearn.linear_model import LogisticRegression, SGDClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.metrics import log_loss

# Create bag-of-apps in character string format
# first by event
# then merge to generate larger bags by device

##################
#   App Events
##################
print("# Read App Events")
app_ev = pd.read_csv("F:kaggle_data/talking_data_mobile/app_events.csv", usecols=['event_id', 'app_id'])
# remove duplicates(app_id)
app_ev = app_ev.groupby("event_id")["app_id"].apply(
    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

##################
#     Events
##################
print("# Read Events")
events = pd.read_csv("F:kaggle_data/talking_data_mobile/events.csv", usecols=['event_id','device_id'])
events["app_id"] = events["event_id"].map(app_ev)

events = events.dropna()

del app_ev

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("F:kaggle_data/talking_data_mobile/phone_brand_device_model.csv")
pbd.drop_duplicates('device_id', keep='first', inplace=True)


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_train.csv", usecols=['device_id','group'])

test = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_test.csv")
                  
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################


f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"

FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)



###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)

print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 3,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 5,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

# Read App Events
# Read Events
# Read Phone Brand
# Generate Train and Test
# User-Item-Feature
# Feature Selection




# Num of Features:  4823
[0]	train-mlogloss:2.418	eval-mlogloss:2.42501
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.38685	eval-mlogloss:2.39787
[2]	train-mlogloss:2.36486	eval-mlogloss:2.37919
[3]	train-mlogloss:2.34767	eval-mlogloss:2.36487
[4]	train-mlogloss:2.33354	eval-mlogloss:2.35331
[5]	train-mlogloss:2.32161	eval-mlogloss:2.34371
[6]	train-mlogloss:2.31136	eval-mlogloss:2.33561
[7]	train-mlogloss:2.30246	eval-mlogloss:2.32868
[8]	train-mlogloss:2.29465	eval-mlogloss:2.32272
[9]	train-mlogloss:2.28775	eval-mlogloss:2.31755
[10]	train-mlogloss:2.28163	eval-mlogloss:2.31304
[11]	train-mlogloss:2.27616	eval-mlogloss:2.30909
[12]	train-mlogloss:2.27125	eval-mlogloss:2.30562
[13]	train-mlogloss:2.26683	eval-mlogloss:2.30255
[14]	train-mlogloss:2.26283	eval-mlogloss:2.29984
[15]	train-mlogloss:2.2592	eval-mlogloss:2.29743
[16]	train-mlogloss:2.25589	eval-mlog

In [6]:
params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 7,
    "eval_metric": "mlogloss",
    "eta": 0.04,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 30, evals=watchlist,
                early_stopping_rounds=10, verbose_eval=True)



[0]	train-mlogloss:2.43232	eval-mlogloss:2.43422
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:2.40458	eval-mlogloss:2.40708
[2]	train-mlogloss:2.38432	eval-mlogloss:2.38716
[3]	train-mlogloss:2.36798	eval-mlogloss:2.37104
[4]	train-mlogloss:2.35414	eval-mlogloss:2.35734
[5]	train-mlogloss:2.34206	eval-mlogloss:2.34538
[6]	train-mlogloss:2.33136	eval-mlogloss:2.33478
[7]	train-mlogloss:2.32175	eval-mlogloss:2.32526
[8]	train-mlogloss:2.31306	eval-mlogloss:2.31666
[9]	train-mlogloss:2.30514	eval-mlogloss:2.30882
[10]	train-mlogloss:2.29789	eval-mlogloss:2.30166
[11]	train-mlogloss:2.29122	eval-mlogloss:2.29508
[12]	train-mlogloss:2.28506	eval-mlogloss:2.289
[13]	train-mlogloss:2.27935	eval-mlogloss:2.28338
[14]	train-mlogloss:2.27405	eval-mlogloss:2.27817
[15]	train-mlogloss:2.26912	eval-mlogloss:2.27332
[16]	train-mlogloss:2.26451	eval-mlogloss:2.2688
[17]	train-m

In [2]:
print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 30, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('sub17.gz', index=True,
              index_label='device_id', compression="gzip")

# Train


In [3]:
pwd

'C:\\Users\\User\\Documents\\python_file\\kaggle\\Talking Data Mobile'

In [5]:
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 3,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 5,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 30, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)
print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 20, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('sub21.gz', index=True,
              index_label='device_id', compression="gzip")

[0]	train-mlogloss:2.41799	eval-mlogloss:2.42502
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.38687	eval-mlogloss:2.39791
[2]	train-mlogloss:2.36487	eval-mlogloss:2.37922
[3]	train-mlogloss:2.34768	eval-mlogloss:2.3649
[4]	train-mlogloss:2.33355	eval-mlogloss:2.35334
[5]	train-mlogloss:2.32161	eval-mlogloss:2.34374
[6]	train-mlogloss:2.31136	eval-mlogloss:2.33563
[7]	train-mlogloss:2.30246	eval-mlogloss:2.32871
[8]	train-mlogloss:2.29465	eval-mlogloss:2.32274
[9]	train-mlogloss:2.28776	eval-mlogloss:2.31757
[10]	train-mlogloss:2.28163	eval-mlogloss:2.31306
[11]	train-mlogloss:2.27616	eval-mlogloss:2.30911
[12]	train-mlogloss:2.27125	eval-mlogloss:2.30565
[13]	train-mlogloss:2.26683	eval-mlogloss:2.30258
[14]	train-mlogloss:2.26283	eval-mlogloss:2.29987
[15]	train-mlogloss:2.2592	eval-mlogloss:2.29746
[16]	train-mlogloss:2.2559	eval-mlogloss:2.29532
[17]	train-m

## add hour（每个events的hour） feature

In [1]:
# coding=utf8
# Based on yibo's R script

import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
#from sklearn.feature_extraction import FeatureHasher
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
#from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
#from sklearn.linear_model import LogisticRegression, SGDClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.metrics import log_loss

# Create bag-of-apps in character string format
# first by event
# then merge to generate larger bags by device

##################
#   App Events
##################
print("# Read App Events")
app_ev = pd.read_csv("F:kaggle_data/talking_data_mobile/app_events.csv", usecols=['event_id', 'app_id'])
# remove duplicates(app_id)
app_ev = app_ev.groupby("event_id")["app_id"].apply(
    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

##################
#     Events
##################
print("# Read Events")
events = pd.read_csv("F:kaggle_data/talking_data_mobile/events.csv", usecols=['event_id','device_id'])
events["app_id"] = events["event_id"].map(app_ev)

events = events.dropna()

del app_ev

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("F:kaggle_data/talking_data_mobile/phone_brand_device_model.csv")
pbd.drop_duplicates('device_id', keep='first', inplace=True)


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_train.csv", usecols=['device_id','group'])

test = pd.read_csv("F:kaggle_data/talking_data_mobile/gender_age_test.csv")
                  
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################

f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id

f4 = pd.read_csv("F:/kaggle_data/talking_data_mobile/events.csv", usecols=['device_id','timestamp'])
f4['hour'] = pd.to_datetime(f4.timestamp).dt.hour
f4.hour = f4.hour.astype(np.str) + 'h'
f4.drop('timestamp',axis=1, inplace=True)

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"
f4.columns.values[1] = "feature"

FLS = pd.concat((f1, f2, f3, f4), axis=0, ignore_index=True)


###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)

##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)

print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 40, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('fine_tune.gz', index=True,
              index_label='device_id', compression="gzip")

# Read App Events
# Read Events
# Read Phone Brand
# Generate Train and Test
# User-Item-Feature
# Feature Selection




# Num of Features:  4828
[0]	train-mlogloss:2.41089	eval-mlogloss:2.4205
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[1]	train-mlogloss:2.37573	eval-mlogloss:2.39159
[2]	train-mlogloss:2.35057	eval-mlogloss:2.37176
[3]	train-mlogloss:2.33073	eval-mlogloss:2.35667
[4]	train-mlogloss:2.31434	eval-mlogloss:2.34457
[5]	train-mlogloss:2.30046	eval-mlogloss:2.33463
[6]	train-mlogloss:2.2885	eval-mlogloss:2.32631
[7]	train-mlogloss:2.27808	eval-mlogloss:2.31928
[8]	train-mlogloss:2.26891	eval-mlogloss:2.31329
[9]	train-mlogloss:2.26079	eval-mlogloss:2.30815
[10]	train-mlogloss:2.25354	eval-mlogloss:2.30372
[11]	train-mlogloss:2.24704	eval-mlogloss:2.29987
[12]	train-mlogloss:2.24118	eval-mlogloss:2.29654
[13]	train-mlogloss:2.23588	eval-mlogloss:2.29363
[14]	train-mlogloss:2.23107	eval-mlogloss:2.29109
[15]	train-mlogloss:2.22668	eval-mlogloss:2.28887
[16]	train-mlogloss:2.22267	eval-mlo

In [12]:
X_train

<67180x4828 sparse matrix of type '<class 'numpy.float64'>'
	with 1053878 stored elements in Compressed Sparse Row format>

In [14]:
Y.shape

(74645,)

In [19]:
y_train

array([6, 0, 9, ..., 0, 6, 9], dtype=int64)

In [16]:
y_train.shape

(67180,)

In [2]:
pwd

'C:\\Users\\User\\Documents\\python_file\\kaggle\\Talking Data Mobile'

## parameter tuning

In [10]:
params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 5,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 5,
    
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 25, evals=watchlist,
                 verbose_eval=True)

print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 25, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('fine_tune.gz', index=True,
              index_label='device_id', compression="gzip")

[0]	train-mlogloss:2.41716	eval-mlogloss:2.41884
[1]	train-mlogloss:2.38562	eval-mlogloss:2.38765
[2]	train-mlogloss:2.36335	eval-mlogloss:2.36553
[3]	train-mlogloss:2.34597	eval-mlogloss:2.34821
[4]	train-mlogloss:2.3317	eval-mlogloss:2.33394
[5]	train-mlogloss:2.31967	eval-mlogloss:2.32188
[6]	train-mlogloss:2.30936	eval-mlogloss:2.31155
[7]	train-mlogloss:2.30042	eval-mlogloss:2.30259
[8]	train-mlogloss:2.29258	eval-mlogloss:2.29475
[9]	train-mlogloss:2.28566	eval-mlogloss:2.28784
[10]	train-mlogloss:2.27952	eval-mlogloss:2.2817
[11]	train-mlogloss:2.27403	eval-mlogloss:2.27623
[12]	train-mlogloss:2.26911	eval-mlogloss:2.27132
[13]	train-mlogloss:2.26468	eval-mlogloss:2.26691
[14]	train-mlogloss:2.26067	eval-mlogloss:2.26291
[15]	train-mlogloss:2.25703	eval-mlogloss:2.25929
[16]	train-mlogloss:2.25372	eval-mlogloss:2.25599
[17]	train-mlogloss:2.2507	eval-mlogloss:2.25298
[18]	train-mlogloss:2.24793	eval-mlogloss:2.25022
[19]	train-mlogloss:2.24539	eval-mlogloss:2.24769
[20]	train-ml

## use randomforest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

In [18]:
rf = RandomForestClassifier(n_estimators=200, max_depth=8)
rf.fit(X_train, y_train)
pred = rf.predict_proba(X_val)
print(log_loss(y_val, pred))

2.38169928051


In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
pred = rf.predict_proba(X_val)
print(log_loss(y_val, pred))

2.38169928051


In [22]:
from sklearn.cross_validation import cross_val_score

In [23]:
cross_val_score(rf, train_sp, Y, cv=10,scoring='log_loss')

array([-2.40303731, -2.39014326, -2.38879333, -2.38854669, -2.37988848,
       -2.37984983, -2.37470023, -2.36904485, -2.35956989, -2.36430039])