In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import random
import time
import os
import re
from sklearn.metrics import log_loss
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt 
from IPython.display import display
%matplotlib inline

ImportError: No module named geopy.distance

In [36]:
import logging
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas' SettingWithCopyWarning and
# deprecation notices raised by later cells — confirm that is intended.
warnings.filterwarnings('ignore')
# Let INFO-level records through the root logger.
logging.getLogger().setLevel(logging.INFO)

# NOTE(review): `tensorflow.contrib` presumably ties this notebook to
# TensorFlow 1.x — confirm the target version.
from tensorflow.contrib.learn.python.learn.estimators import run_config
# Estimator run configuration. `config_` is not referenced again in the
# visible part of this notebook.
config_  = run_config.RunConfig(num_cores=12, allow_soft_placement= True)

## Load the prepared static-feature data

In [2]:
# Load the pre-computed static (per-device) feature tables.
# device_id is read as text so the identifiers are never parsed as numbers.
# The builtin `str` is used instead of the `np.str` alias: the alias was
# only ever another name for `str` and has been removed from NumPy.
train_ = pd.read_csv('train.csv', dtype={'device_id': str})
eval_ = pd.read_csv('eval.csv', dtype={'device_id': str})
test = pd.read_csv('test.csv', dtype={'device_id': str})

In [3]:
# Stack the train and eval static-feature tables into a single frame.
# pd.concat is the supported replacement for DataFrame.append (removed in
# pandas 2.0); like append, it keeps each input's original row index.
train_df = pd.concat([train_, eval_])
# Number of distinct devices, then a transposed preview (one row per column).
print(train_df.device_id.unique().shape)
display(train_df.head().T)

(74645,)


Unnamed: 0,0,1,2,3,4
5_1_day,1,0,0,0,0
5_2_day,0,0,0,0,0
5_3_day,0,1,0,1,1
5_4_day,0,0,1,1,0
5_5_day,0,0,0,0,0
5_6_day,0,0,0,1,0
5_7_day,0,0,0,2,0
H0_to_2,0,0,0,0,0
H0_to_6,0,0,0,0,1
H0_to_9,0,1,0,1,1


## prepare time-series features

In [6]:

# Load every file found under ../csvs/ into a dict keyed by bare file name.
# NOTE: files with the same name in different sub-directories overwrite
# each other, because the key is the file name only.

# Identifier columns are read as text; the builtin `str` replaces the
# removed `np.str` alias (they were the same type).
id_dtypes = {'device_id': str,
             'app_id': str,
             'event_id': str,
             'label_id': str}

fdics = dict()
for dirpath, dirnames, filenames in os.walk('../csvs/'):
    for f in filenames:
        path = os.path.join(dirpath, f)
        try:
            fdics[f] = pd.read_csv(path, dtype=id_dtypes)
        except Exception as err:
            # Best-effort fallback: retry with pandas' inferred dtypes, but
            # report why the typed read failed instead of hiding it.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            print('%s (typed read failed: %r)' % (f, err))
            fdics[f] = pd.read_csv(path)

In [7]:
# read data
events = fdics['events.csv']
ga_train = fdics['gender_age_train.csv']
ga_test  = fdics['gender_age_test.csv']

# merge tran and test first
ga_train.drop(['gender', 'age', 'group'], axis=1, inplace=True)
ga_train_test = ga_train.append(ga_test)

# merge with events table
events_ta_te = pd.merge(ga_train_test, events, how='left', on='device_id')

# fill nan values

events_ta_te['event_id'] = events_ta_te.event_id.fillna('-1')
times = events_ta_te.timestamp.dropna()
events_ta_te['timestamp'] = events_ta_te.timestamp.apply(lambda x: x if not pd.isnull(x) 
                                                         else np.random.choice(times))
events_ta_te['longitude'] = events_ta_te.longitude.fillna(0)
events_ta_te['latitude'] = events_ta_te.latitude.fillna(0)

df_gps_raw = events_ta_te[['longitude', 'latitude']]
df_gps = df_gps_raw.drop_duplicates()

# define the number of kilometers in one radian
kms_per_radian = 6371.0088

# define eps radians for both 20km, 35km per radians resolution
eps_rad_35, eps_rad_20 = 35 / kms_per_radian, 20 / kms_per_radian

# convert long/lat to radians
coords = df_gps.as_matrix(columns=['latitude', 'longitude']) 

# fit with DBSCAN estimator
db_20 = DBSCAN(eps=eps_rad_20, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
db_35 = DBSCAN(eps=eps_rad_35, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    
# assign labels
df_gps['geo_label_20'] = db_20.labels_
df_gps['geo_label_35'] = db_35.labels_


# merge back to events
events_ta_te = pd.merge(events_ta_te, df_gps, on=['longitude', 'latitude'], how='left')
events_ta_te.drop(['longitude', 'latitude'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
# Preview the first three rows of the event table after geo-cluster labelling.
display(events_ta_te.head(3))

Unnamed: 0,device_id,event_id,timestamp,geo_label_20,geo_label_35
0,-8076087639492063270,-1,2016-05-05 03:26:51,0,0
1,-2897161552818060146,-1,2016-05-01 06:53:50,0,0
2,-8260683887967679142,2479656,2016-05-01 14:23:37,0,0


## prepare app time series

In [10]:
# Pair every label category with the apps that carry it
# (one row per label/app combination).
apps = pd.merge(fdics['label_categories.csv'], fdics['app_labels.csv'], how='left', on='label_id')
# Labels that matched no app have no app_id after the left join; drop them.
apps.dropna(subset=['app_id'], inplace=True)
# Replace spaces so category names can be used as column names.
# The vectorised .str accessor also tolerates a missing category value.
apps['category'] = apps.category.str.replace(' ', '_')

# Full list of category names; flattern_cate reads this module-level name.
# (The former `global apps_cates` statement was removed: at module level it
# has no effect, and Python 3.6+ rejects a `global` declaration that follows
# an assignment to the same name.)
apps_cates = apps.category.unique().tolist()

def flattern_cate(df, categories=None):
    """Count how often each category occurs in one group of rows.

    Parameters
    ----------
    df : DataFrame
        Rows of a single group; must have a ``category`` column.
    categories : iterable of str, optional
        Category names that must appear in the result even when absent
        from ``df`` (they get a count of 0). Defaults to the module-level
        ``apps_cates`` list, which is what ``groupby(...).apply`` relies on.

    Returns
    -------
    Series
        Indexed by category name, holding the occurrence count. Categories
        present in ``df`` but not in ``categories`` are included as well.
    """
    if categories is None:
        categories = apps_cates
    counts = {col: 0 for col in categories}
    # Series.items() replaces iteritems(), which newer pandas no longer has.
    for cat, n in df.category.value_counts().items():
        counts[cat] = n
    return pd.Series(counts)

# One row per app, one count column per category.
apps_gp_appid = apps.groupby('app_id')
apps_flatteren = apps_gp_appid.apply(flattern_cate)

#  Variance category features selection
# NOTE(review): p * (1 - p) is the variance of a 0/1 feature, but these
# columns are counts — confirm the threshold is meant this way.
threshold = .95 * (1 - .95)
sel = VarianceThreshold(threshold=(threshold))
sel_ = sel.fit(apps_flatteren)

# select cols which are above the threshold
# .loc with the boolean support mask replaces the removed .ix indexer, and
# the explicit copy avoids SettingWithCopyWarning when app_id is added below.
apps_cate_reduced = apps_flatteren.loc[:, sel_.get_support()].copy()
apps_cates = apps_cate_reduced.columns.tolist()
apps_cate_reduced['app_id'] = apps_cate_reduced.index
# Drop the app_id index now that it is a regular column: newer pandas
# refuses to merge on a name that is both an index level and a column.
apps_cate_reduced = apps_cate_reduced.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## merge events and apps 

In [14]:
# merge everything
apps = pd.merge(fdics['app_events.csv'], apps_cate_reduced, on='app_id', how='left')

print('events shape before merge: ', events_ta_te.shape)
events_ta_te = pd.merge(events_ta_te, apps, on='event_id', how='left')
print('events shape after merge: ', events_ta_te.shape)

('events shape before merge: ', (3288630, 8))
('events shape after merge: ', (33642753, 25))


### convert timestamps  

In [11]:
# Parse the timestamp strings once, then derive calendar features with the
# vectorised .dt accessor rather than a Python lambda per row.
events_ta_te['timestamp'] = pd.to_datetime(events_ta_te['timestamp'])
# Monday == 0 ... Sunday == 6, the same convention as Timestamp.weekday()
events_ta_te['day_of_week'] = events_ta_te['timestamp'].dt.weekday
events_ta_te['hour'] = events_ta_te['timestamp'].dt.hour
# plain datetime.date objects, as before
events_ta_te['date'] = events_ta_te['timestamp'].dt.date

In [12]:
# Transposed preview: one output row per column of the event table.
events_ta_te.head().T

Unnamed: 0,0,1,2,3,4
device_id,-8076087639492063270,-2897161552818060146,-8260683887967679142,-4938849341048082022,245133531816851882
event_id,-1,-1,2479656,-1,-1
timestamp,2016-05-05 03:26:51,2016-05-01 06:53:50,2016-05-01 14:23:37,2016-05-03 22:44:14,2016-05-04 14:32:00
geo_label_20,0,0,0,0,0
geo_label_35,0,0,0,0,0
day_of_week,3,6,6,1,2
hour,3,6,14,22,14
date,2016-05-05,2016-05-01,2016-05-01,2016-05-03,2016-05-04


In [15]:
# List the column names of the event table.
events_ta_te.columns.values

array(['device_id', 'event_id', 'timestamp', 'geo_label_20',
       'geo_label_35', 'day_of_week', 'hour', 'date', 'app_id',
       'is_installed', 'is_active', '1_free', 'Cards_RPG',
       'Casual_puzzle_categories', 'Custom_label', 'Industry_tag',
       'Irritation_/_Fun_1', 'Personal_Effectiveness_1',
       'Property_Industry_2.0', 'Property_Industry_new', 'Relatives_1',
       'Services_1', 'Tencent', 'game', 'unknown'], dtype=object)

### prepare events_ta_te for merging 

In [17]:
# Category-count columns that came in through the app merge.
# train_df has columns with these same names, so the event-level versions
# get a 'TS_' prefix — presumably to keep the two sets apart when the
# tables are merged on device_id later.
app_cats_ts = [
    '1_free',
    'Cards_RPG',
    'Casual_puzzle_categories',
    'Custom_label',
    'Industry_tag',
    'Irritation_/_Fun_1',
    'Personal_Effectiveness_1',
    'Property_Industry_2.0',
    'Property_Industry_new',
    'Relatives_1',
    'Services_1',
    'Tencent',
    'game',
    'unknown',
]
app_cats_ts_new = ['TS_' + name for name in app_cats_ts]
events_ta_te.rename(columns=dict(zip(app_cats_ts, app_cats_ts_new)), inplace=True)

# These columns are not carried into the merged training table.
events_ta_te.drop(['event_id', 'timestamp', 'app_id'], axis=1, inplace=True)

In [18]:
# Shape of the event table and the shape of its distinct device_id array,
# followed by the current column names.
print(events_ta_te.shape, events_ta_te.device_id.unique().shape)
events_ta_te.columns.values



((33642753, 23), (186716,))


array(['device_id', 'geo_label_20', 'geo_label_35', 'day_of_week', 'hour',
       'date', 'app_id', 'is_installed', 'is_active', 'TS_1_free',
       'TS_Cards_RPG', 'TS_Casual_puzzle_categories', 'TS_Custom_label',
       'TS_Industry_tag', 'TS_Irritation_/_Fun_1',
       'TS_Personal_Effectiveness_1', 'TS_Property_Industry_2.0',
       'TS_Property_Industry_new', 'TS_Relatives_1', 'TS_Services_1',
       'TS_Tencent', 'TS_game', 'TS_unknown'], dtype=object)

## merge time-series with static features 

In [21]:
# Column names of the static-feature table.
train_df.columns.values

array(['5_1_day', '5_2_day', '5_3_day', '5_4_day', '5_5_day', '5_6_day',
       '5_7_day', 'H0_to_2', 'H0_to_6', 'H0_to_9', 'H12_to_14',
       'H14_to_18', 'H18_to_20', 'H18_to_23', 'H19_to_23', 'H1_to_5',
       'H20_to_22', 'H2_to_6', 'H4_to_8', 'H6_to_9', 'H8_to_10',
       'H9_to_12', 'H9_to_14', 'H9_to_18', 'app_active_count', 'app_count',
       'avg_events_perday', 'event_count', 'geo_20_1_count',
       'geo_20_2_count', 'geo_20_3_count', 'geo_35_1_count',
       'geo_35_2_count', 'geo_35_3_count', 'geo_count_51holiday',
       'geo_count_None_51holiday', 'group', 'holiday_51_count',
       'hour_dt_count', 'hour_nt_count', 'weekday_count', 'weekend_count',
       '1_free', 'Cards_RPG', 'Casual_puzzle_categories', 'Custom_label',
       'Industry_tag', 'Irritation_/_Fun_1', 'Personal_Effectiveness_1',
       'Property_Industry_2.0', 'Property_Industry_new', 'Relatives_1',
       'Services_1', 'Tencent', 'game', 'unknown', 'device_id',
       'phone_brand', 'device_model', 'dt_

In [24]:
# (rows, columns) of the static-feature table.
train_df.shape

(74839, 63)

In [26]:
# Split the event rows by which static-feature table contains the device.
events_train = events_ta_te.loc[events_ta_te['device_id'].isin(train_df['device_id'].unique())]
events_test = events_ta_te.loc[events_ta_te['device_id'].isin(test['device_id'].unique())]

# Row/column counts and distinct-device counts of both partitions.
print(events_train.shape, events_train['device_id'].unique().shape)
print(events_test.shape, events_test['device_id'].unique().shape)


((12947753, 22), (74645,))
((20695000, 22), (112071,))


In [27]:
# Attach the static per-device features to every event row.
# NOTE(review): in the recorded run the merged train table has more rows
# (12,987,131) than events_train (12,947,753), and train_df has 74,839 rows
# for 74,645 distinct device_ids. Repeated device_id rows in the static
# table therefore appear to multiply event rows — confirm this is wanted.
train_merged = events_train.merge(train_df, how='left', on='device_id')
test_merged = events_test.merge(test, how='left', on='device_id')

# Shapes and distinct-device counts after the join.
print(train_merged.shape, train_merged['device_id'].unique().shape)
print(test_merged.shape, test_merged['device_id'].unique().shape)

((12987131, 84), (74645,))
((20794110, 84), (112071,))


In [30]:
# Missing values per column in the train partition of the event table.
events_train.isnull().sum()

device_id                           0
geo_label_20                        0
geo_label_35                        0
day_of_week                         0
hour                                0
date                                0
is_installed                   710556
is_active                      710556
TS_1_free                      710556
TS_Cards_RPG                   710556
TS_Casual_puzzle_categories    710556
TS_Custom_label                710556
TS_Industry_tag                710556
TS_Irritation_/_Fun_1          710556
TS_Personal_Effectiveness_1    710556
TS_Property_Industry_2.0       710556
TS_Property_Industry_new       710556
TS_Relatives_1                 710556
TS_Services_1                  710556
TS_Tencent                     710556
TS_game                        710556
TS_unknown                     710556
dtype: int64

In [31]:
# Missing values per column in the static-feature table.
train_df.isnull().sum()

5_1_day                     0
5_2_day                     0
5_3_day                     0
5_4_day                     0
5_5_day                     0
5_6_day                     0
5_7_day                     0
H0_to_2                     0
H0_to_6                     0
H0_to_9                     0
H12_to_14                   0
H14_to_18                   0
H18_to_20                   0
H18_to_23                   0
H19_to_23                   0
H1_to_5                     0
H20_to_22                   0
H2_to_6                     0
H4_to_8                     0
H6_to_9                     0
H8_to_10                    0
H9_to_12                    0
H9_to_14                    0
H9_to_18                    0
app_active_count            0
app_count                   0
avg_events_perday           0
event_count                 0
geo_20_1_count              0
geo_20_2_count              0
                           ..
geo_35_3_count              0
geo_count_51holiday         0
geo_count_

In [29]:
# Missing values per column after merging events with static features (train).
train_merged.isnull().sum()

device_id                           0
geo_label_20                        0
geo_label_35                        0
day_of_week                         0
hour                                0
date                                0
is_installed                   713679
is_active                      713679
TS_1_free                      713679
TS_Cards_RPG                   713679
TS_Casual_puzzle_categories    713679
TS_Custom_label                713679
TS_Industry_tag                713679
TS_Irritation_/_Fun_1          713679
TS_Personal_Effectiveness_1    713679
TS_Property_Industry_2.0       713679
TS_Property_Industry_new       713679
TS_Relatives_1                 713679
TS_Services_1                  713679
TS_Tencent                     713679
TS_game                        713679
TS_unknown                     713679
5_1_day                             0
5_2_day                             0
5_3_day                             0
5_4_day                             0
5_5_day     

In [33]:
# Missing values per column after merging events with static features (test).
test_merged.isnull().sum()

device_id                             0
geo_label_20                          0
geo_label_35                          0
day_of_week                           0
hour                                  0
date                                  0
is_installed                    1140649
is_active                       1140649
TS_1_free                       1140649
TS_Cards_RPG                    1140649
TS_Casual_puzzle_categories     1140649
TS_Custom_label                 1140649
TS_Industry_tag                 1140649
TS_Irritation_/_Fun_1           1140649
TS_Personal_Effectiveness_1     1140649
TS_Property_Industry_2.0        1140649
TS_Property_Industry_new        1140649
TS_Relatives_1                  1140649
TS_Services_1                   1140649
TS_Tencent                      1140649
TS_game                         1140649
TS_unknown                      1140649
5_1_day                               0
5_2_day                               0
5_3_day                               0


In [34]:
# Use -1 as the missing-value sentinel in both merged tables. The null
# counts recorded above show gaps in is_installed, is_active and the TS_*
# category columns.
for merged_frame in (train_merged, test_merged):
    merged_frame.fillna(-1, inplace=True)
# The test table must not carry the `group` column.
test_merged.drop('group', axis=1, inplace=True)

### split train into train and eval

In [35]:
# Fixed seed so the shuffle, and therefore the train/eval split, is the
# same on every run.
SPLIT_SEED = 42
# Number of shuffled rows held out for evaluation.
EVAL_ROWS = 300000

shuffle_rng = np.random.RandomState(SPLIT_SEED)
train_merged = train_merged.iloc[shuffle_rng.permutation(len(train_merged))]

# NOTE(review): the split is by row, so events of one device can end up in
# both partitions — confirm that is acceptable for evaluation.
trai_ = train_merged.iloc[:-EVAL_ROWS, :]
eval_ = train_merged.iloc[-EVAL_ROWS:, :]

# NOTE(review): these are the same file names the notebook reads at the top
# as static-feature inputs, so after this cell a fresh run would load the
# merged tables instead — confirm the inputs are kept elsewhere.
trai_.to_csv('train.csv', index=False)
eval_.to_csv('eval.csv', index=False)
test_merged.to_csv('test.csv', index=False)

In [38]:
# (rows, columns) of the training partition that was written out.
trai_.shape

(12687131, 84)

In [39]:
# Keep the first row of every device in the static train table, then show
# how many rows remain and what they look like.
tt = train_.drop_duplicates(subset='device_id')
print(tt.shape)
tt.head()

(54703, 63)


Unnamed: 0,5_1_day,5_2_day,5_3_day,5_4_day,5_5_day,5_6_day,5_7_day,H0_to_2,H0_to_6,H0_to_9,...,Tencent,game,unknown,device_id,phone_brand,device_model,dt_count_51holiday,dt_count_None_51holiday,nt_count_51holiday,nt_count_None_51holiday
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,-1000369272589010951,vivo,Y17T,1,0,0,0
1,0,0,1,0,0,0,0,0,0,1,...,0,0,0,-1000572055892391496,OPPO,R819T,0,1,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,-1000643208750517791,金立,Others,0,0,0,1
3,0,0,1,1,0,1,2,0,0,1,...,0,0,2,-1001337759327042486,OPPO,A31,0,3,0,1
4,0,0,1,0,0,0,0,0,1,1,...,0,0,0,-1001949518704267063,OPPO,R1,0,0,0,1


## Sort train data by duplicates

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Reload the merged training table written earlier; device_id stays text.
# The builtin `str` replaces the removed `np.str` alias (same type).
train_df = pd.read_csv('train.csv', dtype={'device_id': str})

In [5]:
# Number the rows of each device 0, 1, 2, ... in their current order.
train_df['ids'] = train_df.groupby('device_id').cumcount()

In [9]:
# Spot-check the per-device row number at three arbitrary positions.
train_df[['device_id', 'ids']].iloc[[3000, 12000,12002],:]

Unnamed: 0,device_id,ids
3000,92253795511102805,2
12000,7826519609729655346,2
12002,5177144580224941136,0


In [14]:
# Order rows by their within-device row number: every device's first row
# comes before any device's second row, and so on.
train_df.sort_values('ids', inplace=True)

In [11]:
# Same three positions as the earlier spot check.
# NOTE(review): execution counts show this cell ran before the sort cell
# above it, so the recorded output may not reflect the current order.
train_df[['device_id', 'ids']].iloc[[3000, 12000,12002],:]

Unnamed: 0,device_id,ids
12548652,-1002969456091702673,243
10566732,-1021642161927895031,1486
10579129,-1021642161927895031,1488


In [16]:
# First 110 rows after sorting; the recorded rows all have ids == 0.
train_df[['device_id', 'ids']].head(110)

Unnamed: 0,device_id,ids
4215796,-1000369272589010951,0
6267858,-6564867613556609542,0
14636,-6564340457392363163,0
11587010,-6564313410201276271,0
159399,-6564307798349654262,0
705450,-6564134659504425442,0
32967,-6564115631890919343,0
2872628,-656407883286086774,0
221629,-6563968582368132659,0
10032,-6563927821739874072,0


In [19]:
# The helper column was only needed for ordering; remove it before saving.
del train_df['ids']

In [20]:
# Write the reordered training table to disk without the row index.
train_df.to_csv('train_sorted.csv', index=False)

In [25]:
import tensorflow as tf

ImportError: cannot import name pywrap_tensorflow