In [27]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
import time
from sklearn.metrics import log_loss
import os
# Directory that holds the competition CSV files. Set the TALKING_DATA_DIR
# environment variable to run the notebook on another machine; the original
# author's location is kept as the fallback so existing setups are unaffected.
data_dir = os.environ.get(
    'TALKING_DATA_DIR',
    '/home/satyendra/sessions/kaggle/data/talking_data',
)
# Device table; the cells below use its device_id, phone_brand and
# device_model columns.
device_file = 'phone_brand_device_model.csv'

In [18]:
def map_column(table, f):
    """Return a copy of ``table`` with column ``f`` label-encoded as integers.

    The distinct non-missing values of ``table[f]`` are sorted and numbered
    ``0 .. k-1`` in that order; every value is then replaced by its number.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame containing column ``f``. It is not modified.
    f : hashable
        Label of the column to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order, with ``f`` holding
        the codes. Missing values in ``f`` are left missing.
    """
    # Missing values are excluded from the label set so that sorted() never
    # has to compare NaN/None with real labels.
    labels = sorted(table[f].dropna().unique())
    mappings = {label: code for code, label in enumerate(labels)}
    # Series.map performs one dict lookup per row and yields a numeric column
    # directly. DataFrame.replace with a large nested dict is far slower and
    # only ends up numeric through object-dtype downcasting.
    encoded = table.copy()
    encoded[f] = table[f].map(mappings)
    return encoded

In [19]:
def read_train_test(base_dir=None):
    """Load the competition CSVs and build per-device train and test tables.

    Reads ``app_events.csv``, ``events.csv``, ``phone_brand_device_model.csv``,
    ``gender_age_train.csv`` and ``gender_age_test.csv`` and left-joins three
    device-level tables onto the train and test rows:

    * the device table with ``phone_brand`` and ``device_model`` label-encoded
      by ``map_column`` (first row kept per ``device_id``),
    * ``counts``     - number of events recorded for the device,
    * ``appcounts1`` - sum over the device's events of the per-event app-row
      count (events without app rows contribute 0).

    Values still missing after the joins are filled with -1.

    Parameters
    ----------
    base_dir : str, optional
        Directory containing the CSV files. Defaults to the module-level
        ``data_dir``.

    Returns
    -------
    train : pandas.DataFrame
        Training rows with ``group`` label-encoded, ``age`` and ``gender``
        removed, and the device-level columns attached.
    test : pandas.DataFrame
        Test rows with the same device-level columns attached.
    features : list
        Column names of ``test`` except ``device_id``.
    """
    if base_dir is None:
        base_dir = data_dir

    def _load(filename):
        # device_id is requested as text in every file so that all merge keys
        # share one dtype. The builtin ``str`` replaces the ``np.str`` alias,
        # which newer NumPy releases no longer provide.
        return pd.read_csv(os.path.join(base_dir, filename), dtype={'device_id': str})

    # App
    print('Read apps...')
    app = _load("app_events.csv")
    app['appcounts'] = app.groupby(['event_id'])['app_id'].transform('count')
    app_small = app[['event_id', 'appcounts']].drop_duplicates('event_id', keep='first')

    # Events
    print('Read events...')
    events = _load("events.csv")
    events['counts'] = events.groupby(['device_id'])['event_id'].transform('count')
    events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first')
    # Merge on the key column only: combining ``on`` with ``left_index=True``
    # is rejected by current pandas.
    e1 = pd.merge(events, app_small, how='left', on='event_id')
    # Events with no rows in app_events get an app count of 0.
    e1['appcounts'] = e1['appcounts'].fillna(0)
    e1['appcounts1'] = e1.groupby(['device_id'])['appcounts'].transform('sum')
    e1_small = e1[['device_id', 'appcounts1']].drop_duplicates('device_id', keep='first')

    # Phone brand
    print('Read brands...')
    pbd = _load("phone_brand_device_model.csv").drop_duplicates('device_id', keep='first')
    pbd = map_column(pbd, 'phone_brand')
    pbd = map_column(pbd, 'device_model')

    def _with_device_features(frame):
        # The same three left joins are applied to train and test so both end
        # up with identical feature columns; unmatched devices are filled with -1.
        for extra in (pbd, events_small, e1_small):
            frame = pd.merge(frame, extra, how='left', on='device_id')
        return frame.fillna(-1)

    # Train
    print('Read train...')
    train = _load("gender_age_train.csv")
    train = map_column(train, 'group')
    train = train.drop(['age', 'gender'], axis=1)
    train = _with_device_features(train)

    # Test
    print('Read test...')
    test = _with_device_features(_load("gender_age_test.csv"))

    # Features
    features = [column for column in test.columns.values if column != 'device_id']

    return train, test, features

In [20]:
# Build the train/test frames and the list of feature column names.
# read_train_test() reads the CSV files under data_dir and prints a progress
# line before each file group.
train, test, features = read_train_test()

Read apps...
Read events...
Read brands...
Read train...
Read test...


In [24]:
# Stand-alone copy of the encoded device table: device_id is read as text, the
# first row per device_id is kept, and brand/model are label-encoded.
# The builtin ``str`` replaces the ``np.str`` alias, which newer NumPy releases
# no longer provide.
pbd = pd.read_csv(os.path.join(data_dir, "phone_brand_device_model.csv"), dtype={'device_id': str})
# Reassign instead of inplace=True so the cell reads as one explicit pipeline.
pbd = pbd.drop_duplicates('device_id', keep='first')
pbd = map_column(pbd, 'phone_brand')
pbd = map_column(pbd, 'device_model')

In [28]:
# Load the raw device table again for the bag-of-words features below.
# No dtype override is given here, so device_id is left to pandas' type
# inference, and duplicate device_id rows are not removed in this copy.
device_df = pd.read_csv(os.path.join(data_dir, device_file))

In [30]:
# Fuse brand and model into one space-free string per row. pop() hands back
# each source column and removes it from device_df in the same step, so only
# the remaining columns plus the new 'phone' column are left afterwards.
device_df['phone'] = (
    device_df.pop('phone_brand') + device_df.pop('device_model')
).map(lambda name: name.replace(' ', ''))

In [32]:
# Token-count encoder for the combined 'phone' strings.
# min_df=1 applies no minimum document-frequency cutoff: a term is kept even
# if it appears in a single row (presumably also the library default - confirm).
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)

In [33]:
# Learn the vocabulary from the 'phone' strings and encode every row as token
# counts. X is turned into a dense array with .toarray() in a later cell, so it
# is presumably a SciPy sparse matrix at this point.
X = vectorizer.fit_transform(device_df['phone'])

In [48]:
# Wrap the token counts in a DataFrame with one column per vocabulary term.
# Newer scikit-learn releases expose the vocabulary as get_feature_names_out()
# and no longer provide get_feature_names(); use whichever one this
# installation has so the cell runs on both.
# NOTE: X.toarray() materialises the full dense matrix in memory.
df1 = pd.DataFrame(
    X.toarray(),
    columns=(
        getattr(vectorizer, 'get_feature_names_out', None)
        or vectorizer.get_feature_names
    )(),
)

In [36]:
# The raw 'phone' text has already been encoded into X, so remove it from
# device_df before the count columns are joined on.
del device_df['phone']

In [47]:
# Put the token-count columns next to the remaining device_df columns.
# axis=1 concatenation aligns rows on index labels - this assumes device_df and
# df1 carry the same default index (TODO confirm).
# NOTE(review): the saved preview below shows integer column labels rather than
# vocabulary terms, which suggests this cell last ran against a df1 built
# without named columns; re-run it after the df1 cell.
df2 = pd.concat([device_df,df1],axis=1)

Unnamed: 0,device_id,0,1,2,3,4,5,6,7,8,...,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680
0,-8890648629457979026,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1277779817574759137,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5137427614288105724,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3669464369358936369,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-5019277647504317457,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,3238009352149731868,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,-3883532755183027260,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,-2972199645857147708,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,-5827952925479472594,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,-8262508968076336275,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Release the name df1 (the dense token-count frame); the combined data stays
# available as df2.
df1 = None