# Common import statements
For different splits go to the respective sections.

In [1]:
import pandas as pd
from pandas import DataFrame as df
import IPython as ip
import re
import numpy as np
import time

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
# Set input dir
import sys
sys.path.append("/home/mschlupp/pythonTools")
files_dir = "files/"

## Split data into `hasEvents` and `noEvents`

In [105]:
ga_full = pd.read_csv(files_dir+"traintest_fullevt.csv")

In [10]:
ga_hasEvts = ga_full[ga_full.hasEvents>0]

In [9]:
ga_noEvts = ga_full[ga_full.hasEvents==0]

In [13]:
ga_noEvts = ga_noEvts.drop(["hasEvents", "nEvts", "longitude_mean"
                , "longitude_variance", "latitude_mean"
                , "latitude_variance", "usageTime_mean"
                , "usageTime_variance", "usageDay_mean"
                ,"usageDay_variance"], axis=1)

In [14]:
ga_noEvts.to_csv("files/ga_noEvts.csv", index=False)

In [15]:
ga_hasEvts.to_csv("files/ga_hasEvts.csv", index=False)

In [17]:
%ls files/

app_events.csv              label_categories.csv
app_labels.csv              phone_brand_device_model.csv
apps_labels_categories.csv  phone_brand_device_model_engl.csv
events.csv                  sample_submission.csv
events_day_hour.csv         traintest_fullevt.csv
ga_hasEvts.csv              traintest_phone.csv
ga_noEvts.csv               traintest_phone_day_hour.csv
gender_age_test.csv         traintest_phone_evts.csv
gender_age_train.csv


## Split events data in ```hasApp``` and ```noApp```

In [7]:
evts= pd.read_csv("files/events_day_hour.csv", index_col=0)

In [6]:
ga_evts = pd.read_csv("files/ga_hasEvts.csv")

There are actually devide_ids that are not in the training and test set, but present in the events data. First get rid of those.

Eg:

In [9]:
-8295895346874348777 in ga_evts.device_id.values

False

In [151]:
# this is not very fast!
#ll = evts.device_id.map(lambda x: x in ga_evts.device_id.values)
#ll.value_counts()

#True     3160417
#False      92533
#Name: device_id, dtype: int64

In [153]:
# ask if the evts.device_id is in ga_evts
evts=evts.drop(["timestamp"],axis=1)
evts=evts[list(map(lambda x: x in ga_evts.device_id.values,evts.device_id))]

In [155]:
len(evts)

3160417

In [157]:
evts.to_csv("files/evts_ga_aligned_ids.csv", index=False)

Ok now all `device_id` entries are present in the train and test samples.

In [64]:
#read file in chunks, because else we run in memory troubles
size=50000
evts_chunks= pd.read_csv("files/evts_ga_aligned_ids.csv", chunksize=size)

In [65]:
evts = pd.DataFrame()

In [66]:
# try to be faster
# that's so much faster!!
start=time.time()
for i,chunk in enumerate(evts_chunks): 
    chunk = chunk.merge(ga_evts[["device_id","isTrain"]],on="device_id", how="left")
    #print(type(l), "\n", l)
    if i % 10 == 0:
        t = time.time()
        print((i+1)*size/1e3, "k events processed! Took: ", (t-start)/60., " minutes.")
    if i==0:
        evts = chunk 
    else:
        evts = pd.concat([evts,chunk], ignore_index=True)
        
print("\n\n done in ", (time.time()-start)/60., " minutes")

50.0 k events processed! Took:  0.0014940301577250163  minutes.
550.0 k events processed! Took:  0.020696655909220377  minutes.
1050.0 k events processed! Took:  0.5695470650990804  minutes.
1550.0 k events processed! Took:  0.6027606805165608  minutes.
2050.0 k events processed! Took:  0.6420061270395915  minutes.
2550.0 k events processed! Took:  0.688981544971466  minutes.
3050.0 k events processed! Took:  0.74254203637441  minutes.


 done in  0.7639761527379354  minutes


In [73]:
# take device with highest amount of events occurrances
evts.device_id.value_counts().head(1)

1186608308763918427    33426
Name: device_id, dtype: int64

In [77]:
# test if the merge was successful
print(ga_evts.isTrain[ga_evts.device_id==1186608308763918427])
evts[evts.device_id==1186608308763918427].isTrain.value_counts()

55072    0
Name: isTrain, dtype: int64


0    33426
Name: isTrain, dtype: int64

Ok this worked.
Event for the device with the highest number of event occurrences

In [32]:
# this got a bit more complicated than necessary due to missing device ids

# add training flag and write output to file
start=time.time()
for i,chunk in enumerate(evts_chunks): 
    l=list()
    for x in chunk.device_id.values:
        l.append(ga_evts[ga_evts.device_id==x].isTrain.values[0])    
    chunk["isTrain"]=l
    #print(type(l), "\n", l)
    if i % 10 == 0:
        t = time.time()
        print(i*size/1e3, "k events processed! Took: ", (t-start)/60., " minutes.")
    if i==0:
        evts = chunk #.to_csv("files/finalSets/evts_withTrain.csv", mode='w')
    else:
        evts = evts.append(chunk, ignore_index=True) #.to_csv("files/finalSets/evts_withTrain.csv", mode='a',header=False)
        
print("\n\n done in ", (time.time()-s)/60., " minutes")
# horribly slow: about 50 minutes

0.0 k events processed! Took:  0.7183468858400981  minutes.


KeyboardInterrupt: 

## We start to split data

In [2]:
# let'test our implementation with 1k observations before processing all data
evts = pd.read_csv("files/finalSets/evts_withTrain.csv")

In [4]:
len(evts)

3161049

In [90]:
pd.DataFrame(columns=evts.columns)

Unnamed: 0,event_id,device_id,longitude,latitude,day,time,hour,usageDay,isTrain


In [87]:
pd.concat([evts,pd.DataFrame([1,2,3,4,5,6,7,8,9])], axis=1)

Unnamed: 0,event_id,device_id,longitude,latitude,day,time,hour,usageDay,isTrain,0
0,,,,,,,,,,1
1,,,,,,,,,,2
2,,,,,,,,,,3
3,,,,,,,,,,4
4,,,,,,,,,,5
5,,,,,,,,,,6
6,,,,,,,,,,7
7,,,,,,,,,,8
8,,,,,,,,,,9


In [73]:
df

Unnamed: 0,a,b
0,1,3
1,2,2
2,3,1


In [51]:
# now we need the event ids from the apps data
app_evt_id = pd.read_csv("files/app_events.csv", usecols=["event_id"]
                         , dtype=np.float64)

In [60]:
app_evt_id = app_evt_id.event_id.unique()

In [26]:
evts["hasApp"] = 0

In [56]:
evts.event_id = pd.to_numeric(evts.event_id)

  if __name__ == '__main__':


In [65]:
list(map(lambda x: x in app_evt_id, evts.event_id.values))

KeyboardInterrupt: 

In [66]:
import time
s = time.time
evts.hasApp = [1 if x in app_evt_id else 0 for x in evts.event_id.values]
print("took:", (time.time-s)/60, " minutes")

KeyboardInterrupt: 

In [None]:
print("a")

In [34]:
evts.hasApp.loc[list(map(lambda x: x in app_evt_id, evts.event_id))] = 1

  if __name__ == '__main__':


In [35]:
evts.hasApp.value_counts()

0    3161049
Name: hasApp, dtype: int64