# Import libraries

In [1]:
import numpy as np
import pandas as pd

# app_labels

#### Read data

In [2]:
# Load the app -> label mapping and preview its first rows.
APP_LABELS_CSV = "../data/app_labels.csv"
df_app_label = pd.read_csv(APP_LABELS_CSV)
df_app_label.head()

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


#### Check duplicated data

In [3]:
# Report how many rows are exact repeats of an earlier row, then keep only the
# first occurrence of each row.
n_duplicated_rows = df_app_label.duplicated().sum()
print(n_duplicated_rows)
df_app_label = df_app_label.drop_duplicates(keep='first')
df_app_label.shape

491


(459452, 2)

#### Check missing data

In [4]:
# Per-column flag: True if the column contains at least one missing value.
df_app_label.isna().any(axis=0)

app_id      False
label_id    False
dtype: bool

# label_categories

#### Read data

In [5]:
# Load the label -> category lookup table; show its size and first rows.
LABEL_CATEGORIES_CSV = "../data/label_categories.csv"
df_label_categories = pd.read_csv(LABEL_CATEGORIES_CSV)
print(df_label_categories.shape)
df_label_categories.head()

(930, 2)


Unnamed: 0,label_id,category
0,1,
1,2,game-game type
2,3,game-Game themes
3,4,game-Art Style
4,5,game-Leisure time


#### Check duplicated data

In [6]:
# Count fully duplicated rows, then keep only the first occurrence of each.
n_duplicated_rows = df_label_categories.duplicated().sum()
print(n_duplicated_rows)
df_label_categories = df_label_categories.drop_duplicates(keep='first')
df_label_categories.shape

0


(930, 2)

#### Check missing data

In [7]:
# Show the rows whose category is missing, then discard every row that has a
# missing value in any column (dropna's default behaviour).
category_is_missing = df_label_categories['category'].isna()
print(df_label_categories[category_is_missing])
df_label_categories = df_label_categories.dropna()
df_label_categories.shape

     label_id category
0           1      NaN
229       248      NaN
245       264      NaN


(927, 2)

# Merge df_app_label and df_label_categories

In [8]:
# Attach the category name to each (app_id, label_id) row. The inner join keeps
# only label_ids that are present in both frames.
df_app = pd.merge(
    df_app_label,
    df_label_categories,
    on='label_id',
    how='inner',
)
print(df_app.shape)
df_app.head()

(459452, 3)


Unnamed: 0,app_id,label_id,category
0,7324884708820027918,251,Finance
1,-4494216993218550286,251,Finance
2,8756705988821000489,251,Finance
3,1061207043315821111,251,Finance
4,-1491198667294647703,251,Finance


#### Drop label_id

In [9]:
# label_id has served its purpose as the join key; keep app_id and category.
df_app = df_app.drop(columns=['label_id'])
df_app.head()

Unnamed: 0,app_id,category
0,7324884708820027918,Finance
1,-4494216993218550286,Finance
2,8756705988821000489,Finance
3,1061207043315821111,Finance
4,-1491198667294647703,Finance


#### One-hot encode category

In [10]:
# Expand the string `category` column into one indicator column per distinct
# category (named `category_<value>`). The numeric app_id column passes through
# unchanged.
df_app = pd.get_dummies(df_app, columns=['category'])
print(df_app.shape)
df_app.head()

(459452, 474)


Unnamed: 0,app_id,category_1 free,category_1 reputation,category_1 vitality,category_3 kindom game,category_80s Japanese comic,category_90s Japanese comic,category_A beauty care,category_A shares,category_ARPG,...,category_travel,category_tribe,category_trickery,category_unknown,category_video,category_violence comic,category_vitality,category_war chess,category_weibo,category_zombies game
0,7324884708820027918,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-4494216993218550286,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8756705988821000489,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1061207043315821111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1491198667294647703,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Groupby app_id and sum

In [11]:
# Collapse to one row per app: for each category column, sum the indicator
# values over all rows belonging to that app. app_id becomes the index.
df_app = df_app.groupby(by='app_id').sum()
print(df_app.shape)
df_app.head()

(113211, 473)


Unnamed: 0_level_0,category_1 free,category_1 reputation,category_1 vitality,category_3 kindom game,category_80s Japanese comic,category_90s Japanese comic,category_A beauty care,category_A shares,category_ARPG,category_Academic Information,...,category_travel,category_tribe,category_trickery,category_unknown,category_video,category_violence comic,category_vitality,category_war chess,category_weibo,category_zombies game
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223281467940916832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9222877069545393219,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9222785464897897681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9222198347540756780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9221970424041518544,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# app_events

#### Read data

In [13]:
# Load the per-event app records and preview the first rows.
APP_EVENTS_CSV = "../data/app_events.csv"
df_app_events = pd.read_csv(APP_EVENTS_CSV)
df_app_events.head()

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,1
1,2,-5720078949152207372,1,0
2,2,-1633887856876571208,1,0
3,2,-653184325010919369,1,1
4,2,8693964245073640147,1,1


#### Drop is_installed and filter by is_active

In [14]:
# Discard the is_installed column and keep only the rows where the app was
# active (is_active == 1) during the event.
df_app_events = (
    df_app_events
    .drop(columns=['is_installed'])
    .loc[lambda frame: frame['is_active'] == 1]
)
df_app_events.head()

Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,1
3,2,-653184325010919369,1
4,2,8693964245073640147,1
5,2,4775896950989639373,1
9,2,7167114343576723123,1


#### Merge on app_id

In [15]:
# Move app_id from the index back into a regular column so it can be used as
# the join key, then attach each app's category columns to its event rows.
df_app = df_app.reset_index()
df_app_events = pd.merge(df_app_events, df_app, how='inner', on='app_id')
print(df_app_events.shape)
df_app_events.head()

(12732996, 476)


Unnamed: 0,event_id,app_id,is_active,category_1 free,category_1 reputation,category_1 vitality,category_3 kindom game,category_80s Japanese comic,category_90s Japanese comic,category_A beauty care,...,category_travel,category_tribe,category_trickery,category_unknown,category_video,category_violence comic,category_vitality,category_war chess,category_weibo,category_zombies game
0,2,5927333115845830913,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,5927333115845830913,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30,5927333115845830913,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,39,5927333115845830913,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,44,5927333115845830913,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Drop app_id and is_active

In [16]:
# Only event_id and the category columns are needed from here on.
df_app_events = df_app_events.drop(columns=['is_active', 'app_id'])
df_app_events.head()

Unnamed: 0,event_id,category_1 free,category_1 reputation,category_1 vitality,category_3 kindom game,category_80s Japanese comic,category_90s Japanese comic,category_A beauty care,category_A shares,category_ARPG,...,category_travel,category_tribe,category_trickery,category_unknown,category_video,category_violence comic,category_vitality,category_war chess,category_weibo,category_zombies game
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,39,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,44,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Delete unused DataFrames

In [17]:
import gc

# Remove the names of the intermediate frames that are no longer used, then
# run a garbage-collection pass and print the number it returns.
del df_app_label, df_label_categories, df_app

n_collected = gc.collect()
print(n_collected)

208


#### Groupby event_id and sum

In [18]:
# Collapse to one row per event by summing every category column over the
# rows that share an event_id. event_id becomes the index.
df_app_events = df_app_events.groupby(by='event_id').sum()
df_app_events.shape

(1477059, 473)

# events

#### Read data

In [19]:
# Load the event log and preview the first rows.
EVENTS_CSV = "../data/events.csv"
df_events = pd.read_csv(EVENTS_CSV)
df_events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


#### Drop time and location

In [20]:
# Timestamp and coordinates are dropped here, leaving event_id and device_id.
unused_event_columns = ['timestamp', 'longitude', 'latitude']
df_events = df_events.drop(columns=unused_event_columns)
df_events.head()

Unnamed: 0,event_id,device_id
0,1,29182687948017175
1,2,-6401643145415154744
2,3,-4833982096941402721
3,4,-6815121365017318426
4,5,-5373797595892518570


#### Merge df_app_events on event_id

In [21]:
# Move event_id from the index back into a regular column so it can be used as
# the join key, then attach the per-event category sums to each event. The
# inner join keeps only event_ids present in both frames.
df_app_events = df_app_events.reset_index()
df_events = pd.merge(df_events, df_app_events, how='inner', on='event_id')
print(df_events.shape)
df_events.head()

(1477059, 475)


Unnamed: 0,event_id,device_id,category_1 free,category_1 reputation,category_1 vitality,category_3 kindom game,category_80s Japanese comic,category_90s Japanese comic,category_A beauty care,category_A shares,...,category_travel,category_tribe,category_trickery,category_unknown,category_video,category_violence comic,category_vitality,category_war chess,category_weibo,category_zombies game
0,2,-6401643145415154744,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,1476664663289716375,1,1,0,0,0,0,0,0,...,0,0,0,7,1,0,0,0,0,0
2,7,5990807147117726237,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,1,0
3,9,-2073340001552902943,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,16,9070651185984875886,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Drop event_id and groupby device_id

In [22]:
# Collapse to one row per device: drop event_id, then sum every category
# column over the rows that share a device_id. device_id becomes the index.
df_events = (
    df_events
    .drop(columns=['event_id'])
    .groupby('device_id')
    .sum()
)
print(df_events.shape)
df_events.head()

(60669, 473)


Unnamed: 0_level_0,category_1 free,category_1 reputation,category_1 vitality,category_3 kindom game,category_80s Japanese comic,category_90s Japanese comic,category_A beauty care,category_A shares,category_ARPG,category_Academic Information,...,category_travel,category_tribe,category_trickery,category_unknown,category_video,category_violence comic,category_vitality,category_war chess,category_weibo,category_zombies game
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222956879900151005,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,244.0,0.0,0.0,0.0,0.0,0.0,0.0
-9222661944218806987,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
-9222399302879214035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0
-9221825537663503111,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,20.0,45.0,0.0,0.0,0.0,0.0,0.0
-9221767098072603291,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0


#### Convert all non-zero counts to 1

In [23]:
# Turn the per-device category sums into presence flags: every non-zero entry
# becomes 1 and every zero stays 0.
#
# The comparison is done on the whole frame at once instead of calling a Python
# lambda per cell through DataFrame.applymap, which is slow on a frame this
# size and is deprecated since pandas 2.1. The values are the same as with the
# lambda: NaN compares as "not equal to 0" and therefore still maps to 1, and
# the result is cast to int64.
df_events = df_events.ne(0).astype('int64')
df_events.head()

Unnamed: 0_level_0,category_1 free,category_1 reputation,category_1 vitality,category_3 kindom game,category_80s Japanese comic,category_90s Japanese comic,category_A beauty care,category_A shares,category_ARPG,category_Academic Information,...,category_travel,category_tribe,category_trickery,category_unknown,category_video,category_violence comic,category_vitality,category_war chess,category_weibo,category_zombies game
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222956879900151005,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
-9222661944218806987,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
-9222399302879214035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
-9221825537663503111,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
-9221767098072603291,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0


#### Write to label_matrix

In [24]:
# Save the final matrix to disk. to_csv writes the index as the first column
# by default, so the row labels are kept in the file.
LABEL_MATRIX_CSV = '../matrix_for_model/label_matrix.csv'
df_events.to_csv(LABEL_MATRIX_CSV)