# Import libraries

In [1]:
import numpy as np
import pandas as pd

# phone_brand_device_model

#### Read data

In [2]:
# Load the phone brand / device model table and show its (rows, columns).
df_brand = pd.read_csv(filepath_or_buffer="../data/phone_brand_device_model.csv")
df_brand.shape

(187245, 3)

#### Check duplicated data

In [3]:
# Report how many rows repeat a device_id that already appeared earlier,
# then keep only the first row for each device_id.
print('Numbers of duplicated data:', df_brand['device_id'].duplicated().sum())
df_brand = df_brand.drop_duplicates(subset='device_id', keep='first')
df_brand.shape

Numbers of duplicated data: 529


(186716, 3)

#### Check missing data

In [4]:
# Per-column flag: True if the column contains at least one missing value.
df_brand.isna().any()

device_id       False
phone_brand     False
device_model    False
dtype: bool

#### One-hot encoding 

In [5]:
# Replace the text columns with 0/1 indicator columns (prefixed with
# phone_brand_ / device_model_); the numeric device_id column is kept as-is.
df_brand = pd.get_dummies(data=df_brand)
df_brand.head(5)

Unnamed: 0,device_id,phone_brand_E人E本,phone_brand_E派,phone_brand_HTC,phone_brand_LG,phone_brand_LOGO,phone_brand_Lovme,phone_brand_MIL,phone_brand_OPPO,phone_brand_PPTV,...,device_model_魅蓝Note3,device_model_魅蓝metal,device_model_魔镜 X5,device_model_麦芒3,device_model_麦芒3S,device_model_麦芒4,device_model_黄金斗士A8,device_model_黄金斗士Note8,device_model_黄金斗士S8畅玩版,device_model_黄金斗士青春版
0,-8890648629457979026,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1277779817574759137,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5137427614288105724,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3669464369358936369,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-5019277647504317457,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# gender_age_train

#### Read data

In [6]:
# Load the labelled training devices and show the table's (rows, columns).
df_train = pd.read_csv(filepath_or_buffer="../data/gender_age_train.csv")
df_train.shape

(74645, 4)

#### Check duplicated data

In [7]:
# Count training rows whose device_id repeats an earlier row.
print('Numbers of duplicated data:', df_train['device_id'].duplicated().sum())

Numbers of duplicated data: 0


#### Check missing data

In [8]:
# Per-column flag: True if the column contains at least one missing value.
df_train.isna().any()

device_id    False
gender       False
age          False
group        False
dtype: bool

# gender_age_test

#### Read data

In [9]:
# Load the test devices and show the table's (rows, columns).
df_test = pd.read_csv(filepath_or_buffer="../data/gender_age_test.csv")
df_test.shape

(112071, 1)

#### Check duplicated data

In [10]:
# Count test rows whose device_id repeats an earlier row.
print('Numbers of duplicated data:', df_test['device_id'].duplicated().sum())

Numbers of duplicated data: 0


#### Check missing data

In [11]:
# Per-column flag: True if the column contains at least one missing value.
df_test.isna().any()

device_id    False
dtype: bool

# Assert that test + train have the same number of rows as phone_brand_device_model

In [12]:
# Sanity-check that the brand table lines up with train + test before merging.
# Equal row counts alone do not prove that every device has a brand row, so
# device_id coverage is checked as well: a device missing from df_brand would
# otherwise pass through the how='left' merges below as a row of NaN features.
n_train_test = df_test.shape[0] + df_train.shape[0]
assert n_train_test == df_brand.shape[0], (
    "train + test has {} rows but phone_brand_device_model has {}".format(
        n_train_test, df_brand.shape[0]
    )
)
assert df_train['device_id'].isin(df_brand['device_id']).all(), (
    "some train device_id values are missing from phone_brand_device_model"
)
assert df_test['device_id'].isin(df_brand['device_id']).all(), (
    "some test device_id values are missing from phone_brand_device_model"
)

# Merge train and phone_brand

In [13]:
# Attach the one-hot brand/model columns to every training row (left join
# keeps all of df_train), then index the result by device_id.
train_merge_brand = (
    df_train
    .merge(df_brand, how='left', on='device_id')
    .set_index('device_id')
)
train_merge_brand.shape

(74645, 1733)

#### Create train_y

In [14]:
# The 'group' column is the training target. It is written without the
# device_id index, so the file holds only the labels in row order.
train_y = train_merge_brand.loc[:, 'group']
train_y.to_csv(path_or_buf='../matrix_for_model/train_y.csv', index=False)
train_y.head(5)

device_id
-8076087639492063270    M32-38
-2897161552818060146    M32-38
-8260683887967679142    M32-38
-4938849341048082022    M29-31
 245133531816851882     M29-31
Name: group, dtype: object

#### Create dataframe of one-hot brand and model features indexed by device_id

In [15]:
# Remove the label columns so only the one-hot brand/model features remain.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: once the columns are gone, a second run is a
# no-op rather than a KeyError.
label_columns = ['gender', 'age', 'group']
train_merge_brand = train_merge_brand.drop(columns=label_columns, errors='ignore')
train_merge_brand.head()

Unnamed: 0_level_0,phone_brand_E人E本,phone_brand_E派,phone_brand_HTC,phone_brand_LG,phone_brand_LOGO,phone_brand_Lovme,phone_brand_MIL,phone_brand_OPPO,phone_brand_PPTV,phone_brand_SUGAR,...,device_model_魅蓝Note3,device_model_魅蓝metal,device_model_魔镜 X5,device_model_麦芒3,device_model_麦芒3S,device_model_麦芒4,device_model_黄金斗士A8,device_model_黄金斗士Note8,device_model_黄金斗士S8畅玩版,device_model_黄金斗士青春版
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-8076087639492063270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-2897161552818060146,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-8260683887967679142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-4938849341048082022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
245133531816851882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Write to train_brand_matrix

In [16]:
# Persist the training feature matrix; the device_id index is written as the
# first column.
train_merge_brand.to_csv(path_or_buf='../matrix_for_model/train_brand_matrix.csv', index=True)

# Merge test and phone_brand

In [17]:
# Attach the one-hot brand/model columns to every test row (left join keeps
# all of df_test), then index the result by device_id.
test_merge_brand = (
    df_test
    .merge(df_brand, how='left', on='device_id')
    .set_index('device_id')
)
print(test_merge_brand.shape)
test_merge_brand.head(5)

(112071, 1730)


Unnamed: 0_level_0,phone_brand_E人E本,phone_brand_E派,phone_brand_HTC,phone_brand_LG,phone_brand_LOGO,phone_brand_Lovme,phone_brand_MIL,phone_brand_OPPO,phone_brand_PPTV,phone_brand_SUGAR,...,device_model_魅蓝Note3,device_model_魅蓝metal,device_model_魔镜 X5,device_model_麦芒3,device_model_麦芒3S,device_model_麦芒4,device_model_黄金斗士A8,device_model_黄金斗士Note8,device_model_黄金斗士S8畅玩版,device_model_黄金斗士青春版
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002079943728939269,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-1547860181818787117,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7374582448058474277,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-6220210354783429585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-5893464122623104785,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Write to test_brand_matrix

In [22]:
# Persist the test feature matrix; the device_id index is written as the
# first column.
test_merge_brand.to_csv(path_or_buf='../matrix_for_model/test_brand_matrix.csv', index=True)