## Cleaning Data from the TalkingData Kaggle Competition for the Dashboard

This notebook performs the data cleaning needed for the dashboard and writes the result to dashboard_data.csv and dashboard_data.json. It combines the datasets provided by the competition with real-world map data of China's provinces.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from shapely.geometry import Point, shape
import json

In [3]:
# All inputs live in the `data` folder under the current working directory.
# os.path.join inserts the separator of the host OS, so the notebook is not
# tied to Windows, and it avoids backslash sequences such as '\d' or '\p'
# being treated as (invalid) string escapes.
a = os.getcwd()

path1 = os.path.join(a, 'data', 'gender_age_train.csv')
path2 = os.path.join(a, 'data', 'phone_brand_device_model.csv')
path3 = os.path.join(a, 'data', 'events.csv')

# Competition tables: device demographics, device brand/model, and events.
train = pd.read_csv(path1)
phone_brand = pd.read_csv(path2)
events = pd.read_csv(path3)

# GeoJSON-style province boundaries; each feature is later used for its
# 'geometry' and its 'properties' -> 'name'.
with open(os.path.join(a, 'data', 'china_provinces_en.json')) as data_file:
    provinces_json = json.load(data_file)

train.head()

Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,M32-38
1,-2897161552818060146,M,35,M32-38
2,-8260683887967679142,M,35,M32-38
3,-4938849341048082022,M,30,M29-31
4,245133531816851882,M,30,M29-31


In [4]:
n_samples = 30000
# Fixed seed so that Restart & Run All draws the same sample every time.
random_seed = 42
# Chinese brand name -> English display name for the ten brands shown on the
# dashboard; every other brand is grouped under 'Other'.
top_10_brands_en = {'华为':'Huawei', '小米':'Xiaomi', '三星':'Samsung', 'vivo':'vivo', 'OPPO':'OPPO', 
                    '魅族':'Meizu', '酷派':'Coolpad', '乐视':'LeEco', '联想':'Lenovo', 'HTC':'HTC'}

# Left merges keep every training device, even one with no event or brand row.
df = train.merge(events, how='left', on='device_id').merge(phone_brand, how='left', on='device_id')
# Drop events whose longitude is exactly 0, then sample.
# NOTE(review): NaN != 0 evaluates to True, so devices without any event
# (NaN longitude after the left merge) pass this filter and can be sampled.
df = df[df['longitude'] != 0].sample(n=n_samples, random_state=random_seed)
# Vectorised lookup: brands missing from the mapping (including NaN) become 'Other'.
df['phone_brand_en'] = df['phone_brand'].map(top_10_brands_en).fillna('Other')
df.head()

Unnamed: 0,device_id,gender,age,group,event_id,timestamp,longitude,latitude,phone_brand,device_model,phone_brand_en
2365,-1516688507910543815,F,43,F43+,1486215.0,2016-05-02 20:06:04,118.34,34.35,华为,麦芒3,Huawei
444970,31208013033799294,F,30,F29-32,851929.0,2016-05-05 16:36:11,113.78,23.02,vivo,X5SL,vivo
92676,3862448815156123127,F,29,F29-32,778630.0,2016-05-06 21:33:18,120.72,27.82,小米,note顶配版,Xiaomi
1060688,8516510612349919591,F,29,F29-32,1578040.0,2016-05-03 11:19:40,114.55,38.0,华为,荣耀3C,Huawei
1084390,-1147337202699083457,M,23,M23-26,706264.0,2016-05-06 11:39:27,110.93,22.35,小米,MI 4,Xiaomi


In [5]:
def get_age_segment(age):
    """Return the label of the age bracket that `age` falls into.

    Brackets are tested in ascending order and every upper bound is
    inclusive. Any value that does not satisfy `age <= 38` is labelled '39+'.
    """
    # (inclusive upper bound, label) pairs, ordered from youngest to oldest.
    brackets = (
        (22, '22-'),
        (26, '23-26'),
        (28, '27-28'),
        (32, '29-32'),
        (38, '33-38'),
    )
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return '39+'

# Label every sampled row with the age bracket of its device owner.
df['age_segment'] = df['age'].apply(get_age_segment)
df.head()

Unnamed: 0,device_id,gender,age,group,event_id,timestamp,longitude,latitude,phone_brand,device_model,phone_brand_en,age_segment
2365,-1516688507910543815,F,43,F43+,1486215.0,2016-05-02 20:06:04,118.34,34.35,华为,麦芒3,Huawei,39+
444970,31208013033799294,F,30,F29-32,851929.0,2016-05-05 16:36:11,113.78,23.02,vivo,X5SL,vivo,29-32
92676,3862448815156123127,F,29,F29-32,778630.0,2016-05-06 21:33:18,120.72,27.82,小米,note顶配版,Xiaomi,29-32
1060688,8516510612349919591,F,29,F29-32,1578040.0,2016-05-03 11:19:40,114.55,38.0,华为,荣耀3C,Huawei,29-32
1084390,-1147337202699083457,M,23,M23-26,706264.0,2016-05-06 11:39:27,110.93,22.35,小米,MI 4,Xiaomi,23-26


In [6]:
def get_location(longitude, latitude, provinces_json):
    """Return the name of the first province whose geometry contains the point.

    Features in `provinces_json['features']` are tested in order; each one's
    'geometry' is turned into a shape and checked with `contains`. The
    'properties' -> 'name' of the first match is returned, or 'other' when
    no feature contains the point.

    Note: every geometry is rebuilt on every call, so calling this once per
    row costs one shape construction per feature per row.
    """
    location = Point(longitude, latitude)

    # Lazy generator: evaluation stops at the first containing feature.
    matching_names = (
        feature['properties']['name']
        for feature in provinces_json['features']
        if shape(feature['geometry']).contains(location)
    )
    return next(matching_names, 'other')

In [7]:
# Resolve each row's (longitude, latitude) to a province name; rows that fall
# inside no province polygon get 'other'.
df['location'] = df.apply(
    lambda event: get_location(event['longitude'], event['latitude'], provinces_json),
    axis=1,
)

In [24]:
# Preview the event timestamps parsed as datetimes; rows without an event
# show up as NaT.
# The column holds strings such as '2016-05-02 20:06:04', so an explicit
# format is given. `unit=` describes numeric epoch values and does not apply
# to formatted date strings.
# The result is only displayed: df['timestamp'] keeps its original string
# values (the assignment was left disabled in the original cell).
pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')

2365      2016-05-02 20:06:04
444970    2016-05-05 16:36:11
92676     2016-05-06 21:33:18
1060688   2016-05-03 11:19:40
1084390   2016-05-06 11:39:27
50735                     NaT
816924    2016-05-01 20:10:59
1007917   2016-05-05 09:56:48
785678    2016-05-05 23:49:55
830953    2016-05-05 09:38:02
377196    2016-05-02 01:25:06
235220    2016-05-04 22:53:08
1196569   2016-05-06 07:07:57
1236503   2016-05-03 22:05:07
169153    2016-05-04 07:37:10
379729    2016-05-03 14:08:13
245130    2016-05-06 22:48:58
1243569   2016-05-01 11:41:58
801333    2016-05-04 15:21:41
1239852   2016-05-05 20:07:41
114436    2016-05-06 12:27:54
322461    2016-05-02 11:06:32
972011    2016-05-03 08:13:02
382600    2016-05-02 11:41:27
308532    2016-05-07 12:40:01
43214                     NaT
420969    2016-05-07 04:17:19
920682    2016-05-05 09:42:22
313317    2016-05-01 20:00:08
895925    2016-05-03 22:17:31
                  ...        
336316                    NaT
107919    2016-05-06 12:05:04
158525    

In [8]:
# Write the full sampled frame next to the input files, in <cwd>/data.
# The path is joined component-wise: plain concatenation of the working
# directory and "data\\..." dropped the separator and pointed at a sibling
# "<cwd>data" directory instead.
df.to_csv(os.path.join(a, 'data', 'dashboard_data.csv'))

In [9]:
# Columns kept for the dashboard export; any row missing one of them is dropped.
cols_to_keep = [
    'timestamp',
    'longitude',
    'latitude',
    'phone_brand_en',
    'gender',
    'age_segment',
    'location',
]
df_clean = df.loc[:, cols_to_keep].dropna()
df_clean.head()

Unnamed: 0,timestamp,longitude,latitude,phone_brand_en,gender,age_segment,location
2365,2016-05-02 20:06:04,118.34,34.35,Huawei,F,39+,Jiangsu
444970,2016-05-05 16:36:11,113.78,23.02,vivo,F,29-32,Guangdong
92676,2016-05-06 21:33:18,120.72,27.82,Xiaomi,F,29-32,Zhejiang
1060688,2016-05-03 11:19:40,114.55,38.0,Huawei,F,29-32,Hebei
1084390,2016-05-06 11:39:27,110.93,22.35,Xiaomi,M,23-26,Guangdong


In [10]:
# Write the cleaned rows as a JSON array of records (one object per row) to
# <cwd>/data. os.path.join keeps the path valid on non-Windows systems too.
df_clean.to_json(os.path.join(a, 'data', 'dashboard_data.json'), orient='records')

In [15]:
# Eyeball the first 50 cleaned rows as a sanity check of the export.
df_clean.iloc[:50]

Unnamed: 0,timestamp,longitude,latitude,phone_brand_en,gender,age_segment,location
2365,2016-05-02 20:06:04,118.34,34.35,Huawei,F,39+,Jiangsu
444970,2016-05-05 16:36:11,113.78,23.02,vivo,F,29-32,Guangdong
92676,2016-05-06 21:33:18,120.72,27.82,Xiaomi,F,29-32,Zhejiang
1060688,2016-05-03 11:19:40,114.55,38.0,Huawei,F,29-32,Hebei
1084390,2016-05-06 11:39:27,110.93,22.35,Xiaomi,M,23-26,Guangdong
816924,2016-05-01 20:10:59,116.59,35.43,Samsung,M,33-38,Shandong
1007917,2016-05-05 09:56:48,116.13,38.18,Xiaomi,M,39+,Hebei
785678,2016-05-05 23:49:55,115.95,39.72,Huawei,M,39+,Beijing
830953,2016-05-05 09:38:02,119.79,33.46,Samsung,F,23-26,Jiangsu
377196,2016-05-02 01:25:06,114.12,22.62,Huawei,M,39+,Guangdong
