In [1]:
# Owen Wichiencharoen's standard Python Imports:

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import seaborn as sns
sns.set_style('darkgrid')

import matplotlib.pyplot as plt
%matplotlib inline


### REGRESSION PACKAGES
# from scipy import stats
# import statsmodels.formula.api as smf
# import statsmodels.api as sm
# import sklearn.linear_model as lm

### DATASET PACKAGES
# from sklearn.cross_validation import train_test_split, cross_val_score
# from sklearn import datasets, metrics

### TREE/RANDOMFOREST PACKAGES
# from sklearn.tree import DecisionTreeClassifier, export_graphviz
# from sklearn.ensemble import RandomForestClassifier
# import pydot
# from os import system
# from sklearn.externals.six import StringIO
# from IPython.display import Image

### OTHER PACKAGES
#import itertools
#import pandas_datareader.data as pdweb
#from pandas_datareader.data import DataReader
#from datetime import datetime
#from io import StringIO

## Important notes from Kaggle Forum:

https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums

- not all events have location (latitude/longitude), and you can treat (0,0),(1,0),(0,1),(1,1) lat/long as NaN.

In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import shutil
from os import environ, listdir, makedirs
from os.path import join, expanduser, exists, isdir
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as soup
from pandas import *

## Set path for data file storage
analysis_dir = expanduser('~/kaggle')

## Create path if necessary
if not exists(analysis_dir):
    makedirs(analysis_dir)


def get_data_dir(competition):
    data_dir = join(analysis_dir, competition, 'data')
    if not exists(data_dir):
        makedirs(data_dir)
    return data_dir


def download_data(competition='talkingdata-mobile-user-demographics', filetype=['.zip']):
    """Download kaggle competition data files of designated type(s)
    Assumes that KAGGLE_USER and KAGGLE_PASSWD can be retrieved via os.environ[] as environment variables
    Requires prior acceptance of terms and conditions for specific competition and user
    :param competition: kaggle competition url name
    :param filetype: desired data file formats for download - list
    """

    data_dir = get_data_dir(competition)
    kaggle_url = 'https://www.kaggle.com'
    login_url = '/'.join(s.strip('/') for s in [kaggle_url, 'account', 'login'])
    data_url = '/'.join(s.strip('/') for s in [kaggle_url, 'c', competition, 'data'])
    login_data = dict(UserName=environ['KAGGLE_USER'], Password=environ['KAGGLE_PASSWD'])

    with requests.session() as s:
        s.post(login_url, data=login_data)
        response = s.get(data_url)
        html = soup(response.text, 'html.parser')
        links = [a.get('href') for a in html.find_all('a') if a.get('href', None) and a.get('href').endswith(tuple(filetype))]

        for link in links:
            file_name = link.split('/')[-1]
            print('Downloading:', file_name)
            with open(join(data_dir, file_name), 'wb') as f:  # open binary type file for compressed
                response = s.get(urljoin(kaggle_url, link), stream=True)  # send download request
                shutil.copyfileobj(response.raw, f)  # download uncompressed


def data_to_hdf(competition='talkingdata-mobile-user-demographics'):
    """Uncompress competition data and store in HDFStore"""

    data_dir = get_data_dir(competition)
    with HDFStore(join(data_dir, 'source.h5')) as store:
        for f in [l for l in listdir(data_dir) if not isdir(l) and not l.startswith('.') and not l.endswith('.h5')]:
            file_name = f.split('.')[0]
            print('Storing:', file_name)
            try:
                store.put(file_name, read_csv(join(data_dir, f)))
            except Exception as e:
                print(f, '\n', e)

if __name__ == '__main__':
    download_data()
    data_to_hdf()

Downloading: gender_age_test.csv.zip
Downloading: app_labels.csv.zip
Downloading: label_categories.csv.zip
Downloading: phone_brand_device_model.csv.zip
Downloading: sample_submission.csv.zip
Downloading: events.csv.zip
Downloading: app_events.csv.zip
Downloading: gender_age_train.csv.zip
Storing: app_events
Storing: app_labels
Storing: events
Storing: gender_age_test
Storing: gender_age_train
Storing: label_categories
Storing: phone_brand_device_model


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['category']]

  exec(code_obj, self.user_global_ns, self.user_ns)


Storing: sample_submission


NameError: name 'app_events' is not defined

In [13]:
raw_label_categories = pd.read_csv('../../kaggle_data/label_categories.csv')
raw_phone_brand_device_model = pd.read_csv('../../kaggle_data/phone_brand_device_model.csv')
raw_gender_age_train = pd.read_csv('../../kaggle_data/gender_age_train.csv')
raw_gender_age_test = pd.read_csv('../../kaggle_data/gender_age_test.csv')
raw_app_events = pd.read_csv('../../kaggle_data/app_events.csv')
raw_app_labels = pd.read_csv('../../kaggle_data/app_labels.csv')

In [14]:
# MAIN EVENTS TABLE

print(raw_events.info())
print('\nHow many rows in original raw_events? Answer: ',len(raw_events))
print('\nHow many NaN/nulls are there in this DF?')
print(raw_events.isnull().sum())
raw_events.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3252950 entries, 0 to 3252949
Data columns (total 5 columns):
event_id     int64
device_id    int64
timestamp    object
longitude    float64
latitude     float64
dtypes: float64(2), int64(2), object(1)
memory usage: 124.1+ MB
None

How many rows in original raw_events? Answer:  3252950

How many NaN/nulls are there in this DF?
event_id     0
device_id    0
timestamp    0
longitude    0
latitude     0
dtype: int64


Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [30]:
# phone_brand_device_model TABLE

print(raw_phone_brand_device_model.info())
print('\nHow many rows in original raw_events? Answer: ',len(raw_phone_brand_device_model))
print('\nHow many NaN/nulls are there in this DF?')
print(raw_phone_brand_device_model.isnull().sum())
raw_phone_brand_device_model.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187245 entries, 0 to 187244
Data columns (total 3 columns):
device_id       187245 non-null int64
phone_brand     187245 non-null object
device_model    187245 non-null object
dtypes: int64(1), object(2)
memory usage: 4.3+ MB
None

How many rows in original raw_events? Answer:  187245

How many NaN/nulls are there in this DF?
device_id       0
phone_brand     0
device_model    0
dtype: int64


Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


In [33]:
raw_phone_brand_device_model['device_id'].value_counts().sort_values(ascending=False)[:5]

 9218059356897525342    2
 4781989393190512097    2
 5435517613682152801    2
-7525913571507064767    2
-3910393802259418325    2
Name: device_id, dtype: int64

In [None]:
events = pd.merge(left=raw_events, right=raw_phone_brand_device_model, how='left', left_on='device_id',right_on='device_id')

In [None]:
print('How many rows in original raw_events? Answer: ',len(raw_events))
print('How many rows in this new DF? Answer: ',len(events))
print('\nHow many NaN/nulls in this DF?')
print(events.isnull().sum())
raw_events.head()