# Talking Data

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import sys
import time
import random
import tarfile
import zipfile
#import StringIO
from IPython.display import display, Image

from sklearn.preprocessing import LabelEncoder
#from scipy import ndimage
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import linear_model, decomposition, datasets, ensemble
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer,precision_score, recall_score, f1_score, average_precision_score, accuracy_score


datadir="./data/TalkingData/"

# Config the matlotlib backend as plotting inline in IPython
%matplotlib inline

### some function def

In [2]:
def loadData(datadir,filename):
    # Load the wholesale customers dataset
    #data = pd.read_csv(filename)
    data = ''
    print ("loading: "+datadir+filename)
    try:
        if zipfile.is_zipfile(datadir+filename):
            z = zipfile.ZipFile(datadir+filename)
            filename = z.open(filename[:-4])
        else:
            filename=datadir+filename
        data = pd.read_csv(filename, parse_dates=True)  
        print ("Dataset has {} samples with {} features each.".format(*data.shape))
    except Exception as e:
        print ("Dataset could not be loaded. Is the dataset missing?")
        print(e)
    return data

def writeData(data,filename):
    # Load the wholesale customers dataset
    try:
        data.to_csv(filename, index=False)
    except Exception as e:
        print ("Dataset could not be written.")
        print(e)
    verify=[]
    try:
        with open(filename, 'r') as f:
            for line in f:
                verify.append(line)
        f.closed
        return verify[:5]
    except IOError:
        sys.std

In [3]:
   
def runPredict(clf,data, display=True):
    index=random.randrange(len(data))
    y_pred = clf.predict(data[index].reshape(1, -1))[0]
    if display==True:
        print "for:",data[index], "prediction:",y_pred
    return y_pred

def train_classifier(clf, X_train, y_train):
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    return end - start
    #print "Done!\nTraining time (secs): {:.3f}".format(end - start)
    
# Predict on training set and compute F1 score
def predict_labels(clf, features, target):
    #print "Predicting labels using {}...".format(clf.__class__.__name__)
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    #print "Done!\nPrediction time (secs): {:.3f}".format(end - start)
    return f1_score(target, y_pred,average='micro'),end - start #(None, 'micro', 'macro', 'weighted', 'samples')

# Train and predict using different training set sizes
def train_predict(clf, X_train, y_train, X_test, y_test):

    timeTrain=train_classifier(clf, X_train, y_train)
    predict_train,trainDelta=predict_labels(clf, X_train, y_train)
    predict_test,testDelta=predict_labels(clf, X_test, y_test)
    return predict_test,testDelta,predict_train,trainDelta,timeTrain # let's return the scores, so we can use them for comparisons

#for each data set size run and plot a train/test
def runTests(test_sizes, train_dataset,train_labels,test_dataset,test_labels, clf="", usePCA=False):
    test_f1=[]
    train_f1=[]

    for test_size in test_sizes:
        # Set up the train set for the test size
        X_train=train_dataset[:test_size]
        y_train=train_labels[:test_size]
        # Same for test
        X_test=test_dataset[-test_size:]
        y_test=test_labels[-test_size:]

 
        if clf == "":
            clf=LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42,  max_iter=1000,C=1e-5)
            
        if usePCA ==True:
            pca=decomposition.PCA(n_components = 14*14)
            clf=Pipeline(steps=[('pca', pca), ('classifier', clf )]) # set up the clf as a pipeline ])
        # Fit model to training data
        test,testDelta,train,trainDelta,timeTrain = train_predict(clf, X_train, y_train, X_test, y_test)
        test_f1.append(test)
        train_f1.append(train)
        print ("------------------------------------------")
        print ("Training set size: {},".format(len(X_train)),"Train time (secs): {:.3f}".format(timeTrain))
        print ("F1 score for training set: {},".format(train),"Prediction time (secs): {:.3f}".format(trainDelta))
        print ("F1 score for test set: {},".format(test),"Prediction time (secs): {:.3f}".format(testDelta))

    
    print ("\n",clf)
    print("Test F1:{}".format(test_f1))
    display("Train F1:{}".format(train_f1))
    plt.plot(test_f1,label="Test F1")
    plt.plot(train_f1,label="Train F1")
    plt.legend(loc=2)
    plt.title("F1 Score per run")
    plt.show()
    
    return clf    

## Import data

In [4]:
# load up the data!
app_events = loadData(datadir,'app_events.csv.zip')
app_labels = loadData(datadir,'app_labels.csv.zip')
events = loadData(datadir,'events.csv.zip')
label_categories = loadData(datadir,'label_categories.csv.zip')

phone_brand_device_model = loadData(datadir,'phone_brand_device_model.csv.zip')
phone_brand_device_model.drop_duplicates('device_id',keep='first', inplace=True)

gender_age_train = loadData(datadir,'gender_age_train.csv.zip')





loading: ./data/TalkingData/app_events.csv.zip
Dataset has 32473067 samples with 4 features each.
loading: ./data/TalkingData/app_labels.csv.zip
Dataset has 459943 samples with 2 features each.
loading: ./data/TalkingData/events.csv.zip
Dataset has 3252950 samples with 5 features each.
loading: ./data/TalkingData/label_categories.csv.zip
Dataset has 930 samples with 2 features each.
loading: ./data/TalkingData/phone_brand_device_model.csv.zip
Dataset has 187245 samples with 3 features each.
loading: ./data/TalkingData/gender_age_train.csv.zip
Dataset has 74645 samples with 4 features each.


In [15]:
display(app_labels.head(5))
display(label_categories.head(5))
print ("size of app_labels:",len(app_labels))

for i in range(len(app_labels)):
    app_labels.loc[i]['category']= "TEST"
    if i%10000 == 0:
        print(i,app_labels.loc[i])

display(app_labels.head(5))

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


Unnamed: 0,label_id,category
0,1,
1,2,game-game type
2,3,game-Game themes
3,4,game-Art Style
4,5,game-Leisure time


('size of app_labels:', 459943)
(0, app_id      7324884708820027918
label_id                    251
Name: 0, dtype: int64)
(10000, app_id      3651079935107643964
label_id                    713
Name: 10000, dtype: int64)
(20000, app_id      1483321555693163019
label_id                    717
Name: 20000, dtype: int64)
(30000, app_id      739987247270796251
label_id                   711
Name: 30000, dtype: int64)
(40000, app_id      3481474984493300197
label_id                    711
Name: 40000, dtype: int64)
(50000, app_id     -7270685603947800676
label_id                    718
Name: 50000, dtype: int64)
(60000, app_id      2748071763173703515
label_id                    711
Name: 60000, dtype: int64)
(70000, app_id      7550692134104585959
label_id                    704
Name: 70000, dtype: int64)
(80000, app_id     -6541810269037825584
label_id                    704
Name: 80000, dtype: int64)
(90000, app_id     -6761151265103874404
label_id                    704
Name: 90000, dt

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [None]:


# the ONE TABLE to rule them all

print ("merging: events")
df = gender_age_train.merge(events, how='left', on='device_id')
print ("merging: phone_brand_device_model")
df = df.merge(phone_brand_device_model, how='left', on='device_id')

print ("merging: phone_brand_device_model")
df = df.merge(app_events, how='left', on='event_id')

print ("merging: app_labels")
df = df.merge(app_labels, how='left', on='app_id')
print ("merging: label_categories")
df=  df.merge(label_categories, how='left', on='label_id')


#df=df.fillna(0)

#brandLE = LabelEncoder().fit(df.phone_brand)
#df['phone_brand'] = brandLE.transform(df['phone_brand'])
#modelLE = LabelEncoder().fit(df.device_model)
#df['device_model'] = modelLE.transform(df['device_model'])
#groupLE = LabelEncoder().fit(df.group)
#df['group'] = groupLE.transform(df['group'])
#categoryLE = LabelEncoder().fit(df.category)
#df['category'] = categoryLE.transform(df['category'])
#app_idLE = LabelEncoder().fit(df.app_id)
#df['app_id'] = app_idLE.transform(df['app_id'])

df.info()
display (df.head(5))
df=df.drop(['gender','age', 'event_id', 'label_id','is_installed','is_active', 'event_id', 'timestamp','device_id', 'longitude','latitude' ],1)
display(df.info())
display(df.head(5))

    

In [None]:
curr_app_id=df.loc[3].app_id

print(df[df.app_id==curr_app_id].head(10))
print(app_labels[app_labels.app_id==curr_app_id])


In [None]:
app_idLE.inverse_transform(6854)
display(app_idLE.inverse_transform(3514))

display(app_labels[app_labels['app_id']==app_idLE.inverse_transform(3514)])

categoryLE.inverse_transform(713)
categoryLE.inverse_transform(704)

In [None]:
display (df.head(2))

counterCat=0
counterApp=0
for i in range(len(df)):
    if df.loc[i].category!=0:
        counterCat+=1
    if df.loc[i].app_id!=0:
        counterApp+=1
        
    if i % 100000 ==0:
        print (i, "category:",df.loc[i].category, counterCat,"App:",df.loc[i].app_id, counterApp)
print("counter:",counter)

### Split data intro train/test, train a classifier

In [None]:
x=df.drop('group',1)
y=df['group']

display(x.head(2))
display(y.head(2))



In [None]:
#  train/validation split
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.25, random_state=42)

dataSize=X_train.shape[0]
print ("size of train data",dataSize, )
test_sizes=[50]
for i in range(5):
    test_sizes.append(int(round(dataSize*(i+1)*.2)))

#test_sizes=[63,630,6300,31500]
#test_sizes=[50,500,5001]
print ("run tests of size",test_sizes)

In [None]:
display (X_train[0], y_train[0])
display (x[0],y[0])

In [None]:
print ("Logistic:")
clf=runTests(test_sizes, X_train,y_train,X_test,y_test)
print("Validation Prediction is:",runPredict(clf,X_test))

In [None]:
print ("AdaBoost:")
clf = runTests(test_sizes, X_train,y_train,X_test,y_test,ensemble.AdaBoostClassifier())
print("Validation Prediction is:",runPredict(clf,X_test))

In [None]:
print("Validation Prediction is:",runPredict(clf,X_test))

### EOF