In [1]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pylab as pl
import numpy as np
%matplotlib inline


import warnings
warnings.filterwarnings('ignore')

In [2]:
age_gender = pd.read_csv('data/age_gender_bkts.csv')
countries = pd.read_csv('data/countries.csv')
sessions = pd.read_csv('data/sessions.csv')
train_users = pd.read_csv('data/train_users_2.csv')

In [3]:
from datetime import datetime
def clean_up(dataframe):
    dataframe.replace("-unknown-", np.nan, inplace = True)
    new_timestamp = dataframe.timestamp_first_active.astype(str)
    dates = pd.to_datetime(pd.Series([datetime.strptime(date, '%Y%m%d%H%M%S') for date in new_timestamp]))
    
    
    #easiness in using datetime objects
    dataframe['timestamp_first_active'] = dates
    dataframe['date_account_created'] = pd.to_datetime(dataframe.date_account_created)
    dataframe['date_first_booking'] = pd.to_datetime(dataframe.date_first_booking)
    dataframe.rename(columns = {'id':'user_id'}, inplace = True)
    df = dataframe.merge(sessions, on = 'user_id', how = 'left')
    df_2 = df.drop('user_id', axis = 1)
    
    #sepparating in order to have numerical values for each
    
    #date_account_created
    df_2['dac_year'] = df_2.date_account_created.dt.year
    df_2['dac_month'] = df_2.date_account_created.dt.month
    df_2['dac_day'] = df_2.date_account_created.dt.day

    #timestamp_first_active
    df_2['tfa_year'] = df_2.timestamp_first_active.dt.year
    df_2['tfa_month'] = df_2.timestamp_first_active.dt.month
    df_2['tfa_day'] = df_2.timestamp_first_active.dt.day

    #date_first_booking
    df_2['dfb_year'] = df_2.date_first_booking.dt.year
    df_2['dfb_month'] = df_2.date_first_booking.dt.month
    df_2['dfb_day'] = df_2.date_first_booking.dt.day
    df_2 = df_2.drop(['date_account_created','timestamp_first_active','date_first_booking'],axis = 1)
    
    #encoding all the non-numerical variables
    from sklearn import preprocessing

    le = preprocessing.LabelEncoder()
    variables = ['gender','signup_method','language','affiliate_channel',
         'affiliate_provider','first_affiliate_tracked','affiliate_provider',
         'first_affiliate_tracked','signup_app','first_device_type','first_browser','action',
        'action_type','action_detail','device_type']
    for item in variables:
        le.fit(df_2[item])
        encoded = le.transform(df_2[item])
        df_2[item] = encoded
    
    #this should apply only for the training set
    if 'country_destination' in list(df_2.columns):
        labels = df_2.country_destination
        df_2 = df_2.drop('country_destination', axis = 1)
    
    df_2 = df_2.fillna(-1)
    
    #normalising the rest of the variables
    scaler = preprocessing.MinMaxScaler()
    for item in list(df_2.columns):
        df_2[item] = scaler.fit_transform(df_2[item])
    
    #return labels for training set
    if 'country_destination' in list(df.columns):
        return df_2, labels
    else:
        return df_2

In [4]:
import warnings
warnings.filterwarnings('ignore')

train_set, train_labels = clean_up(train_users)
train_set.head()



Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,...,secs_elapsed,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,dfb_year,dfb_month,dfb_day
0,0.0,0.0,0.5,0.0,0.208333,0.285714,0.235294,1,0.666667,0.375,...,0,0.0,0.454545,0.9,0,0.181818,0.6,0.0,0.0,0.0
1,0.666667,0.019355,0.5,0.0,0.208333,1.0,0.470588,1,0.666667,0.375,...,0,0.25,0.363636,0.8,0,0.363636,0.733333,0.0,0.0,0.0
2,0.333333,0.028288,0.0,0.12,0.208333,0.285714,0.235294,1,0.666667,0.75,...,0,0.0,0.727273,0.9,0,0.454545,0.266667,0.99752,0.692308,0.09375
3,0.333333,0.02134,0.5,0.0,0.208333,0.285714,0.235294,1,0.666667,0.375,...,0,0.25,1.0,0.133333,0,0.818182,1.0,0.998512,0.769231,0.28125
4,0.0,0.020844,0.0,0.0,0.208333,0.285714,0.235294,1,0.666667,0.375,...,0,0.0,0.727273,0.433333,0,1.0,0.233333,0.99752,0.230769,0.59375


In [5]:
import warnings
warnings.filterwarnings('ignore')

test_users = pd.read_csv('data/test_users.csv')
test_set_final = clean_up(test_users)
test_set_final.head()


Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,...,secs_elapsed,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,dfb_year,dfb_month,dfb_day
0,0.333333,0.017973,0.333333,0,0.26087,0.166667,0.25,1,0.333333,1,...,4.4e-05,0,0,0,0,0,0,0,0,0
1,0.333333,0.017973,0.333333,0,0.26087,0.166667,0.25,1,0.333333,1,...,0.00998,0,0,0,0,0,0,0,0,0
2,0.333333,0.017973,0.333333,0,0.26087,0.166667,0.25,1,0.333333,1,...,0.036048,0,0,0,0,0,0,0,0,0
3,0.333333,0.017973,0.333333,0,0.26087,0.166667,0.25,1,0.333333,1,...,0.017324,0,0,0,0,0,0,0,0,0
4,0.333333,0.017973,0.333333,0,0.26087,0.166667,0.25,1,0.333333,1,...,0.0,0,0,0,0,0,0,0,0,0


# XGBClassifier
****

In [None]:
X = train_set[0:1000]
y = train_labels[0:1000]
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
xgb.fit(X, y)


In [None]:
y_pred = xgb.predict_proba(train_set[0:1000])

In [None]:
y_pred[0]

# Logistic Regression

In [None]:
encoded_train_labels = pd.DataFrame(train_labels,columns = ['country_destination'])
encoded_train_labels['new_code'] = np.where((encoded_train_labels.country_destination == 'US'), 1,0)
encoded_train_labels = encoded_train_labels.new_code
encoded_train_labels.head()

In [None]:
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_set, train_labels, test_size=0.3, random_state=0)

In [None]:
print "training: %i, %i" % (X_train.shape[0],y_train.shape[0])
print "test: %i, %i" % (X_test.shape[0],y_test.shape[0])

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression('l2',C=1.0)
clf.fit(X_train, y_train)


In [None]:
clf.predict(X_test)

In [None]:
clf.score(X_test, y_test)

# Multiclassification

****

In [6]:
from sklearn import preprocessing

le = preprocessing.MultiLabelBinarizer()

encoded_train_labels = train_labels


le.fit(encoded_train_labels)
encoded_train_labels= le.transform(encoded_train_labels)
encoded_train_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_set, encoded_train_labels, test_size=0.3, random_state=0)

In [8]:
print "training: %i, %i" % (X_train.shape[0],y_train.shape[0])
print "test: %i, %i" % (X_test.shape[0],y_test.shape[0])

training: 3974315, 3974315
test: 1703278, 1703278


In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

clf = OneVsRestClassifier(LinearSVC(random_state=0))

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.predict(X_test)

In [None]:
clf.score(X_test, y_test)