In [1]:
import warnings


import pandas as pd

import pylab as pl
import numpy as np

import matplotlib.pyplot as plt
from datetime import datetime

from sklearn import preprocessing
from sklearn import cross_validation

%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [2]:
#defining function to read data 
def read_training_data(data_files):
    """Load the four raw CSV tables named in ``data_files``.

    Parameters
    ----------
    data_files : dict
        Maps the keys 'age_gender', 'countries', 'sessions' and 'users'
        to CSV paths readable by ``pd.read_csv``.

    Returns
    -------
    tuple of DataFrame
        ``(age_gender, countries, sessions, train_users)``, in that order.
    """
    # The key order fixes both the read order and the order of the returned tuple.
    table_keys = ('age_gender', 'countries', 'sessions', 'users')
    frames = [pd.read_csv(data_files[key]) for key in table_keys]
    return tuple(frames)

#def

In [3]:
# Relative paths of the raw CSV inputs, keyed by the names that
# read_training_data() looks up.
data_files = dict(
    age_gender='data/age_gender_bkts.csv',
    countries='data/countries.csv',
    sessions='data/sessions.csv',
    users='data/train_users_2.csv',
)

In [4]:
#general data wrangling
def clean_up(dataframe, sessions_df=None):
    """Clean a raw users table, join it with the session log and split off targets.

    Steps: replace the "-unknown-" placeholder with NaN, set ages outside
    [14, 100] to -1, parse the three date columns, rename ``id`` to
    ``user_id``, left-join the session log on ``user_id`` (one output row per
    user/session pair), expand the account-creation and first-active dates
    into year/month/day integer columns, and drop the raw date columns.

    The input frame is not modified, so the function can be re-run on the
    same frame.

    Parameters
    ----------
    dataframe : DataFrame
        Raw users table with at least the columns ``id``, ``age``,
        ``timestamp_first_active`` (YYYYmmddHHMMSS), ``date_account_created``
        and ``date_first_booking``.
    sessions_df : DataFrame, optional
        Session log with a ``user_id`` column. When omitted, the notebook-level
        ``sessions`` frame is used (the original behaviour).

    Returns
    -------
    tuple
        ``(features, labels, user_ids, columns)`` when the input has a
        ``country_destination`` column, otherwise
        ``(features, user_ids, columns)``.
    """
    if sessions_df is None:
        # Backward-compatible default: the notebook-level ``sessions`` frame.
        sessions_df = sessions

    #dealing with missing data
    # ``replace`` without ``inplace`` returns a new frame, so every later
    # assignment touches the copy and never the caller's frame.
    users = dataframe.replace("-unknown-", np.nan)

    # replace age in areas where it doesn't make sense with -1; NaN ages
    # compare False on both sides and therefore stay NaN.
    age = users['age']
    users['age'] = age.mask((age < 14) | (age > 100), -1)

    #replacing datetime objects which are more difficult to work with
    # Vectorised parse of the YYYYmmddHHMMSS timestamps; keeps the frame's own
    # index instead of relying on a fresh positional Series.
    users['timestamp_first_active'] = pd.to_datetime(
        users.timestamp_first_active.astype(str), format='%Y%m%d%H%M%S')
    users['date_account_created'] = pd.to_datetime(users.date_account_created)
    users['date_first_booking'] = pd.to_datetime(users.date_first_booking)
    users = users.rename(columns={'id': 'user_id'})

    # Left join: users without sessions are kept, users with several sessions
    # are repeated once per session row.
    df = users.merge(sessions_df, on='user_id', how='left')
    user_ids = df['user_id']
    df_2 = df.drop('user_id', axis=1)

    #separating in order to have numerical values for each

    #date_account_created
    df_2['dac_year'] = df_2.date_account_created.dt.year
    df_2['dac_month'] = df_2.date_account_created.dt.month
    df_2['dac_day'] = df_2.date_account_created.dt.day

    #timestamp_first_active
    df_2['tfa_year'] = df_2.timestamp_first_active.dt.year
    df_2['tfa_month'] = df_2.timestamp_first_active.dt.month
    df_2['tfa_day'] = df_2.timestamp_first_active.dt.day

    #dropping timestamps from model
    df_2 = df_2.drop(['date_account_created', 'timestamp_first_active', 'date_first_booking'], axis=1)

    #condition to check if it's the training set or test set
    if 'country_destination' in df_2.columns:
        labels = df_2.country_destination
        df_2 = df_2.drop('country_destination', axis=1)

        # Single-argument print keeps the output identical on Python 2 and 3.
        print('{} {}'.format(df_2.shape[0], user_ids.shape[0]))
        return df_2, labels, user_ids, df_2.columns

    print('{} {}'.format(df_2.shape[0], user_ids.shape[0]))
    return df_2, user_ids, df_2.columns

#def

def encode_df(df_in, feature_list, category_maps=None):
    """Integer-encode the categorical columns and fill remaining NaNs with -1.

    Each column named in ``feature_list`` is replaced by integer codes
    ``0..k-1`` assigned in sorted order of its distinct non-missing values.
    Missing values (and, when a stored mapping is reused, values absent from
    that mapping) get the code -1, the same sentinel the final ``fillna(-1)``
    applies to all other columns.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df_in : DataFrame
        Frame containing every column named in ``feature_list``.
    feature_list : iterable of str
        Columns to encode. Repeated names are encoded once only: a second
        pass would re-encode the integer codes themselves.
    category_maps : dict, optional
        Mapping ``column -> ordered list of categories``. Columns already
        present are encoded with the stored categories; columns that are
        absent are learned from ``df_in`` and stored in the dict. Passing the
        same dict for the training frame and then the test frame keeps the
        two encodings consistent. When omitted, categories are learned from
        ``df_in`` on every call.

    Returns
    -------
    DataFrame
        Encoded copy of ``df_in`` with no missing values.
    """
    df_out = df_in.copy()

    #encoding all the non-numerical variables
    seen = set()
    for item in feature_list:
        if item in seen:
            continue
        seen.add(item)

        if category_maps is not None and item in category_maps:
            categories = list(category_maps[item])
        else:
            categories = sorted(df_out[item].dropna().unique())
            if category_maps is not None:
                category_maps[item] = categories

        # Categorical codes are positions in ``categories``; anything not in
        # the list, including NaN, becomes -1.
        df_out[item] = pd.Categorical(df_out[item], categories=categories).codes
    #for

    return df_out.fillna(-1)

#def

def preprocessing_df(df_in, mode, normalizers=None):
    """Min-max scale the feature matrix, fitting or reusing a scaler.

    Parameters
    ----------
    df_in : array-like
        Numeric feature matrix.
    mode : {'train', 'test'}
        'train' fits a new ``MinMaxScaler`` on ``df_in`` and applies it;
        'test' applies the already-fitted scaler given in ``normalizers``.
    normalizers : fitted scaler, optional
        Object with a ``transform`` method; required when ``mode == 'test'``.

    Returns
    -------
    ``(scaled, scaler)`` in 'train' mode, ``scaled`` alone in 'test' mode.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'train' nor 'test', or if 'test' is requested
        without ``normalizers``. The original printed a message and returned
        None, which only surfaced later as an unrelated unpacking/attribute
        error in the caller.
    """
    if mode == 'train':
        scaler = preprocessing.MinMaxScaler().fit(df_in)
        return scaler.transform(df_in), scaler

    if mode == 'test':
        if normalizers is None:
            raise ValueError("mode 'test' requires the scaler fitted in 'train' mode")
        return normalizers.transform(df_in)

    raise ValueError('Mode not defined {}'.format(mode))

#def

In [5]:
# Categorical columns (user profile + session log) that get integer-encoded.
# Each column is listed once: 'affiliate_provider' and 'first_affiliate_tracked'
# used to appear twice, which only made the encoder process them a second time.
feature_list = [
    'gender', 'signup_method', 'language', 'affiliate_channel',
    'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
    'first_device_type', 'first_browser',
    'action', 'action_type', 'action_detail', 'device_type',
]

### Reading all the data
****

In [6]:
# Load all four raw tables; `sessions` is also read as a global by clean_up().
age_gender, countries, sessions, train_users = read_training_data(data_files)

### Preprocessing train data
****


In [7]:
# Training frame has a 'country_destination' column, so clean_up returns four values.
# The two printed numbers are the row counts of the feature frame and of the user-id series.
train_set, train_labels, train_user_id, column_names_train = clean_up(train_users)

5677593 5677593


In [8]:
# Integer-encode the categorical columns and fill remaining NaNs with -1;
# the row count is displayed as a sanity check against the previous cell.
train_set = encode_df(train_set, feature_list)
train_set.shape[0]

5677593

In [9]:
# Fit the min-max scaler on the training features; `normalizers` (the fitted scaler)
# is reused later for the test set.
train_set, normalizers = preprocessing_df(train_set,'train',normalizers=None)

In [10]:
# One-hot frame of the destination labels; further down it is used only for its
# column names, to label the predicted-probability columns.
encoded_dummies = pd.get_dummies(train_labels)
from sklearn import preprocessing
# Binarise the labels into a 0/1 indicator matrix (one column per class), which is
# the target format handed to the one-vs-rest classifier below.
le = preprocessing.LabelBinarizer()
train_labels_set = le.fit_transform(train_labels)
train_labels_set


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Reducing Dimensionality
****

In [11]:
from time import time
from sklearn.decomposition import RandomizedPCA
n_components = 15

print("Extracting the top %d features from %d rows" % (n_components, train_set.shape[0]))
t0 = time()
# random_state pins the randomized SVD so the projection (and everything trained on
# it) is reproducible, in line with the seeded split and classifiers in this notebook.
# NOTE(review): the PCA is fitted on all training rows before the train/validation
# split below, so the validation rows influence the projection - confirm this is acceptable.
pca = RandomizedPCA(n_components=n_components, whiten=True, random_state=0).fit(train_set)
print("done in %0.3fs" % (time() - t0))

print("Projecting the input data on the eigenvalues orthonormal basis")
t0 = time()
X_train_pca = pca.transform(train_set)
print("done in %0.3fs" % (time() - t0))



Extracting the top 15 features from 5677593 rows
done in 32.556s
Projecting the input data on the eigenvalues orthonormal basis
done in 1.967s


### Splitting Training Data - train and CV
****

In [12]:
# Hold out 30% of the PCA-projected rows as a validation split (seeded for repeatability).
# NOTE(review): rows come from the users/sessions left join in clean_up, so the same user
# can presumably end up on both sides of this row-wise split - confirm whether a
# per-user split is needed.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_train_pca, train_labels_set, test_size=0.3, random_state=0)

In [13]:
# Row counts of features vs. labels for each split (they must agree pairwise).
print("training: %i, %i" % (X_train.shape[0], y_train.shape[0]))
print("test: %i, %i" % (X_test.shape[0], y_test.shape[0]))

training: 3974315, 3974315
test: 1703278, 1703278


### Training phase
****

In [14]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# One-vs-rest wrapper around a seeded 100-tree random forest, using all cores.
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators = 100, random_state=0, n_jobs = -1))

In [None]:
# Fit on the training split and report the wall-clock time taken.
t0 = time()
clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))

In [None]:
# Predictions for the held-out split; used by the metrics cell further down.
y_pred = clf.predict(X_test)

In [None]:
# Classifier's built-in score on the held-out split (displayed as the cell output).
clf.score(X_test, y_test)

In [None]:
#plotting importances at each class
# numpy, pyplot and the inline backend are already set up in the first cell.
# `y` maps class index -> {rank (1-based): feature index} for that class's forest.
y = {}
n_features = X_train.shape[1]

# Iterate over every per-class forest; the previous range(len - 1) skipped the last class.
for i, forest in enumerate(clf.estimators_):
    importances = forest.feature_importances_
    # Error bars: spread of the importances across the trees of THIS forest
    # (the original took the spread across the per-class forests instead).
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    x = {}
    for f in range(n_features):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
        x[f+1] = indices[f]
    y[i] = x

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices],
           color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), indices)
    plt.xlim([-1, n_features])
    plt.show()

In [None]:
from sklearn import metrics

# y_test / y_pred are 0/1 indicator matrices (one column per class), so the F1
# averaging mode is spelled out instead of relying on the library default, which
# is meant for binary targets. 'weighted' averages per-class F1 by class support.
print(metrics.f1_score(y_test, y_pred, average='weighted'))
# For indicator matrices this counts a row as correct only if every column matches.
print(metrics.accuracy_score(y_test, y_pred))

# Process Again
****

In [None]:
# Raw user rows vs. rows after the sessions join and preprocessing.
print('{} {}'.format(train_users.shape[0], train_set.shape[0]))

### Reading Test Data
****

In [None]:
# Load the test users from a hard-coded path (not part of data_files) and run them
# through the same clean_up step. The three-value unpacking relies on the test file
# having no 'country_destination' column (clean_up's else-branch).
test_users = pd.read_csv('data/test_users.csv')
test_set, test_user_id, column_names_test = clean_up(test_users)


In [None]:
# NOTE(review): encode_df derives its integer codes from the frame it is given, so the
# codes produced here are presumably not guaranteed to match the training codes when
# the test data has a different set of category values - confirm.
test_set = encode_df(test_set, feature_list)
# Apply the scaler that was fitted on the training data (no re-fitting).
test_set = preprocessing_df(test_set, 'test', normalizers = normalizers)
test_set.shape[0]

In [None]:
# Project the scaled test rows with the PCA fitted on the training data.
X_test_pca = pca.transform(test_set)

In [None]:
X_test_pca.shape

In [None]:
#training on the entire train set
# (this first timer only brackets construction of the estimator, not any fitting)
t0 = time()
clf = OneVsRestClassifier(RandomForestClassifier(bootstrap = False, random_state=0, n_jobs = -1))
print("done in %0.3fs" % (time() - t0))

# fit on every PCA-projected training row, i.e. train + validation splits together
t0 = time()
clf.fit(X_train_pca, train_labels_set)
print("done in %0.3fs" % (time() - t0))

#predict probabilities of each country
t0 = time()
y_pred = clf.predict_proba(X_test_pca)
print("done in %0.3fs" % (time() - t0))

In [None]:
y_pred.shape[0]

In [None]:
# Put the user id next to the predicted probabilities (columns labelled with the class
# names taken from encoded_dummies), then average the rows of each user: clean_up's
# left join with the session log can yield several rows per user.
# NOTE(review): assumes the column order of encoded_dummies matches the class order of
# predict_proba - confirm.
last_df = pd.concat([test_user_id,pd.DataFrame(y_pred, columns = encoded_dummies.columns)], axis = 1)
last_df = last_df.groupby('user_id').mean().reset_index()

last_df.head()


In [None]:
import operator

# `ids` is not used in this cell.
ids = []
# user id -> list of the five (country, mean probability) pairs with the highest probability
result = {}
for index, row in last_df.iterrows():
    # position 0 of the row is user_id; the remaining entries are the per-country values
    country_values = row[1:].to_dict()
    sorted_vals = sorted(country_values.items(), key=operator.itemgetter(1), reverse = True)[:5]
    # NOTE(review): each stored entry is a (country, probability) tuple, not a bare
    # country code, so frames built from `result` presumably carry tuples - confirm
    # this is what the submission file should contain.
    result[row[0]]= sorted_vals

In [None]:
# `result` maps user id -> five-element list, so after transposing there is one row
# per user id and one column per rank position (0-4).
some_other_df = pd.DataFrame(result).T
some_other_df.head()

In [None]:
# Flatten to long format: after unstack + reset_index, 'level_0' is the rank position
# (the former column label), 'level_1' the user id and column 0 the stored value.
# Rows are ordered by user and then rank, and only the user id and value columns are
# written to 'something.csv' (without the index).
pd.DataFrame(some_other_df.unstack(0))\
.reset_index()\
.sort_values(['level_1','level_0'])[['level_1',0]].to_csv('something.csv', index = False)

### This returns a score of 84.53% accuracy on the website

In [None]:
len(clf.estimators_)

In [None]:
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.learning_curve import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw training and cross-validation score curves against training-set size.

    A new figure is opened, ``learning_curve`` is run on the estimator, and the
    mean score per training size is plotted for both the training and the
    validation folds, each with a shaded band of one standard deviation.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Feature matrix forwarded to ``learning_curve``.

    y : array-like
        Targets forwarded to ``learning_curve``.

    ylim : tuple (ymin, ymax), optional
        Limits of the y axis; left to matplotlib when None.

    cv : optional
        Cross-validation setting forwarded to ``learning_curve``.

    n_jobs : integer, optional
        Number of parallel jobs forwarded to ``learning_curve`` (default 1).

    train_sizes : array-like, optional
        Training-set sizes forwarded to ``learning_curve``
        (default ``np.linspace(.1, 1.0, 5)``).

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One (mean, std, colour, legend label) entry per curve; scores are
    # averaged over the folds (axis 1).
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()

    # Bands first, then lines, in the same order as before.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [None]:
title = "Learning Curves (Random Forest)"
# Shuffle-split cross validation: 100 random splits to get smoother mean test and
# train score curves, each time holding out 30% of the rows (test_size=0.3) as the
# validation set.
cv = cross_validation.ShuffleSplit(train_set.shape[0], n_iter=100,
                                   test_size=0.3, random_state=0)

# Same estimator configuration as the final model, evaluated on the scaled
# (pre-PCA) training features.
estimator = OneVsRestClassifier(RandomForestClassifier(bootstrap = False, random_state=0))
plot_learning_curve(estimator, title, train_set, train_labels_set, ylim=(0.7, 1.01), cv=cv, n_jobs=4)


plt.show()