In [1]:
%matplotlib inline
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.decomposition import PCA

In [2]:
# Load the training data; the check-in / check-out columns are parsed as dates on read.
train_set = pd.read_csv('../train.csv', parse_dates=['srch_ci', 'srch_co'])

In [3]:
# Load the test data with the same date parsing as the training data.
test_set = pd.read_csv('../test.csv',parse_dates=['srch_ci', 'srch_co'])

In [None]:
# Structural summary (columns, dtypes, memory) of the training frame.
train_set.info()

In [None]:
# Same summary for the test frame.
test_set.info()

In [None]:
# Sum the booking flag per user, then print the grand total over all users.
users_group = train_set.groupby('user_id')
bookings = users_group.agg({'is_booking': np.sum})
print(np.sum(bookings))

In [4]:
train_set['srch_co'] = train_set['srch_co'].apply(lambda x: pd.to_datetime(x,errors='coerce'))

In [5]:
train_set['srch_ci'] = train_set['srch_ci'].apply(lambda x: pd.to_datetime(x,errors='coerce'))

In [None]:
temp = pd.to_datetime(train_set['srch_ci'],errors = 'coerce')
train_set['srch_ci'] = temp
temp = pd.to_datetime(train_set['srch_co'],errors = 'coerce')
train_set['srch_co'] = temp

In [6]:
train_set['num_days'] = (train_set['srch_co'] - train_set['srch_ci'])\
                                                            .values.astype('timedelta64[D]').astype(int)

In [7]:
train_set['date_time'] = train_set['date_time'].apply(lambda x: pd.to_datetime(x,errors='coerce'))

In [8]:
train_set['num_days_to_checkin'] = \
            (train_set['srch_ci'] - train_set['date_time'])\
                            .values.astype('timedelta64[D]').astype(int)

In [9]:
test_set['srch_ci'] = test_set['srch_ci'].apply(lambda x: pd.to_datetime(x,errors='coerce'))

In [10]:
test_set['srch_co'] = test_set['srch_co'].apply(lambda x: pd.to_datetime(x,errors='coerce'))

In [None]:
temp = pd.to_datetime(test_set['srch_ci'],errors = 'coerce')
test_set['srch_ci'] = temp
temp = pd.to_datetime(test_set['srch_co'],errors = 'coerce')
test_set['srch_co'] = temp

In [11]:
test_set['num_days'] = (test_set['srch_co'] - test_set['srch_ci'])\
                                                            .values.astype('timedelta64[D]').astype(int)

In [12]:
test_set['date_time'] = test_set['date_time'].apply(lambda x: pd.to_datetime(x,errors='coerce'))

In [13]:
test_set['num_days_to_checkin'] = \
            (test_set['srch_ci'] - test_set['date_time'])\
                            .values.astype('timedelta64[D]').astype(int)

## Get the top 10000 users with most transactions

In [None]:
filtered_user_ids_10k = list(train_set['user_id'].value_counts()[0:10000].index)

In [None]:
train_set['user_id'].value_counts()[0:10000].tail(5)

In [None]:
filtered_train_set_10k = train_set[train_set['user_id'].isin(filtered_user_ids_10k)]

In [None]:
filtered_train_set_10k.describe()

In [None]:
filtered_train_set_10k['srch_ci'] = pd.to_datetime(filtered_train_set_10k['srch_ci'])
filtered_train_set_10k['srch_co'] = pd.to_datetime(filtered_train_set_10k['srch_co'])

In [None]:
temp = pd.to_datetime(filtered_train_set_10k['srch_ci'],errors = 'ignore')
filtered_train_set_10k['srch_ci'] = temp
temp = pd.to_datetime(filtered_train_set_10k['srch_co'],errors = 'ignore')
filtered_train_set_10k['srch_co'] = temp

In [None]:
filtered_train_set_10k['num_days'] = (filtered_train_set_10k['srch_co'] - filtered_train_set_10k['srch_ci'])\
                                                            .values.astype('timedelta64[D]').astype(int)

In [None]:
filtered_train_set_10k['num_days'] = filtered_train_set_10k['num_days'].apply(lambda x: abs(x))

In [None]:
# NOTE(review): `filtered_train_set_1k` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. It performs the same
# stay-length computation that an earlier cell applies to `filtered_train_set_10k`
# -- presumably a leftover from a top-1k experiment; confirm and delete.
filtered_train_set_1k ['num_days'] = \
 (filtered_train_set_1k['srch_co'] - filtered_train_set_1k['srch_ci']).values.astype('timedelta64[D]').astype(int)

In [None]:
erroneous = filtered_train_set_10k[filtered_train_set_10k['num_days'] < 0]
len(erroneous)

In [None]:
filtered_train_set_10k = filtered_train_set_10k[filtered_train_set_10k['num_days'] >= 0] 

In [None]:
filtered_train_set_10k = filtered_train_set_10k.drop('hours_to_checkin',axis=1)

In [None]:
filtered_train_set_10k['date_time'] = pd.to_datetime(filtered_train_set_10k["date_time"])

In [None]:
filtered_train_set_10k['num_days_to_checkin'] = \
            (filtered_train_set_10k['srch_ci'] - filtered_train_set_10k['date_time'])\
                            .values.astype('timedelta64[D]').astype(int)

In [None]:
(filtered_train_set_10k['num_days_to_checkin'] < 0).sum()

In [None]:
filtered_train_set_10k['num_days_to_checkin'].describe()

In [None]:
filtered_train_set_10k['num_days'] = filtered_train_set_10k['num_days'].apply(lambda x: abs(x))

In [None]:
filtered_train_set_10k['month'] = filtered_train_set_10k['date_time'].apply(lambda x: x.month)

In [None]:
len(filtered_train_set_10k)

In [None]:
filtered_train_set_10k = filtered_train_set_10k[filtered_train_set_10k['num_days_to_checkin']<366]
print len(filtered_train_set_10k)

In [None]:
filtered_train_set_10k = filtered_train_set_10k[filtered_train_set_10k['num_days'] < 365]

In [None]:
filtered_train_set_10k = filtered_train_set_10k.dropna()

In [None]:
filtered_train_set_10k['log2_orig_destination_distance'] = \
                ((np.log2(filtered_train_set_10k['orig_destination_distance'])).round() + 7)

In [None]:
print filtered_train_set_10k['log2_orig_destination_distance'].hist()
print len(filtered_train_set_10k)

In [None]:
filtered_train_set_10k['log1p_num_days_to_checkin'] = \
            (np.log1p(filtered_train_set_10k['num_days_to_checkin'])*10).round()

In [None]:
filtered_train_set_10k['log1p_num_days'] =  \
            ((np.log1p(filtered_train_set_10k['num_days']))*10).round()

## PCA Model - destinations.csv

In [None]:
destinations = pd.read_csv('../destinations.csv')
cols = list(destinations.columns)
cols.remove('srch_destination_id')
destinations[cols] = (destinations[cols] - destinations[cols].mean())/(destinations[cols].max() - destinations[cols].min())
destinations.describe()

In [None]:
pca = PCA(n_components=5)
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small,columns=[["d{0}".format(i + 1) for i in range(5)]])
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

In [None]:
np.sum(pca.explained_variance_ratio_)

In [None]:
print len(destinations['srch_destination_id'].value_counts())
print len(train_set['srch_destination_id'].value_counts())

In [None]:
dest_small.head()

In [None]:
dat = filtered_train_set_10k[:100]
dat.info()

In [None]:
dummy = pd.merge(test_set,dest_small,on = 'srch_destination_id')

In [None]:
print len(dummy)

In [None]:
dummy.dropna()
print len(dummy)

In [None]:
dummy.info()

In [None]:
test_set = pd.DataFrame(dummy)

In [None]:
filtered_train_set_10k = pd.DataFrame(dummy)

In [None]:
filtered_train_set_10k.to_csv('../Top_10k_users.csv.gzip',compression='gzip',index=False)

In [None]:
# NOTE(review): scratch cell that cannot run as written --
#   * `user_id_100` is not defined in any cell visible in this notebook;
#   * the unconditional `break` leaves the loop after the first print, so the
#     `if` below it is unreachable;
#   * `shortlist_users.append()` is called without an argument and would raise
#     TypeError if it were ever reached.
# The intended selection rule cannot be recovered from here; confirm and delete or rewrite.
shortlist_users =[]
for c in user_id_100:
    print c
    break
    if c == 1:
        shortlist_users.append()
print shortlist_users

### Test users are a subset of Train set

In [None]:
# Users that appear in the test data but not in the training data.
test_user_ids = set(test_set['user_id'].unique())
test_user_ids.difference(train_set['user_id'].unique())

# Top 10000 Users

In [14]:
%matplotlib inline
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import naive_bayes
import operator
from pprint import pprint
from sklearn.decomposition import PCA
import time
from multiprocessing.dummy import Pool as ThreadPool 

In [None]:
# Read back the file the preprocessing section writes ('../Top_10k_users.csv.gzip').
# The '.gzip' suffix is not one pandas infers compression from, so state it explicitly.
filtered_data = pd.read_csv('../Top_10k_users.csv.gzip', compression='gzip')

In [None]:
#test_data = pd.read_csv('../test.csv')

In [None]:
#destinations_data = pd.read_csv('../destinations.csv')

## Randomly permute data before proceeding further and filter users with at least 10 records

In [None]:
s = filtered_data['user_id'].value_counts()
users_records_10 = s[s >= 10].index

In [None]:
print users_records_10

In [None]:
filtered_data = filtered_data[filtered_data['user_id'].isin(users_records_10)]
filtered_data.info()

In [None]:
filtered_data = filtered_data.iloc[np.random.permutation(len(filtered_data))]

In [None]:
X_columns = filtered_data.columns

In [21]:
# Feature columns: every test column except the row id and the raw date columns.
non_feature_columns = ['id', 'date_time', 'srch_ci', 'srch_co']
X_columns = test_set.columns.drop(non_feature_columns).astype(list)
print(X_columns)

Index([u'site_name', u'posa_continent', u'user_location_country',
       u'user_location_region', u'user_location_city',
       u'orig_destination_distance', u'user_id', u'is_mobile', u'is_package',
       u'channel', u'srch_adults_cnt', u'srch_children_cnt', u'srch_rm_cnt',
       u'srch_destination_id', u'srch_destination_type_id', u'hotel_continent',
       u'hotel_country', u'hotel_market', u'num_days', u'num_days_to_checkin'],
      dtype='object')


In [41]:
def content_based_nbc (X,Y,X_test):
    gnb = naive_bayes.GaussianNB()
    y_predict = gnb.fit(X,Y).predict(X_test)

    y_predict_pb = gnb.predict_proba(X_test)
    #accuracy_top = 0.0
    #Y_predicted_top = pd.DataFrame(columns=['cb_recommendations'])
    for i in range(len(y_predict_pb)):
        ordered ={}
        '''ordered['cb_recommendations'] =\
            [c for c,i in (sorted(zip (gnb.classes_,y_predict_pb[i]), key= operator.itemgetter(1),reverse=True)[0:5])]
        '''
        ordered = [c for c,i in (sorted(zip (gnb.classes_,y_predict_pb[i]), key= operator.itemgetter(1),reverse=True)[0:5])]
        # ordered - get only cluster ids without probabilities
        #print ordered
        #ordered = list([c for c,i in ordered])
        #Y_predicted_top = Y_predicted_top.append(ordered,ignore_index = True)
        #accuracy_top += (Y_test[i] in dict(ordered['cb_recommendations']).keys())
        #break
        #print "From Top %s,%s,%s" %(ordered[0][0],y_predict[i],Y[i])
    
    '''%xdel gnb
    %xdel y_predict_pb
    %xdel y_predict
    %xdel ordered'''
    #accuracy = (Y_test == y_predict).sum()*1.0
    #accuracy_top = accuracy_top*1.0/len(X_test)
    #print accuracy,accuracy_top
    #return (accuracy,accuracy_top,Y_predicted_top)
    return ordered

In [None]:
print "Before dropping na", len(filtered_data)#, len(test_data)
#filtered_data['orig_destination_distance'] = filtered_data['orig_destination_distance'].fillna(-1)
filtered_data = filtered_data.dropna()
#test_data = test_data.dropna()
filtered_data = filtered_data.reset_index(drop=True)
#test_data = test_data.reset_index(drop=True)
print "After dropping na", len(filtered_data)#, len(test_data)
#filtered_data = filtered_data[filtered_data['num_days']>=0]

## Get recommendations on test.csv

In [56]:
def get_recommendations(test_users_grouped,train_users_grouped):
    #predictions = pd.DataFrame(columns=['id','hotel_cluster'])
    iterations = 0
    filename = '../predictions_group.csv'
    writefile = open(filename,'wb')
    writefile.write('id, hotel_cluster \n')
    for user,filtered_test in test_users_grouped:
        iterations += 1
        if iterations%1000 == 0:
            print "Finished %d items" %iterations
        try:
            filtered_train = train_users_grouped.get_group(user)
            
            #print len(filtered_test), len(filtered_train)
            filtered_train = filtered_train.fillna(0)
            filtered_train = filtered_train.reset_index(drop=True)

            filtered_test = filtered_test.fillna(0)
            filtered_test = filtered_test.reset_index(drop=True)

            X_train = filtered_train[X_columns]
            Y_train = filtered_train['hotel_cluster']
            X_test = filtered_test[X_columns]
            X_test_id = filtered_test['id'].astype(int)
            if len(X_train) < 1:
                print "No training data"
                continue
            if len(X_test) < 1:
                print "No test data"
                continue
            #pred_dict = {}
            
            Y_predicted_top = content_based_nbc(X_train,Y_train,X_test)
            
            #print Y_predicted_top, type(Y_predicted_top)
            
            for i in range(len(X_test)):
                #pred_dict['id'] = str(X_test_id[i])
                #pred_dict['hotel_cluster'] = ' '.join([str(x) for x in Y_predicted_top])
                entry = str(X_test_id[i]) + ',' + ' '.join([str(x) for x in Y_predicted_top]) + '\n'
                writefile.write(entry)
                #predictions = predictions.append(pred_dict,ignore_index=True)
                #pprint(pred_dict)
            #pprint(Y_predicted_top)
            #%xdel X_train
            #%xdel X_test
            #%xdel Y_train
            #%xdel Y_predicted_top
        except ValueError as e:
            print e, user
        #break
    writefile.close()
    #predictions.to_csv('../predictions_group_test.csv',index=False)

In [46]:
# Group both frames by user so each user's test rows can be paired with that
# user's training rows.
test_set_grouped = test_set.groupby('user_id')
train_set_grouped = train_set.groupby('user_id')

In [57]:
# Writes the predictions file ('../predictions_group.csv') and prints progress.
get_recommendations(test_set_grouped,train_set_grouped)

Finished 1000 items
Finished 2000 items
Finished 3000 items
Finished 4000 items
Finished 5000 items
Finished 6000 items
Finished 7000 items
Finished 8000 items
Finished 9000 items
Finished 10000 items
Finished 11000 items
Finished 12000 items
Finished 13000 items
Finished 14000 items
Finished 15000 items
Finished 16000 items
Finished 17000 items
Finished 18000 items
Finished 19000 items
Finished 20000 items
Finished 21000 items
Finished 22000 items
Finished 23000 items
Finished 24000 items
Finished 25000 items
Finished 26000 items
Finished 27000 items
Finished 28000 items
Finished 29000 items
Finished 30000 items
Finished 31000 items
Finished 32000 items
Finished 33000 items
Finished 34000 items
Finished 35000 items
Finished 36000 items
Finished 37000 items
Finished 38000 items
Finished 39000 items
Finished 40000 items
Finished 41000 items
Finished 42000 items
Finished 43000 items
Finished 44000 items
Finished 45000 items
Finished 46000 items
Finished 47000 items
Finished 48000 items
F

In [59]:
# Number of test rows, for comparison with the number of predictions written.
print(test_set.shape[0])

2528243


In [60]:
# Read the predictions back and report how many rows were written.
results = pd.read_csv('../predictions_group.csv')
print(results.shape[0])

2528243


In [61]:
# Preview the first predictions.
results.head()

Unnamed: 0,id,hotel_cluster
0,361344,28 19 40 56 72
1,361345,28 19 40 56 72
2,0,60 20
3,1,60 20
4,723207,91 1 18 21 41


In [64]:
# Order the predictions by test-row id. DataFrame.sort(columns=...) is deprecated
# (and removed in later pandas releases); sort_values is its replacement.
results = results.sort_values(by='id')

  if __name__ == '__main__':


In [66]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
results = results.reset_index(drop=True)

In [69]:
# Write the predictions gzip-compressed, without the index column.
results.to_csv('../predictions.csv.gzip',compression='gzip',index=False)

SyntaxError: invalid syntax (<ipython-input-69-0348ae54acfa>, line 1)

In [67]:
# Distinct user ids in the filtered data.
users = filtered_data['user_id'].unique()
print("Number of unique users:  %d" % len(users))

NameError: name 'filtered_data' is not defined

In [None]:
def k_fold(filtered_data, k=5):
    """Evaluate the content-based recommender with `k` contiguous folds per user.

    Each user's rows (in their current order) are cut into `k` consecutive folds
    of ``len(rows) // k`` rows. Every fold is scored by a model trained on that
    user's remaining rows; rows past ``k * fold_size`` are never tested but are
    always part of the training data. Folds with no training or no test rows are
    skipped. The two accuracy figures returned by `content_based_nbc` are
    averaged per user, then over all users, and printed. Nothing is returned.

    Parameters
    ----------
    filtered_data : DataFrame with 'user_id', 'hotel_cluster' and the X_columns features.
    k : int, default 5
        Number of folds per user.

    NOTE(review): the `content_based_nbc` visible in this notebook takes three
    positional arguments and returns only a ranking, so the four-argument call and
    three-value unpack below raise TypeError, which `except ValueError` does not
    catch. The call matches the commented-out accuracy code inside that function;
    restore that variant or adapt this call before relying on `k_fold`.
    """
    avg_accuracy = 0.0
    avg_accuracy_top = 0.0

    filtered_data_grouped = filtered_data.groupby('user_id')
    # Users in the frame that was passed in, not in a notebook-level `users` variable.
    n_users = len(filtered_data_grouped)
    print(n_users)

    for user,filtered in filtered_data_grouped:
        try:
            filtered = filtered.reset_index(drop=True)

            # Floor division keeps the slice bounds integral on Python 2 and 3.
            fold_size = len(filtered)//k

            avg_accuracy_user = 0.0
            avg_accuracy_top_user = 0.0

            skipped_folds = 0
            for i in range(k):
                fold_start = fold_size*i
                fold_end = fold_size*(i+1)

                filtered_train = pd.concat([filtered[:fold_start], filtered[fold_end:]])
                filtered_test = filtered[fold_start:fold_end]

                filtered_train = filtered_train.reset_index(drop=True)
                filtered_test = filtered_test.reset_index(drop=True)

                X_train = filtered_train[X_columns]
                Y_train = filtered_train['hotel_cluster']

                X_test = filtered_test[X_columns]
                Y_test = filtered_test['hotel_cluster']

                if len(X_train) < 1 or len(X_test) < 1:
                    skipped_folds +=1
                    continue

                acc,acc_top,Y_predicted_top = content_based_nbc(X_train,Y_train,X_test,Y_test)
                avg_accuracy_user += acc
                avg_accuracy_top_user += acc_top
            if k != skipped_folds:
                # Average over the folds that were actually scored.
                avg_accuracy += avg_accuracy_user*1.0/(k-skipped_folds)
                avg_accuracy_top += avg_accuracy_top_user*1.0/(k-skipped_folds)
        except ValueError as e:
            print("%s %s" % (e, user))

    if n_users:
        print("%s %s" % (avg_accuracy*1.0/n_users, avg_accuracy_top*1.0/n_users))

In [None]:
def get_cb_recommendations(train_set,test_set):
    """Score every user's test rows with a model fitted on that user's training rows.

    Parameters
    ----------
    train_set, test_set : DataFrames with 'user_id', 'hotel_cluster' and the
        X_columns features.

    Returns
    -------
    DataFrame
        The test features, the true 'hotel_cluster' and the predictions returned by
        `content_based_nbc`, for all users that were processed, stacked row-wise.
        Empty (columns only) when no user could be processed.

    Notes
    -----
    * A ValueError while processing a user is printed and that user is skipped.
    * A test user with no rows in `train_set` raises KeyError from `get_group`.
    * Both accumulated accuracy figures are printed divided by the number of result rows.

    NOTE(review): the `content_based_nbc` visible in this notebook takes three
    positional arguments and returns only a ranking, so the four-argument call and
    three-value unpack below raise TypeError, which `except ValueError` does not
    catch. Restore the accuracy-returning variant or adapt this call before use.
    """
    avg_accuracy = 0.0
    avg_accuracy_top = 0.0

    result_columns = list(X_columns) + ['hotel_cluster','cb_recommendations']
    # Seed with an empty frame so the result carries the expected columns even when
    # no user is processed. Per-user frames are collected here and concatenated once
    # at the end; growing a DataFrame inside the loop re-copies it on every user.
    per_user_frames = [pd.DataFrame(columns=result_columns)]

    train_set_grouped = train_set.groupby('user_id')
    test_set_grouped = test_set.groupby('user_id')

    for user,filtered_test in test_set_grouped:
        try:
            filtered_train = train_set_grouped.get_group(user).reset_index(drop=True)
            filtered_test = filtered_test.reset_index(drop=True)

            X_train = filtered_train[X_columns]
            Y_train = filtered_train['hotel_cluster']

            X_test = filtered_test[X_columns]
            Y_test = filtered_test['hotel_cluster']

            if len(X_train) < 1:
                print("No training data")
                continue
            if len(X_test) < 1:
                print("No test data")
                continue

            acc,acc_top,Y_predicted_top = content_based_nbc(X_train,Y_train,X_test,Y_test)

            per_user_frames.append(pd.concat([X_test,Y_test,Y_predicted_top],axis=1))
            avg_accuracy += acc
            avg_accuracy_top += acc_top
        except ValueError as e:
            print("%s %s" % (e, user))

    filtered_with_recommendations = pd.concat(per_user_frames,axis=0,ignore_index=True)

    n_rows = len(filtered_with_recommendations)
    # With no result rows there is nothing to average; skip instead of dividing by zero.
    if n_rows:
        print(avg_accuracy*1.0/n_rows)
        print(avg_accuracy_top*1.0/n_rows)
    return filtered_with_recommendations

In [None]:
# Current local time, truncated to whole seconds.
now_epoch_seconds = int(time.time())
curr_time = datetime.fromtimestamp(now_epoch_seconds)
print(curr_time)

In [None]:
# Train and score on the same frame: every user's rows serve as both training and test data.
filtered_data_with_recos = get_cb_recommendations(filtered_data,filtered_data)

In [None]:
# Same call restricted to the first 1000 rows; overwrites the result of the cell above.
filtered_data_with_recos = get_cb_recommendations(filtered_data[:1000],filtered_data[:1000])

In [None]:
# One kernel-density plot per feature, drawn from the first 100000 rows and saved
# as '<feature>.png' in the working directory.
for col in X_columns:
    try:
        feature_sample = filtered_data[col][:100000]
        k = feature_sample.plot(kind='kde', title=col, legend=True)
        print(col)
        plt.savefig(col + '.png')
        plt.show()
    except TypeError:
        # Plotting raised TypeError for this feature: name it and move on to the next.
        print(col)

In [None]:
# Bare attribute reference: evaluating it only displays the class; nothing is
# instantiated or fitted here.
naive_bayes.MultinomialNB