In [105]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [106]:
def _read_sessions(path):
    """Load a sessions file into a DataFrame with a single 'sessions' column.

    Every non-empty line of the file becomes one row holding the raw line
    text. The file is read directly instead of going through
    pd.read_csv(sep='\n'), because newer pandas releases refuse a line
    terminator as the separator.
    """
    with open(path) as fin:
        lines = [line.rstrip('\r\n') for line in fin]
    # Blank lines are dropped, which is also read_csv's default behaviour.
    return pd.DataFrame({'sessions': [line for line in lines if line]})


train = _read_sessions('coursera_sessions_train.txt')
test = _read_sessions('coursera_sessions_test.txt')

In [107]:
train.head(10)

Unnamed: 0,sessions
0,"0,1,2,3,4,5;"
1,"9,10,11,9,11,12,9,11;"
2,"16,17,18,19,20,21;"
3,"24,25,26,27,24;"
4,"34,35,36,34,37,35,36,37,38,39,38,39;"
5,42;
6,"47,48,49;"
7,"59,60,61,62,60,63,64,65,66,61,67,68,67;67,60,63"
8,"71,72,73,74;"
9,"76,77,78;"


In [108]:
test.head()

Unnamed: 0,sessions
0,"6,7,8;"
1,"13,14,15;"
2,"22,23;"
3,"28,29,30,31,32,33;"
4,"40,41;"


In [109]:
# Train and test data preprocessing.
# Each raw line has the form "<shown ids>;<bought ids>", ids comma-separated.
for frame in (train, test):
    # A list comprehension (rather than map) gives a real list on both
    # Python 2 and Python 3, where map returns a lazy iterator.
    parts = [s.split(';') for s in frame.sessions]

    # Ids stay as strings here; a session without purchases yields [''].
    frame['shown'] = [p[0].split(',') for p in parts]
    frame['bought'] = [p[1].split(',') for p in parts]

    del frame['sessions']

In [110]:
train.head()

Unnamed: 0,shown,bought
0,"[0, 1, 2, 3, 4, 5]",[]
1,"[9, 10, 11, 9, 11, 12, 9, 11]",[]
2,"[16, 17, 18, 19, 20, 21]",[]
3,"[24, 25, 26, 27, 24]",[]
4,"[34, 35, 36, 34, 37, 35, 36, 37, 38, 39, 38, 39]",[]


In [111]:
def in_list(data, ind):
    """Return the shown and bought item ids of session `ind` as lists of ints.

    Parameters
    ----------
    data : mapping with 'shown' and 'bought' entries, each indexable by `ind`
    ind : key/position of the session to convert

    Returns
    -------
    (shown, bought) : tuple of two lists of int

    Every entry of 'shown' must be convertible with int(); a ValueError from
    there propagates. Entries of 'bought' that int() rejects with ValueError
    (for example the empty string left by a session without purchases) are
    silently skipped.
    """
    shown = [int(raw) for raw in data['shown'][ind]]

    bought = []
    for raw in data['bought'][ind]:
        try:
            parsed = int(raw)
        except ValueError:
            continue
        bought.append(parsed)

    return shown, bought

In [112]:
# Replace the raw string lists in 'bought' with lists of ints; sessions
# without purchases end up with an empty list.
train['bought'] = [in_list(train, row)[1] for row in range(len(train))]
test['bought'] = [in_list(test, row)[1] for row in range(len(test))]

In [113]:
def build_dict(lst):
    """Count how many times each element occurs in `lst`.

    Returns a plain dict mapping element -> number of occurrences; an empty
    input gives an empty dict. Elements must be hashable.
    """
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    return counts

In [114]:
def build_2_dicts(data):
    """Count, over every session in `data`, how often each item was shown and bought.

    Parameters
    ----------
    data : frame-like object with a `.shape` attribute and 'shown'/'bought'
        entries that in_list() can parse for row numbers 0..shape[0]-1

    Returns
    -------
    (dct_shown, dct_bought) : two dicts mapping item id -> total count

    Repeated views/purchases inside one session are each counted.
    """
    dct_shown = {}
    dct_bought = {}

    for i in range(data.shape[0]):
        # Parse the row a single time and unpack both lists.
        lst_shown, lst_bought = in_list(data, i)

        for elt in lst_shown:
            dct_shown[elt] = dct_shown.get(elt, 0) + 1

        for elt in lst_bought:
            dct_bought[elt] = dct_bought.get(elt, 0) + 1

    return dct_shown, dct_bought

In [115]:
# Item frequencies are computed from the train sessions only; the same two
# dicts are later used to rank both the train and the test sessions.
dct_shown, dct_bought = build_2_dicts(train)

In [116]:
# (item id, count) pairs ordered by descending count. The key indexes the
# pair instead of unpacking it in the lambda signature, which is only legal
# on Python 2.
s_dct_shown = sorted(dct_shown.items(), key=lambda pair: pair[1], reverse=True)
s_dct_bought = sorted(dct_bought.items(), key=lambda pair: pair[1], reverse=True)

In [117]:
# Split the sorted (item id, count) pairs into parallel id / frequency lists.
s_dct_shown_ids = [item_id for item_id, freq in s_dct_shown]
s_dct_shown_freqs = [freq for item_id, freq in s_dct_shown]

s_dct_bought_ids = [item_id for item_id, freq in s_dct_bought]
s_dct_bought_freqs = [freq for item_id, freq in s_dct_bought]

In [118]:
def sorted_by(data, dct, ind):
    """Reorder the items shown in session `ind` by descending frequency in `dct`.

    Parameters
    ----------
    data : mapping with a 'shown' entry indexable by `ind`; the ids are
        converted with int(), the same way in_list() converts 'shown'
    dct : dict mapping item id -> frequency (e.g. global view or purchase
        counts); ids missing from it are ranked as frequency 0
    ind : key/position of the session

    Returns
    -------
    list of int : every shown id, repeated as many times as it occurred in
        the session, grouped by id and ordered by descending frequency.
        Ids with equal frequency keep the order of their first appearance
        in the session.
    """
    shown = [int(item) for item in data['shown'][ind]]

    # Count occurrences and remember the order in which ids first appear,
    # so that ties are not left to dict iteration order.
    counts = {}
    unique_ids = []
    for item in shown:
        if item not in counts:
            counts[item] = 0
            unique_ids.append(item)
        counts[item] += 1

    # sorted() is stable (also with reverse=True), so equal frequencies keep
    # first-appearance order. Defaulting to 0 avoids comparing None with int.
    ranked = sorted(unique_ids, key=lambda item: dct.get(item, 0), reverse=True)

    s_lst = []
    for item in ranked:
        s_lst.extend([item] * counts[item])

    return s_lst

In [119]:
# Rank every session twice: by global view counts and by global purchase
# counts (both taken from dct_shown / dct_bought).
for frame in (train, test):
    row_ids = range(len(frame))
    frame['s_by_shown'] = [sorted_by(frame, dct_shown, row) for row in row_ids]
    frame['s_by_bought'] = [sorted_by(frame, dct_bought, row) for row in row_ids]

In [120]:
train.head(50)

Unnamed: 0,shown,bought,s_by_shown,s_by_bought
0,"[0, 1, 2, 3, 4, 5]",[],"[4, 2, 3, 0, 1, 5]","[5, 0, 1, 2, 3, 4]"
1,"[9, 10, 11, 9, 11, 12, 9, 11]",[],"[12, 9, 9, 9, 10, 11, 11, 11]","[9, 9, 9, 10, 11, 11, 11, 12]"
2,"[16, 17, 18, 19, 20, 21]",[],"[17, 20, 16, 19, 18, 21]","[17, 16, 18, 19, 20, 21]"
3,"[24, 25, 26, 27, 24]",[],"[27, 24, 24, 26, 25]","[24, 24, 25, 26, 27]"
4,"[34, 35, 36, 34, 37, 35, 36, 37, 38, 39, 38, 39]",[],"[35, 35, 34, 34, 36, 36, 37, 37, 38, 38, 39, 39]","[38, 38, 34, 34, 35, 35, 36, 36, 37, 37, 39, 39]"
5,[42],[],[42],[42]
6,"[47, 48, 49]",[],"[48, 47, 49]","[48, 49, 47]"
7,"[59, 60, 61, 62, 60, 63, 64, 65, 66, 61, 67, 6...","[67, 60, 63]","[63, 64, 65, 66, 67, 67, 68, 60, 60, 61, 61, 5...","[67, 67, 60, 60, 63, 64, 65, 66, 68, 59, 61, 6..."
8,"[71, 72, 73, 74]",[],"[73, 72, 71, 74]","[73, 71, 72, 74]"
9,"[76, 77, 78]",[],"[77, 76, 78]","[76, 77, 78]"


In [121]:
#print dct_shown.get(414), dct_shown.get(416), dct_shown.get(418), dct_shown.get(415), dct_shown.get(419), dct_shown.get(420), dct_shown.get(421), dct_shown.get(413), dct_shown.get(417)
#print dct_bought.get(416), dct_bought.get(417), dct_bought.get(418), dct_bought.get(419), dct_bought.get(420), dct_bought.get(421), dct_bought.get(413), dct_bought.get(414), dct_bought.get(415)
#print dct_bought

In [122]:
test.head()

Unnamed: 0,shown,bought,s_by_shown,s_by_bought
0,"[6, 7, 8]",[],"[7, 6, 8]","[8, 6, 7]"
1,"[13, 14, 15]",[],"[13, 14, 15]","[13, 14, 15]"
2,"[22, 23]",[],"[22, 23]","[22, 23]"
3,"[28, 29, 30, 31, 32, 33]",[],"[28, 33, 30, 29, 32, 31]","[28, 32, 29, 30, 33, 31]"
4,"[40, 41]",[],"[40, 41]","[40, 41]"


In [123]:
def del_data_rows(data):
    """Return only the sessions of `data` in which at least one item was bought.

    Parameters
    ----------
    data : DataFrame whose 'bought' column holds one list of item ids per
        row (already parsed, so a session without purchases is an empty list)

    Returns
    -------
    DataFrame : an independent copy of the matching rows with their original
        index labels. The input frame is not modified.
    """
    # Test the list length itself. Measuring the length of the list's string
    # form would wrongly drop single purchases with short ids such as [5] or [42].
    has_purchase = pd.Series(
        [len(items) > 0 for items in data['bought']],
        index=data.index,
        dtype=bool,
    )

    # Copy so that columns added to the result later do not write into a slice.
    return data[has_purchase].copy()

In [124]:
# Pass both frames through del_data_rows and rebind the names: from here on
# `train` and `test` refer to the filtered frames only, so re-running this
# cell operates on already-filtered data.
train = del_data_rows(train)
test = del_data_rows(test)

train.head()

Unnamed: 0,shown,bought,s_by_shown,s_by_bought
7,"[59, 60, 61, 62, 60, 63, 64, 65, 66, 61, 67, 6...","[67, 60, 63]","[63, 64, 65, 66, 67, 67, 68, 60, 60, 61, 61, 5...","[67, 67, 60, 60, 63, 64, 65, 66, 68, 59, 61, 6..."
19,"[138, 198, 199, 127]",[199],"[127, 138, 198, 199]","[138, 127, 199, 198]"
30,"[303, 304, 305, 306, 307, 308, 309, 310, 311, ...",[303],"[303, 306, 304, 307, 309, 310, 305, 308, 311, ...","[303, 304, 305, 306, 307, 308, 309, 310, 311, ..."
33,"[352, 353, 352]",[352],"[352, 352, 353]","[352, 352, 353]"
55,[519],[519],[519],[519]


In [185]:
def precision(shown, bought, ind, k):
    """Precision@k for session `ind`.

    Parameters
    ----------
    shown : container indexable by `ind`, giving the ranked item ids
        (repeats allowed; only the first occurrence of each id counts)
    bought : container indexable by `ind`, giving the purchased item ids
    ind : session key
    k : number of top recommendations to score

    Returns
    -------
    float : (number of the first k distinct ranked ids that were bought) / k.
        The denominator is always k, even when fewer than k distinct ids
        exist. k == 0 raises ZeroDivisionError.
    """
    # Distinct ids in ranking order.
    ranked = []
    for candidate in shown[ind]:
        if candidate not in ranked:
            ranked.append(candidate)

    purchased = bought[ind]
    hits = sum(1 for candidate in ranked[:k] if candidate in purchased)

    return hits / float(k)

In [186]:
def recall(shown, bought, ind, k):
    """Recall@k for session `ind`.

    Parameters
    ----------
    shown : container indexable by `ind`, giving the ranked item ids
        (repeats allowed; only the first occurrence of each id counts)
    bought : container indexable by `ind`, giving the purchased item ids
    ind : session key
    k : number of top recommendations to score

    Returns
    -------
    float : (number of the first k distinct ranked ids that were bought)
        / len(bought[ind]). An empty purchase list raises ZeroDivisionError.
    """
    # Distinct ids in ranking order.
    ranked = []
    for candidate in shown[ind]:
        if candidate not in ranked:
            ranked.append(candidate)

    purchased = bought[ind]
    found = [candidate for candidate in ranked[:k] if candidate in purchased]

    return len(found) / float(len(purchased))

In [164]:
def avgprec_calc(data, shown, bought, k=1):
    """Store per-session precision@k in `data` and return its mean.

    Parameters
    ----------
    data : DataFrame whose index labels identify the sessions to score
    shown, bought : containers indexable by those labels, passed on to
        precision()
    k : cut-off passed on to precision()

    Returns
    -------
    The mean of the per-session values, rounded to 2 decimals.

    Side effect: writes the column 'precision@<k>' into `data`. The column
    name depends only on k, so a later call with the same k overwrites it.
    """
    column = 'precision@' + str(k)
    data[column] = [precision(shown, bought, row, k) for row in data.index]

    total = sum(data[column][row] for row in data.index)
    return round(total / data.shape[0], 2)

In [165]:
def avgrecall_calc(data, shown, bought, k=1):
    """Store per-session recall@k in `data` and return its mean.

    Parameters
    ----------
    data : DataFrame whose index labels identify the sessions to score
    shown, bought : containers indexable by those labels, passed on to
        recall()
    k : cut-off passed on to recall()

    Returns
    -------
    The mean of the per-session values, rounded to 2 decimals.

    Side effect: writes the column 'recall@<k>' into `data`. The column
    name depends only on k, so a later call with the same k overwrites it.
    """
    column = 'recall@' + str(k)
    data[column] = [recall(shown, bought, row, k) for row in data.index]

    total = sum(data[column][row] for row in data.index)
    return round(total / data.shape[0], 2)

In [166]:
def save_answerArray(fname, array):
    """Write the items of `array` to the file `fname` as one space-separated line.

    Each item is converted with str(). The file is opened in "w" mode, so
    existing content is replaced; no trailing newline is written.
    """
    with open(fname, "w") as fout:
        fout.write(" ".join(map(str, array)))

In [167]:
train.head(50)

Unnamed: 0,shown,bought,s_by_shown,s_by_bought,precision@5,precision@1,recall@1,recall@5
7,"[59, 60, 61, 62, 60, 63, 64, 65, 66, 61, 67, 6...","[67, 60, 63]","[63, 64, 65, 66, 67, 67, 68, 60, 60, 61, 61, 5...","[67, 67, 60, 60, 63, 64, 65, 66, 68, 59, 61, 6...",0.4,1.0,0.333333,0.666667
19,"[138, 198, 199, 127]",[199],"[127, 138, 198, 199]","[138, 127, 199, 198]",0.2,0.0,0.0,1.0
30,"[303, 304, 305, 306, 307, 308, 309, 310, 311, ...",[303],"[303, 306, 304, 307, 309, 310, 305, 308, 311, ...","[303, 304, 305, 306, 307, 308, 309, 310, 311, ...",0.2,1.0,1.0,1.0
33,"[352, 353, 352]",[352],"[352, 352, 353]","[352, 352, 353]",0.2,1.0,1.0,1.0
55,[519],[519],[519],[519],0.2,1.0,1.0,1.0
64,"[599, 600, 601, 602]","[603, 604, 602, 599, 605, 606, 600]","[601, 599, 602, 600]","[602, 600, 599, 601]",0.6,0.0,0.0,0.428571
72,"[687, 688, 689, 690, 691, 690, 688, 690, 688, ...","[690, 688]","[687, 688, 688, 688, 691, 690, 690, 690, 689, ...","[688, 688, 688, 690, 690, 690, 687, 689, 691, ...",0.2,0.0,0.0,0.5
89,"[850, 851, 852]",[851],"[852, 850, 851]","[851, 850, 852]",0.2,0.0,0.0,1.0
93,"[879, 884, 170, 137, 170, 879, 884, 879, 885, ...",[879],"[137, 170, 170, 170, 343, 343, 343, 343, 879, ...","[170, 170, 170, 137, 887, 879, 879, 879, 879, ...",0.0,0.0,0.0,0.0
125,[1118],[1118],[1118],[1118],0.2,1.0,1.0,1.0


In [177]:
# Scratch check: mean precision@5 of the view-count ranking on train. The
# value is only displayed, not stored; the call still writes the
# 'precision@5' column into train.
avgprec_calc(train, train['s_by_shown'], train['bought'], k=5)

0.21

In [178]:
# Scratch run of the purchase-count ranking at k=5.
# NOTE(review): avgprec5_3_train is assigned again by the later cell that
# builds array3, so this cell looks redundant - confirm before removing.
avgprec5_3_train = avgprec_calc(train, train['s_by_bought'], train['bought'], k=5)

In [179]:
# Scratch run of the purchase-count ranking at k=1.
# NOTE(review): avgprec1_3_train is assigned again by the later cell that
# builds array3, so this cell looks redundant - confirm before removing.
avgprec1_3_train = avgprec_calc(train, train['s_by_bought'], train['bought'], k=1)

In [180]:
# Answer 1: sessions ranked by view counts (s_by_shown), scored on train.
# Saved order: recall@1, precision@1, recall@5, precision@5.
avgrec1_1_train = avgrecall_calc(train, train['s_by_shown'], train['bought'], k=1)
avgprec1_1_train = avgprec_calc(train, train['s_by_shown'], train['bought'], k=1)
avgrec5_1_train = avgrecall_calc(train, train['s_by_shown'], train['bought'], k=5)
avgprec5_1_train = avgprec_calc(train, train['s_by_shown'], train['bought'], k=5)

array1 = [avgrec1_1_train, avgprec1_1_train, avgrec5_1_train, avgprec5_1_train]
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(array1)
save_answerArray('Answer_1.txt', array1)

[0.44, 0.51, 0.82, 0.21]


In [181]:
# Answer 2: sessions ranked by view counts (s_by_shown), scored on test.
# Saved order: recall@1, precision@1, recall@5, precision@5.
avgrec1_2_test = avgrecall_calc(test, test['s_by_shown'], test['bought'], k=1)
avgprec1_2_test = avgprec_calc(test, test['s_by_shown'], test['bought'], k=1)
avgrec5_2_test = avgrecall_calc(test, test['s_by_shown'], test['bought'], k=5)
avgprec5_2_test = avgprec_calc(test, test['s_by_shown'], test['bought'], k=5)

array2 = [avgrec1_2_test, avgprec1_2_test, avgrec5_2_test, avgprec5_2_test]
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(array2)
save_answerArray('Answer_2.txt', array2)

[0.41, 0.48, 0.8, 0.2]


In [182]:
# Answer 3: sessions ranked by purchase counts (s_by_bought), scored on train.
# Saved order: recall@1, precision@1, recall@5, precision@5.
avgrec1_3_train = avgrecall_calc(train, train['s_by_bought'], train['bought'], k=1)
avgprec1_3_train = avgprec_calc(train, train['s_by_bought'], train['bought'], k=1)
avgrec5_3_train = avgrecall_calc(train, train['s_by_bought'], train['bought'], k=5)
avgprec5_3_train = avgprec_calc(train, train['s_by_bought'], train['bought'], k=5)

array3 = [avgrec1_3_train, avgprec1_3_train, avgrec5_3_train, avgprec5_3_train]
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(array3)
save_answerArray('Answer_3.txt', array3)

[0.68, 0.79, 0.93, 0.25]


In [183]:
# Answer 4: sessions ranked by purchase counts (s_by_bought), scored on test.
# Saved order: recall@1, precision@1, recall@5, precision@5.
avgrec1_4_test = avgrecall_calc(test, test['s_by_bought'], test['bought'], k=1)
avgprec1_4_test = avgprec_calc(test, test['s_by_bought'], test['bought'], k=1)
avgrec5_4_test = avgrecall_calc(test, test['s_by_bought'], test['bought'], k=5)
avgprec5_4_test = avgprec_calc(test, test['s_by_bought'], test['bought'], k=5)

array4 = [avgrec1_4_test, avgprec1_4_test, avgrec5_4_test, avgprec5_4_test]
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(array4)
save_answerArray('Answer_4.txt', array4)

[0.41, 0.48, 0.79, 0.2]


In [191]:
train

Unnamed: 0,shown,bought,s_by_shown,s_by_bought,precision@5,precision@1,recall@1,recall@5
7,"[59, 60, 61, 62, 60, 63, 64, 65, 66, 61, 67, 6...","[67, 60, 63]","[63, 64, 65, 66, 67, 67, 68, 60, 60, 61, 61, 5...","[67, 67, 60, 60, 63, 64, 65, 66, 68, 59, 61, 6...",0.6,1.0,0.333333,1.000000
19,"[138, 198, 199, 127]",[199],"[127, 138, 198, 199]","[138, 127, 199, 198]",0.2,0.0,0.000000,1.000000
30,"[303, 304, 305, 306, 307, 308, 309, 310, 311, ...",[303],"[303, 306, 304, 307, 309, 310, 305, 308, 311, ...","[303, 304, 305, 306, 307, 308, 309, 310, 311, ...",0.2,1.0,1.000000,1.000000
33,"[352, 353, 352]",[352],"[352, 352, 353]","[352, 352, 353]",0.2,1.0,1.000000,1.000000
55,[519],[519],[519],[519],0.2,1.0,1.000000,1.000000
64,"[599, 600, 601, 602]","[603, 604, 602, 599, 605, 606, 600]","[601, 599, 602, 600]","[602, 600, 599, 601]",0.6,1.0,0.142857,0.428571
72,"[687, 688, 689, 690, 691, 690, 688, 690, 688, ...","[690, 688]","[687, 688, 688, 688, 691, 690, 690, 690, 689, ...","[688, 688, 688, 690, 690, 690, 687, 689, 691, ...",0.4,1.0,0.500000,1.000000
89,"[850, 851, 852]",[851],"[852, 850, 851]","[851, 850, 852]",0.2,1.0,1.000000,1.000000
93,"[879, 884, 170, 137, 170, 879, 884, 879, 885, ...",[879],"[137, 170, 170, 170, 343, 343, 343, 343, 879, ...","[170, 170, 170, 137, 887, 879, 879, 879, 879, ...",0.2,0.0,0.000000,1.000000
125,[1118],[1118],[1118],[1118],0.2,1.0,1.000000,1.000000


In [189]:
test

Unnamed: 0,shown,bought,s_by_shown,s_by_bought,recall@1,precision@1,recall@5,precision@5
7,"[63, 68, 69, 70, 66, 61, 59, 61, 66, 68]","[66, 63]","[63, 66, 66, 68, 68, 61, 61, 59, 69, 70]","[63, 66, 66, 68, 68, 69, 70, 59, 61, 61]",0.500000,1.0,1.000000,0.4
14,"[158, 159, 160, 159, 161, 162]",[162],"[158, 162, 160, 159, 159, 161]","[158, 162, 160, 161, 159, 159]",0.000000,0.0,1.000000,0.2
19,"[200, 201, 202, 203, 204]","[201, 205]","[204, 202, 203, 200, 201]","[204, 202, 200, 201, 203]",0.000000,0.0,0.500000,0.2
34,"[371, 372, 371]","[371, 373]","[371, 371, 372]","[371, 371, 372]",0.500000,1.0,0.500000,0.2
40,[422],[422],[422],[422],1.000000,1.0,1.000000,0.2
47,"[463, 465, 466, 465, 19, 467, 464, 468, 469, 4...","[462, 460]","[469, 464, 464, 468, 468, 19, 470, 471, 471, 4...","[469, 474, 474, 467, 463, 463, 464, 464, 465, ...",0.000000,0.0,0.000000,0.0
57,"[543, 544, 545, 546]",[543],"[544, 545, 546, 543]","[544, 545, 546, 543]",0.000000,0.0,1.000000,0.2
94,"[900, 894, 904, 894, 902, 904, 905, 906, 895, ...",[904],"[901, 901, 901, 901, 901, 901, 916, 906, 908, ...","[901, 901, 901, 901, 901, 901, 905, 906, 896, ...",0.000000,0.0,0.000000,0.0
115,[1063],[1063],[1063],[1063],1.000000,1.0,1.000000,0.2
125,"[1119, 1120, 1121, 1122, 1123, 1124, 1125, 112...",[1126],"[1120, 1119, 1121, 1121, 1121, 1122, 1122, 112...","[1120, 1121, 1121, 1121, 1122, 1122, 1123, 112...",0.000000,0.0,0.000000,0.0


In [193]:
test.bought[49675]

[56766, 51344, 30875, 51343, 51342, 84008, 30595, 86369, 91026]

In [194]:
test.s_by_bought[49675]

[30595,
 8433,
 8433,
 78075,
 86369,
 88667,
 84008,
 84008,
 84008,
 74473,
 86762,
 8620,
 36546,
 102484,
 1275,
 1275,
 30875,
 8093]