In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
def print_to_file(value, name):
    """Echo an answer to stdout and save it to a file.

    Prints ``name`` followed by ``value``, then (over)writes the file at path
    ``name`` with the string form of ``value``.

    Parameters
    ----------
    value : object
        The value to report; it is converted with ``str`` before writing.
    name : str
        Label shown in the printout and path of the file to write.
    """
    print(name, value)
    with open(name, 'w') as answer_file:
        # Conversion happens after the file is opened, as in the original.
        answer_file.write(str(value))

In [3]:
def str_to_ar(string):
    """Parse a comma-separated string of integers into a list.

    Parameters
    ----------
    string : str
        Text such as ``"1,2,3"``.

    Returns
    -------
    list of int or None
        The parsed integers, or ``None`` when ``string`` is empty.
    """
    if len(string) == 0:
        return None
    return [int(token) for token in string.split(',')]

def get_improved_df(filepath):
    """Read a ';'-separated sessions file into a two-column DataFrame.

    The two columns are named ``viewed`` and ``buying``; each cell is run
    through ``str_to_ar``. The first 10 rows are printed as a side effect.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed frame with columns ``viewed`` and ``buying``.
    """
    column_names = ['viewed', 'buying']
    # Both columns (by position) are parsed with the same converter.
    column_parsers = {0: str_to_ar, 1: str_to_ar}
    sessions = pd.read_csv(
        filepath,
        sep=';',
        names=column_names,
        converters=column_parsers,
    )
    print(sessions.head(10))
    return sessions

In [5]:
# Load the training sessions; get_improved_df also prints the first 10 rows.
df_train = get_improved_df('coursera_sessions_train.csv')

                                              viewed        buying
0                                 [0, 1, 2, 3, 4, 5]          None
1                      [9, 10, 11, 9, 11, 12, 9, 11]          None
2                           [16, 17, 18, 19, 20, 21]          None
3                               [24, 25, 26, 27, 24]          None
4   [34, 35, 36, 34, 37, 35, 36, 37, 38, 39, 38, 39]          None
5                                               [42]          None
6                                       [47, 48, 49]          None
7  [59, 60, 61, 62, 60, 63, 64, 65, 66, 61, 67, 6...  [67, 60, 63]
8                                   [71, 72, 73, 74]          None
9                                       [76, 77, 78]          None


In [7]:
# Load the test sessions; get_improved_df also prints the first 10 rows.
df_test = get_improved_df('coursera_sessions_test.csv')

                                         viewed    buying
0                                     [6, 7, 8]      None
1                                  [13, 14, 15]      None
2                                      [22, 23]      None
3                      [28, 29, 30, 31, 32, 33]      None
4                                      [40, 41]      None
5              [43, 44, 43, 45, 43, 45, 43, 46]      None
6  [50, 51, 47, 52, 49, 53, 54, 55, 56, 57, 58]      None
7      [63, 68, 69, 70, 66, 61, 59, 61, 66, 68]  [66, 63]
8                                          [75]      None
9                          [79, 80, 81, 82, 83]      None


In [63]:
# Per-session id lists from the training frame; rows with a missing value are
# dropped before taking the 'buying' column.
arr_viewed_train = df_train['viewed'].values
arr_buying_train = df_train.dropna()['buying'].values

# Seed each tally with the first session, then fold in the remaining ones.
count_viewed_train = Counter(arr_viewed_train[0])
for session_ids in arr_viewed_train[1:]:
    count_viewed_train.update(session_ids)

count_buying_train = Counter(arr_buying_train[0])
for session_ids in arr_buying_train[1:]:
    count_buying_train.update(session_ids)

In [61]:
# Per-session id lists from the test frame; rows with a missing value are
# dropped before taking the 'buying' column.
arr_viewed_test = df_test['viewed'].values
arr_buying_test = df_test.dropna()['buying'].values

# Tally how often each id occurs across ALL test sessions.
# The loops previously iterated the *train* arrays and added into the *train*
# counters, which left the test counters holding only the first session and
# double-counted the training data.
count_viewed_test = Counter()
for session_ids in arr_viewed_test:
    count_viewed_test.update(session_ids)

count_buying_test = Counter()
for session_ids in arr_buying_test:
    count_buying_test.update(session_ids)

In [53]:
def get_counter(series):
    """Count occurrences of every item across a series of iterables.

    Parameters
    ----------
    series : pandas.Series
        Each element is an iterable of hashable items (e.g. a list of ids).

    Returns
    -------
    collections.Counter
        Maps each item to the number of times it appears in all elements
        combined.
    """
    flattened = [item for sublist in series.values for item in sublist]
    return Counter(flattened)

In [54]:
def get_num(series):
    """Count occurrences of every id across a series of id lists.

    Parameters
    ----------
    series : pandas.Series
        Each element is an iterable of ids. The ids are passed to
        ``np.bincount``, so they are expected to be non-negative integers.

    Returns
    -------
    dict
        Maps each id that occurs at least once to the number of times it
        occurs (keys and values are numpy integer scalars).
    """
    # Build the flat list of every id that appears in the series.
    flat_ids = [item for sublist in series.values for item in sublist]
    # Count how many times each id occurs; index == id, value == count.
    counts = np.bincount(flat_ids)
    present_ids = np.nonzero(counts)[0]
    # Return the mapping id -> number of occurrences, for ids that occur.
    return dict(zip(present_ids, counts[present_ids]))

In [55]:
# Frequency of each id over the training sessions, computed with get_counter.
# Missing 'buying' entries are dropped first.
cnt_train_viewed = get_counter(df_train['viewed'])
cnt_train_buying = get_counter(df_train['buying'].dropna())

In [56]:
# Frequency of each id over the training sessions, computed with get_num.
# Missing 'buying' entries are dropped first.
num_train_viewed = get_num(df_train['viewed'])
num_train_buying = get_num(df_train['buying'].dropna())

In [68]:
# Inspect the id -> count mapping produced by get_num for the 'viewed' column.
num_train_viewed

{0: 6,
 1: 6,
 2: 9,
 3: 7,
 4: 11,
 5: 4,
 6: 283,
 7: 312,
 8: 225,
 9: 7,
 10: 7,
 11: 5,
 12: 17,
 13: 10,
 14: 4,
 16: 36,
 17: 70,
 18: 11,
 19: 35,
 20: 48,
 21: 11,
 22: 37,
 23: 19,
 24: 7,
 25: 6,
 26: 7,
 27: 15,
 28: 148,
 29: 16,
 30: 29,
 31: 8,
 32: 11,
 33: 60,
 34: 11,
 35: 22,
 36: 11,
 37: 8,
 38: 8,
 39: 4,
 40: 5,
 41: 2,
 42: 24,
 43: 6,
 44: 36,
 45: 3,
 46: 18,
 47: 2,
 48: 3,
 49: 1,
 50: 21,
 51: 9,
 53: 1,
 55: 9,
 56: 1,
 57: 3,
 58: 1,
 59: 1,
 60: 2,
 61: 2,
 62: 1,
 63: 6,
 64: 3,
 65: 2,
 66: 2,
 67: 2,
 68: 2,
 71: 141,
 72: 167,
 73: 677,
 74: 36,
 75: 17,
 76: 4,
 77: 49,
 78: 4,
 79: 48,
 80: 11,
 81: 10,
 83: 34,
 84: 12,
 85: 165,
 86: 10,
 87: 4,
 88: 1,
 89: 19,
 90: 13,
 91: 2,
 92: 12,
 93: 94,
 94: 8,
 95: 2,
 96: 10,
 97: 12,
 99: 73,
 101: 18,
 102: 4,
 103: 11,
 104: 6,
 105: 7,
 106: 82,
 107: 11,
 108: 10,
 109: 20,
 110: 14,
 111: 13,
 112: 11,
 113: 29,
 114: 129,
 115: 154,
 116: 22,
 117: 9,
 118: 27,
 119: 8,
 120: 3,
 121: 4,
 122: 

In [69]:
# Inspect the Counter produced by get_counter for the 'viewed' column.
cnt_train_viewed

Counter({0: 6,
         1: 6,
         2: 9,
         3: 7,
         4: 11,
         5: 4,
         9: 7,
         10: 7,
         11: 5,
         12: 17,
         16: 36,
         17: 70,
         18: 11,
         19: 35,
         20: 48,
         21: 11,
         24: 7,
         25: 6,
         26: 7,
         27: 15,
         34: 11,
         35: 22,
         36: 11,
         37: 8,
         38: 8,
         39: 4,
         42: 24,
         47: 2,
         48: 3,
         49: 1,
         59: 1,
         60: 2,
         61: 2,
         62: 1,
         63: 6,
         64: 3,
         65: 2,
         66: 2,
         67: 2,
         68: 2,
         71: 141,
         72: 167,
         73: 677,
         74: 36,
         76: 4,
         77: 49,
         78: 4,
         84: 12,
         85: 165,
         86: 10,
         87: 4,
         88: 1,
         89: 19,
         90: 13,
         91: 2,
         92: 12,
         93: 94,
         114: 129,
         115: 154,
         116: 22,
         1

In [72]:
# Order the (id, count) pairs by count, highest count first.
sorted_num_train_viewed = sorted(
    num_train_viewed.items(),
    key=lambda id_and_count: id_and_count[1],
    reverse=True,
)

In [73]:
# Inspect the (id, count) pairs, ordered by descending count.
sorted_num_train_viewed

[(73, 677),
 (158, 641),
 (204, 396),
 (262, 387),
 (162, 318),
 (7, 312),
 (137, 306),
 (1185, 284),
 (6, 283),
 (170, 280),
 (800, 253),
 (5202, 227),
 (8, 225),
 (609, 213),
 (3149, 213),
 (3324, 204),
 (1346, 199),
 (751, 197),
 (1184, 186),
 (1283, 186),
 (1844, 186),
 (1933, 186),
 (325, 184),
 (551, 177),
 (4604, 177),
 (1334, 174),
 (42149, 174),
 (259, 173),
 (1852, 172),
 (70238, 170),
 (72, 167),
 (258, 167),
 (758, 166),
 (85, 165),
 (1342, 165),
 (343, 164),
 (2290, 163),
 (5501, 160),
 (1323, 159),
 (3718, 158),
 (301, 156),
 (115, 154),
 (879, 153),
 (2922, 153),
 (3697, 153),
 (6974, 151),
 (260, 150),
 (1214, 149),
 (2084, 149),
 (3286, 149),
 (28, 148),
 (255, 147),
 (363, 147),
 (302, 144),
 (71, 141),
 (791, 140),
 (1349, 140),
 (1595, 140),
 (875, 139),
 (1934, 139),
 (884, 138),
 (1814, 138),
 (2845, 136),
 (553, 135),
 (759, 133),
 (5894, 133),
 (1651, 131),
 (3882, 131),
 (114, 129),
 (1949, 129),
 (2397, 129),
 (11027, 129),
 (72586, 127),
 (552, 125),
 (469, 1

In [74]:
def get_recommend_ids(viewed, cnt, k):
    """Pick up to ``k`` distinct ids from a session, most frequent first.

    Ids are ranked by their frequency in ``cnt`` (descending). Ties are
    broken by the position at which the id was first viewed in the session
    (earlier wins).

    Parameters
    ----------
    viewed : iterable
        Ids in the order they were viewed; repeats are allowed.
    cnt : dict or collections.Counter
        Maps id -> frequency. Ids missing from ``cnt`` count as 0.
    k : int
        Maximum number of ids to return. Values <= 0 give an empty set.

    Returns
    -------
    set
        The selected ids (at most ``k``, at most the number of distinct ids
        in ``viewed``).
    """
    # Remember where each id was FIRST seen. De-duplicating through set()
    # would discard the viewing order and make the tie-break arbitrary.
    first_pos = {}
    for pos, id_ in enumerate(viewed):
        first_pos.setdefault(id_, pos)

    # Higher frequency first; among equal frequencies, earlier view first.
    ranked = sorted(
        first_pos,
        key=lambda id_: (-cnt.get(id_, 0), first_pos[id_]),
    )
    # Clamp k so a negative value cannot slice from the end of the list.
    return set(ranked[:max(k, 0)])