In [1]:
import numpy as np

# read train set and qualifying set
Xq = np.genfromtxt("data/data_qualifying_blanc.csv", delimiter=",", dtype=np.int)
Xt = np.genfromtxt("data/data_train.csv", delimiter=",", dtype=np.int)

# predict ratings
mean = np.mean(Xt[:, 2])
Xq_mean = np.append(Xq, np.full((Xq.shape[0], 1), mean), axis=1)

# save in right format
np.savetxt("qualifying_mean.csv", Xq_mean,
           delimiter=",", newline="\n", encoding="utf-8")

In [2]:
sparsematrix = np.zeros((max(Xt[:, 0]+1), max(Xt[:, 1])+1))
spcol = len(sparsematrix[0,:])
sprow = len(sparsematrix[:, 0])
print(spcol)
print(sprow)

2080
5499


In [3]:
for i in range(len(Xt[:, ])):
    user = Xt[i, 0]
    item = Xt[i, 1]
    rating = Xt[i, 2]
    sparsematrix[user, item] = rating+1

In [4]:
def euklidnorm(vec):
    tmp = 0
    for i in vec:
        tmp+=i**2
    return np.sqrt(tmp)

def cosim(user1, user2):
    return np.dot(user1, user2)/(euklidnorm(user1)*euklidnorm(user2))

def pearson(user1, user2):
    mu1 = np.mean(user1)
    mu2 = np.mean(user2)
    user11 = [user1[i] - mu1 for i in range(len(user1))]
    user21 = [user2[i] - mu2 for i in range(len(user2))]
    return cosim(user11, user21)


def desparse(user_item_matrix):
    copy = user_item_matrix
    deleted_rows = []
    deleted_columns = []
    i = 0 
    j = 0
    br0 = copy.shape[0]
    br1 = copy.shape[1]
    
    while i < br0 : 
        if all(copy[i]==0):
            copy = np.delete(copy, i, 0)
            br0-=1
            deleted_rows.append(i)
        else:
            
            i+=1
    print("halftime")
    while j < br1 :
        if all(copy[:,j]==0):
            copy = np.delete(copy, j, 1)
            br1-=1
            deleted_columns.append(j)
        else:
            
            j+=1
    return copy, deleted_rows, deleted_columns


def user_distance_matrix(user_item_matrix, sim):
    user_dist = np.zeros((user_item_matrix.shape[0], user_item_matrix.shape[0]))
    for i in range(len(user_item_matrix)):
        for j in range(i, len(user_item_matrix)):
            
            if i == j:
                user_dist[i][j] = 0
            else:
                user_dist[i][j] = sim(user_item_matrix[i], user_item_matrix[j])
    return user_dist



def fill_zeros_with_mean(user_item_matrix):
    copy = user_item_matrix
    for i in range(copy.shape[0]):
        for j in range(copy.shape[1]):
            if copy[i,j] == 0:
                copy[i,j] = mean
    return copy

def make_symmetric_matrix(half_matrix):
    matrix = np.transpose(half_matrix)
    for i in range(matrix.shape[0]):
        for j in range(i, matrix.shape[0]):
            matrix[i][j] = matrix[j][i]
    return matrix;
                
#knn returns list of indices of the user_item_matrix
def knn(user_index,  user_distance_matrix, k):
    nn = []
    for i in range(user_distance_matrix.shape[0]):
        if user_index != i:
            tmp = user_distance_matrix[user_index][i]
            
            if len(nn)<k:
                nn.append((i, tmp))
                
            else:
                if tmp > nn[-1][1]:
                    nn[-1] = (i, tmp)
            nn = sorted(nn, reverse = True,  key = lambda el: el[1])
    return [x[0] for x in nn]

#speicher die indices der knn zu jedem user
def matrix_of_knn(user_distance_matrix, k):
    matrix = []
    for i in range(user_distance_matrix.shape[0]):
        tmp = knn(i, user_distance_matrix, k)
        matrix.append(tmp)
    return matrix

#ersetzt indices durch user 
def index_to_user(list_of_indices, user_item_matrix):
    copy = []
    for index in list_of_indices:
        copy.append(user_item_matrix[index])
    return copy

#berechne die erwarteten ratings für einen user aufgrund der knn 
def predict_rating(user_index, list_of_indices, distance_matrix, user_item_matrix):
    copy = user_item_matrix
    user = user_item_matrix[user_index]
    list_of_users = np.array(index_to_user(list_of_indices, user_item_matrix))
    for i in range(len(user)):
        if user_item_matrix[user_index][i]==0:
            r_hat = np.mean(user)

            sum_of_rating = 0
            sum_of_sim = 0
            
            
            for j in range(len(list_of_users)):
                simuv = distance_matrix[user_index][j]
                
                sum_of_rating += simuv*(list_of_users[j][i] - np.mean(list_of_users[j]))
                sum_of_sim += np.abs(simuv)
            user[i] = r_hat + sum_of_rating/sum_of_sim
    return copy


def prediction_matrix(user_item_matrix, distance_matrix, matrix_of_knn_indices):
    copy = user_item_matrix
    for i in range(len(user_item_matrix)):
        copy = predict_rating(i, matrix_of_knn_indices[i], distance_matrix, user_item_matrix)
    return copy
        
        
def rebuild_user_item_matrix(user_item_matrix, deleted_rows, deleted_kolumns):
    print(len(deleted_rows))
    print(len(deleted_kolumns))
    copy = user_item_matrix
    for i in range(len(deleted_columns)-1, 0, -1):
        if deleted_columns[i] == copy.shape[1]:
            copy = np.append(copy, np.zeros((1, copy.shape[1])), axis = 1)
        elif  deleted_columns[i] < copy.shape[1]:
            copy = np.insert(copy, deleted_columns[i], 0, axis = 1)
        print(copy.shape)
    for j in range(len(deleted_rows)-1, 0, -1):
        if deleted_rows[j] == copy.shape[0]:
            copy = np.append(copy, np.zeros((1, copy.shape[0])), axis = 0)
        elif deleted_rows[j] < copy.shape[0]:
            copy = np.insert(copy, deleted_rows[i], 0, axis = 0)
        print(copy.shape)
    return copy
    
def get_predictions_for_test_data(user_item_matrix, test_data):
    tmp = []
    for i in range(len(test_data)):
        tmp.append(user_item_matrix[test_data[i][0]-1][test_data[i][1]-1])
           
    tmp = np.asarray(tmp)
    print(tmp.shape)
    
    test_data = np.append(test_data, np.reshape(tmp, (test_data.shape[0], 1)), axis = 1)
    print(test_data.shape)
    return(test_data)


In [5]:
user_item_matrix, deleted_rows, deleted_columns = desparse(sparsematrix)


halftime


In [6]:
#user_distances = user_distance_matrix(user_item_matrix, pearson)

In [7]:
item_user_matrix = np.transpose(user_item_matrix)

In [8]:
#item_distances = user_distance_matrix(item_user_matrix, pearson)

In [None]:

#user_distances = make_symmetric_matrix(user_distances)
#item_distances = make_symmetric_matrix(item_distances)
#np.savetxt("user_item_distance.csv", user_distances, delimiter=",")
#np.savetxt("item_user_distance.csv", item_distances, delimiter=",")


In [8]:
user_distances = np.genfromtxt("user_item_distance.csv", delimiter=",", dtype=np.float)

In [9]:
user_mknn = matrix_of_knn(user_distances, 10)

In [10]:
user_based_prediction = prediction_matrix(user_item_matrix, user_distances, user_mknn)

In [11]:
rebuild_uim = rebuild_user_item_matrix(user_based_prediction, deleted_rows, deleted_columns)
print(rebuild_uim.shape)

3834
886
(1665, 1195)
(1665, 1196)
(1665, 1197)
(1665, 1198)
(1665, 1199)
(1665, 1200)
(1665, 1201)
(1665, 1202)
(1665, 1203)
(1665, 1204)
(1665, 1205)
(1665, 1206)
(1665, 1207)
(1665, 1208)
(1665, 1209)
(1665, 1210)
(1665, 1211)
(1665, 1212)
(1665, 1213)
(1665, 1214)
(1665, 1215)
(1665, 1216)
(1665, 1217)
(1665, 1218)
(1665, 1219)
(1665, 1220)
(1665, 1221)
(1665, 1222)
(1665, 1223)
(1665, 1224)
(1665, 1225)
(1665, 1226)
(1665, 1227)
(1665, 1228)
(1665, 1229)
(1665, 1230)
(1665, 1231)
(1665, 1232)
(1665, 1233)
(1665, 1234)
(1665, 1235)
(1665, 1236)
(1665, 1237)
(1665, 1238)
(1665, 1239)
(1665, 1240)
(1665, 1241)
(1665, 1242)
(1665, 1243)
(1665, 1244)
(1665, 1245)
(1665, 1246)
(1665, 1247)
(1665, 1248)
(1665, 1249)
(1665, 1250)
(1665, 1251)
(1665, 1252)
(1665, 1253)
(1665, 1254)
(1665, 1255)
(1665, 1256)
(1665, 1257)
(1665, 1258)
(1665, 1259)
(1665, 1260)
(1665, 1261)
(1665, 1262)
(1665, 1263)
(1665, 1264)
(1665, 1265)
(1665, 1266)
(1665, 1267)
(1665, 1268)
(1665, 1269)
(1665, 1270)
(16

(1665, 1833)
(1665, 1834)
(1665, 1835)
(1665, 1836)
(1665, 1837)
(1665, 1838)
(1665, 1839)
(1665, 1840)
(1665, 1841)
(1665, 1842)
(1665, 1843)
(1665, 1844)
(1665, 1845)
(1665, 1846)
(1665, 1847)
(1665, 1848)
(1665, 1849)
(1665, 1850)
(1665, 1851)
(1665, 1852)
(1665, 1853)
(1665, 1854)
(1665, 1855)
(1665, 1856)
(1665, 1857)
(1665, 1858)
(1665, 1859)
(1665, 1860)
(1665, 1861)
(1665, 1862)
(1665, 1863)
(1665, 1864)
(1665, 1865)
(1665, 1866)
(1665, 1867)
(1665, 1868)
(1665, 1869)
(1665, 1870)
(1665, 1871)
(1665, 1872)
(1665, 1873)
(1665, 1874)
(1665, 1875)
(1665, 1876)
(1665, 1877)
(1665, 1878)
(1665, 1879)
(1665, 1880)
(1665, 1881)
(1665, 1882)
(1665, 1883)
(1665, 1884)
(1665, 1885)
(1665, 1886)
(1665, 1887)
(1665, 1888)
(1665, 1889)
(1665, 1890)
(1665, 1891)
(1665, 1892)
(1665, 1893)
(1665, 1894)
(1665, 1895)
(1665, 1896)
(1665, 1897)
(1665, 1898)
(1665, 1899)
(1665, 1900)
(1665, 1901)
(1665, 1902)
(1665, 1903)
(1665, 1904)
(1665, 1905)
(1665, 1906)
(1665, 1907)
(1665, 1908)
(1665, 1909)

(2068, 2079)
(2069, 2079)
(2070, 2079)
(2071, 2079)
(2072, 2079)
(2073, 2079)
(2074, 2079)
(2075, 2079)
(2076, 2079)
(2077, 2079)
(2078, 2079)
(2079, 2079)
(2080, 2079)
(2081, 2079)
(2082, 2079)
(2083, 2079)
(2084, 2079)
(2085, 2079)
(2086, 2079)
(2087, 2079)
(2088, 2079)
(2089, 2079)
(2090, 2079)
(2091, 2079)
(2092, 2079)
(2093, 2079)
(2094, 2079)
(2095, 2079)
(2096, 2079)
(2097, 2079)
(2098, 2079)
(2099, 2079)
(2100, 2079)
(2101, 2079)
(2102, 2079)
(2103, 2079)
(2104, 2079)
(2105, 2079)
(2106, 2079)
(2107, 2079)
(2108, 2079)
(2109, 2079)
(2110, 2079)
(2111, 2079)
(2112, 2079)
(2113, 2079)
(2114, 2079)
(2115, 2079)
(2116, 2079)
(2117, 2079)
(2118, 2079)
(2119, 2079)
(2120, 2079)
(2121, 2079)
(2122, 2079)
(2123, 2079)
(2124, 2079)
(2125, 2079)
(2126, 2079)
(2127, 2079)
(2128, 2079)
(2129, 2079)
(2130, 2079)
(2131, 2079)
(2132, 2079)
(2133, 2079)
(2134, 2079)
(2135, 2079)
(2136, 2079)
(2137, 2079)
(2138, 2079)
(2139, 2079)
(2140, 2079)
(2141, 2079)
(2142, 2079)
(2143, 2079)
(2144, 2079)

(2701, 2079)
(2702, 2079)
(2703, 2079)
(2704, 2079)
(2705, 2079)
(2706, 2079)
(2707, 2079)
(2708, 2079)
(2709, 2079)
(2710, 2079)
(2711, 2079)
(2712, 2079)
(2713, 2079)
(2714, 2079)
(2715, 2079)
(2716, 2079)
(2717, 2079)
(2718, 2079)
(2719, 2079)
(2720, 2079)
(2721, 2079)
(2722, 2079)
(2723, 2079)
(2724, 2079)
(2725, 2079)
(2726, 2079)
(2727, 2079)
(2728, 2079)
(2729, 2079)
(2730, 2079)
(2731, 2079)
(2732, 2079)
(2733, 2079)
(2734, 2079)
(2735, 2079)
(2736, 2079)
(2737, 2079)
(2738, 2079)
(2739, 2079)
(2740, 2079)
(2741, 2079)
(2742, 2079)
(2743, 2079)
(2744, 2079)
(2745, 2079)
(2746, 2079)
(2747, 2079)
(2748, 2079)
(2749, 2079)
(2750, 2079)
(2751, 2079)
(2752, 2079)
(2753, 2079)
(2754, 2079)
(2755, 2079)
(2756, 2079)
(2757, 2079)
(2758, 2079)
(2759, 2079)
(2760, 2079)
(2761, 2079)
(2762, 2079)
(2763, 2079)
(2764, 2079)
(2765, 2079)
(2766, 2079)
(2767, 2079)
(2768, 2079)
(2769, 2079)
(2770, 2079)
(2771, 2079)
(2772, 2079)
(2773, 2079)
(2774, 2079)
(2775, 2079)
(2776, 2079)
(2777, 2079)

(3335, 2079)
(3336, 2079)
(3337, 2079)
(3338, 2079)
(3339, 2079)
(3340, 2079)
(3341, 2079)
(3342, 2079)
(3343, 2079)
(3344, 2079)
(3345, 2079)
(3346, 2079)
(3347, 2079)
(3348, 2079)
(3349, 2079)
(3350, 2079)
(3351, 2079)
(3352, 2079)
(3353, 2079)
(3354, 2079)
(3355, 2079)
(3356, 2079)
(3357, 2079)
(3358, 2079)
(3359, 2079)
(3360, 2079)
(3361, 2079)
(3362, 2079)
(3363, 2079)
(3364, 2079)
(3365, 2079)
(3366, 2079)
(3367, 2079)
(3368, 2079)
(3369, 2079)
(3370, 2079)
(3371, 2079)
(3372, 2079)
(3373, 2079)
(3374, 2079)
(3375, 2079)
(3376, 2079)
(3377, 2079)
(3378, 2079)
(3379, 2079)
(3380, 2079)
(3381, 2079)
(3382, 2079)
(3383, 2079)
(3384, 2079)
(3385, 2079)
(3386, 2079)
(3387, 2079)
(3388, 2079)
(3389, 2079)
(3390, 2079)
(3391, 2079)
(3392, 2079)
(3393, 2079)
(3394, 2079)
(3395, 2079)
(3396, 2079)
(3397, 2079)
(3398, 2079)
(3399, 2079)
(3400, 2079)
(3401, 2079)
(3402, 2079)
(3403, 2079)
(3404, 2079)
(3405, 2079)
(3406, 2079)
(3407, 2079)
(3408, 2079)
(3409, 2079)
(3410, 2079)
(3411, 2079)

(3967, 2079)
(3968, 2079)
(3969, 2079)
(3970, 2079)
(3971, 2079)
(3972, 2079)
(3973, 2079)
(3974, 2079)
(3975, 2079)
(3976, 2079)
(3977, 2079)
(3978, 2079)
(3979, 2079)
(3980, 2079)
(3981, 2079)
(3982, 2079)
(3983, 2079)
(3984, 2079)
(3985, 2079)
(3986, 2079)
(3987, 2079)
(3988, 2079)
(3989, 2079)
(3990, 2079)
(3991, 2079)
(3992, 2079)
(3993, 2079)
(3994, 2079)
(3995, 2079)
(3996, 2079)
(3997, 2079)
(3998, 2079)
(3999, 2079)
(4000, 2079)
(4001, 2079)
(4002, 2079)
(4003, 2079)
(4004, 2079)
(4005, 2079)
(4006, 2079)
(4007, 2079)
(4008, 2079)
(4009, 2079)
(4010, 2079)
(4011, 2079)
(4012, 2079)
(4013, 2079)
(4014, 2079)
(4015, 2079)
(4016, 2079)
(4017, 2079)
(4018, 2079)
(4019, 2079)
(4020, 2079)
(4021, 2079)
(4022, 2079)
(4023, 2079)
(4024, 2079)
(4025, 2079)
(4026, 2079)
(4027, 2079)
(4028, 2079)
(4029, 2079)
(4030, 2079)
(4031, 2079)
(4032, 2079)
(4033, 2079)
(4034, 2079)
(4035, 2079)
(4036, 2079)
(4037, 2079)
(4038, 2079)
(4039, 2079)
(4040, 2079)
(4041, 2079)
(4042, 2079)
(4043, 2079)

(4601, 2079)
(4602, 2079)
(4603, 2079)
(4604, 2079)
(4605, 2079)
(4606, 2079)
(4607, 2079)
(4608, 2079)
(4609, 2079)
(4610, 2079)
(4611, 2079)
(4612, 2079)
(4613, 2079)
(4614, 2079)
(4615, 2079)
(4616, 2079)
(4617, 2079)
(4618, 2079)
(4619, 2079)
(4620, 2079)
(4621, 2079)
(4622, 2079)
(4623, 2079)
(4624, 2079)
(4625, 2079)
(4626, 2079)
(4627, 2079)
(4628, 2079)
(4629, 2079)
(4630, 2079)
(4631, 2079)
(4632, 2079)
(4633, 2079)
(4634, 2079)
(4635, 2079)
(4636, 2079)
(4637, 2079)
(4638, 2079)
(4639, 2079)
(4640, 2079)
(4641, 2079)
(4642, 2079)
(4643, 2079)
(4644, 2079)
(4645, 2079)
(4646, 2079)
(4647, 2079)
(4648, 2079)
(4649, 2079)
(4650, 2079)
(4651, 2079)
(4652, 2079)
(4653, 2079)
(4654, 2079)
(4655, 2079)
(4656, 2079)
(4657, 2079)
(4658, 2079)
(4659, 2079)
(4660, 2079)
(4661, 2079)
(4662, 2079)
(4663, 2079)
(4664, 2079)
(4665, 2079)
(4666, 2079)
(4667, 2079)
(4668, 2079)
(4669, 2079)
(4670, 2079)
(4671, 2079)
(4672, 2079)
(4673, 2079)
(4674, 2079)
(4675, 2079)
(4676, 2079)
(4677, 2079)

(5236, 2079)
(5237, 2079)
(5238, 2079)
(5239, 2079)
(5240, 2079)
(5241, 2079)
(5242, 2079)
(5243, 2079)
(5244, 2079)
(5245, 2079)
(5246, 2079)
(5247, 2079)
(5248, 2079)
(5249, 2079)
(5250, 2079)
(5251, 2079)
(5252, 2079)
(5253, 2079)
(5254, 2079)
(5255, 2079)
(5256, 2079)
(5257, 2079)
(5258, 2079)
(5259, 2079)
(5260, 2079)
(5261, 2079)
(5262, 2079)
(5263, 2079)
(5264, 2079)
(5265, 2079)
(5266, 2079)
(5267, 2079)
(5268, 2079)
(5269, 2079)
(5270, 2079)
(5271, 2079)
(5272, 2079)
(5273, 2079)
(5274, 2079)
(5275, 2079)
(5276, 2079)
(5277, 2079)
(5278, 2079)
(5279, 2079)
(5280, 2079)
(5281, 2079)
(5282, 2079)
(5283, 2079)
(5284, 2079)
(5285, 2079)
(5286, 2079)
(5287, 2079)
(5288, 2079)
(5289, 2079)
(5290, 2079)
(5291, 2079)
(5292, 2079)
(5293, 2079)
(5294, 2079)
(5295, 2079)
(5296, 2079)
(5297, 2079)
(5298, 2079)
(5299, 2079)
(5300, 2079)
(5301, 2079)
(5302, 2079)
(5303, 2079)
(5304, 2079)
(5305, 2079)
(5306, 2079)
(5307, 2079)
(5308, 2079)
(5309, 2079)
(5310, 2079)
(5311, 2079)
(5312, 2079)

In [12]:
filled_rebuild = fill_zeros_with_mean(rebuild_uim)

In [13]:
Xq = get_predictions_for_test_data(filled_rebuild, Xq)
print(Xq.shape)

(108660,)
(108660, 3)
(108660, 3)


In [14]:
np.savetxt("qualifying_mean.csv", Xq, delimiter=",", newline="\n", encoding="utf-8")