In [1]:
import pandas as pd
import statistics 
import numpy as np

In [2]:
def build_str(grouped_user):
    df_users = {}

    for i,j in zip(grouped_user.index.tolist(), grouped_user.values.tolist()):
        df_users[i] = len(j) 
        
    return df_users

In [3]:
def get_values(df_users):
    maximum = max(df_users, key=df_users.get)
    minimum = min(df_users, key=df_users.get)
    max_items_per_user = df_users[maximum]
    min_items_per_user = df_users[minimum]
    avg_items_per_user = statistics.mean(list(df_users.values()))
    median_items_per_user = statistics.median(list(df_users.values()))
    
    return max_items_per_user, min_items_per_user, round(avg_items_per_user,3), median_items_per_user

In [4]:
def lenght_list_value(lista, value):
    return len([x for x in lista if x >= value])

In [5]:
def atleast_ratings(df_users):
    values = list(df_users.values())
    
    return lenght_list_value(values, 10), lenght_list_value(values, 50), lenght_list_value(values, 100)

In [6]:
ml_original = pd.read_csv('u.csv')

training = pd.read_csv('trainset.csv')
training.columns = ['user', 'item', 'rating', 'timestamp']

evaluation = pd.read_csv('testset.csv')
evaluation.columns = ['user', 'item', 'rating', 'timestamp']

evaluation_estrat = pd.read_csv('testset_stratified.csv')
evaluation_estrat.columns = ['user', 'item', 'rating', 'timestamp']

ml_imdb = pd.merge(training, evaluation, how='outer')

datasets = [ml_original, ml_imdb, training, evaluation, evaluation_estrat]

In [7]:
tr_4 = training[training['rating']>=4]
tr_4

Unnamed: 0,user,item,rating,timestamp
0,1,1,4.0,964982703
1,1,47,5.0,964983815
2,1,50,5.0,964982931
4,1,101,5.0,964980868
5,1,110,4.0,964982176
6,1,151,5.0,964984041
7,1,163,5.0,964983650
8,1,231,5.0,964981179
9,1,235,4.0,964980908
10,1,260,5.0,964981680


In [8]:
for d in datasets:
    print(d)

       user  item  rating  timestamp
0       196   242       3  881250949
1       186   302       3  891717742
2        22   377       1  878887116
3       244    51       2  880606923
4       166   346       1  886397596
5       298   474       4  884182806
6       115   265       2  881171488
7       253   465       5  891628467
8       305   451       3  886324817
9         6    86       3  883603013
10       62   257       2  879372434
11      286  1014       5  879781125
12      200   222       5  876042340
13      210    40       3  891035994
14      224    29       3  888104457
15      303   785       3  879485318
16      122   387       5  879270459
17      194   274       2  879539794
18      291  1042       4  874834944
19      234  1184       2  892079237
20      119   392       4  886176814
21      167   486       4  892738452
22      299   144       4  877881320
23      291   118       2  874833878
24      308     1       4  887736532
25       95   546       2  879196566
2

In [9]:
results = list()
for d in datasets:
    
    items = d.item.unique()
    users = d.user.unique()
    
    num_ratings = len(d)
    num_items = len(items)
    num_users = len(users)
    density = round(num_ratings/(num_items*num_users),3)
    
    print('--------------------------')
    print('Ratings: ' + str(num_ratings))
    print('Items: ' + str(num_items))
    print('Users: ' + str(num_users))
    print('Density: ' + str(density))
    
    
    ######### ITEMS 
    
    grouped_user = d.groupby('item')['user'].apply(list)
    
    # diccionario que va a contener como key el item, como value, el numero de users con los que ha interactuado
    df_users = build_str(grouped_user)
        
    max_items_per_item, min_items_per_item, avg_items_per_item, median_items_per_item = get_values(df_users)
    
    print('Maximum # ratings per item: ' + str(max_items_per_item))
    print('Median # ratings per item: ' + str(median_items_per_item))
    print('Average # ratings per item: ' + str(avg_items_per_item))
    print('Minimum # ratings per item: ' + str(min_items_per_item))
    
    tenratings_item, fiftyratings_item, ohratings_item = atleast_ratings(df_users)
    
    print('# Items with at least 10 ratings: ' + str(tenratings_item))
    print('# Items with at least 50 ratings: ' + str(fiftyratings_item))
    print('# Items with at least 100 ratings: ' + str(ohratings_item))
    
    ###### USERS
    
    grouped_user = d.groupby('user')['item'].apply(list)
    
    # diccionario que va a contener como key el user, como value, el numero de items con los que ha interactuado
    df_users = build_str(grouped_user)
        
    max_items_per_user, min_items_per_user, avg_items_per_user, median_items_per_user = get_values(df_users)
    
    print('Maximum # ratings per user: ' + str(max_items_per_user))
    print('Median # ratings per user: ' + str(median_items_per_user))
    print('Average # ratings per user: ' + str(avg_items_per_user))
    print('Minimum # ratings per user: ' + str(min_items_per_user))
    
    
    tenratings_user, fiftyratings_user, ohratings_user = atleast_ratings(df_users)
    
    print('# Users with at least 10 ratings: ' + str(tenratings_user))
    print('# Users with at least 50 ratings: ' + str(fiftyratings_user))
    print('# Users with at least 100 ratings: ' + str(ohratings_user))
    
    
    ########### RATINGS
    d_mayorigual_4 = round(len(d[d['rating']>=4])*100 / num_ratings,3)
    d_menor_4 = round(len(d[d['rating']<4])*100 / num_ratings,3)
    
    print('% Ratings with value greater than or equal to 4: ' + str(d_mayorigual_4))
    print('% Ratings with value lower than 4: ' + str(d_menor_4))
    
    results.append([num_ratings,num_items,num_users,density,max_items_per_item,median_items_per_item,avg_items_per_item,min_items_per_item,tenratings_item,fiftyratings_item,ohratings_item,max_items_per_user,median_items_per_user,avg_items_per_user,min_items_per_user,tenratings_user,fiftyratings_user,ohratings_user,d_mayorigual_4,d_menor_4])

--------------------------
Ratings: 100000
Items: 1682
Users: 943
Density: 0.063
Maximum # ratings per item: 583
Median # ratings per item: 27.0
Average # ratings per item: 59.453
Minimum # ratings per item: 1
# Items with at least 10 ratings: 1152
# Items with at least 50 ratings: 603
# Items with at least 100 ratings: 338
Maximum # ratings per user: 737
Median # ratings per user: 65
Average # ratings per user: 106.045
Minimum # ratings per user: 20
# Users with at least 10 ratings: 943
# Users with at least 50 ratings: 568
# Users with at least 100 ratings: 364
% Ratings with value greater than or equal to 4: 55.375
% Ratings with value lower than 4: 44.625
--------------------------
Ratings: 11477
Items: 164
Users: 587
Density: 0.119
Maximum # ratings per item: 329
Median # ratings per item: 43.5
Average # ratings per item: 69.982
Minimum # ratings per item: 1
# Items with at least 10 ratings: 146
# Items with at least 50 ratings: 78
# Items with at least 100 ratings: 45
Maximum # r

In [10]:
print(results)

[[100000, 1682, 943, 0.063, 583, 27.0, 59.453, 1, 1152, 603, 338, 737, 65, 106.045, 20, 943, 568, 364, 55.375, 44.625], [11477, 164, 587, 0.119, 329, 43.5, 69.982, 1, 146, 78, 45, 128, 12, 19.552, 1, 325, 58, 3, 52.54, 47.46], [10330, 164, 584, 0.108, 305, 39.0, 62.988, 1, 143, 73, 34, 113, 11.0, 17.688, 1, 316, 39, 3, 52.662, 47.338], [1147, 145, 394, 0.02, 30, 5, 7.91, 1, 45, 0, 0, 15, 2.0, 2.911, 1, 8, 0, 0, 51.439, 48.561], [280, 109, 134, 0.019, 10, 2, 2.569, 1, 1, 0, 0, 11, 1.0, 2.09, 1, 2, 0, 0, 37.5, 62.5]]


In [11]:
traspose = [list(i) for i in zip(*results)]
traspose

[[100000, 11477, 10330, 1147, 280],
 [1682, 164, 164, 145, 109],
 [943, 587, 584, 394, 134],
 [0.063, 0.119, 0.108, 0.02, 0.019],
 [583, 329, 305, 30, 10],
 [27.0, 43.5, 39.0, 5, 2],
 [59.453, 69.982, 62.988, 7.91, 2.569],
 [1, 1, 1, 1, 1],
 [1152, 146, 143, 45, 1],
 [603, 78, 73, 0, 0],
 [338, 45, 34, 0, 0],
 [737, 128, 113, 15, 11],
 [65, 12, 11.0, 2.0, 1.0],
 [106.045, 19.552, 17.688, 2.911, 2.09],
 [20, 1, 1, 1, 1],
 [943, 325, 316, 8, 2],
 [568, 58, 39, 0, 0],
 [364, 3, 3, 0, 0],
 [55.375, 52.54, 52.662, 51.439, 37.5],
 [44.625, 47.46, 47.338, 48.561, 62.5]]

In [12]:
row_val = ['Ratings', 'Items', 'Users', 'Density', 'Maximum # ratings per item',  'Median # ratings per item', 'Average # ratings per item', 'Minimum # ratings per item', '# Items with at least 10 ratings', '# Items with at least 50 ratings', '# Items with at least 100 ratings', 'Maximum # ratings per user', 'Median # ratings per user', 'Average # ratings per user', 'Minimum # ratings per user', '# Users with at least 10 ratings', '# Users with at least 50 ratings', '# Users with at least 100 ratings', '% Ratings with value greater than or equal to 4', '% Ratings with value lower than 4']
col_val = ['MovieLens 100K', 'MovieLens and IMDB', 'Training set', 'Original Evaluation set', 'Stratified Evaluation set']

In [13]:
df_results = pd.DataFrame(np.array(traspose), columns=col_val)
df_results

Unnamed: 0,MovieLens 100K,MovieLens and IMDB,Training set,Original Evaluation set,Stratified Evaluation set
0,100000.0,11477.0,10330.0,1147.0,280.0
1,1682.0,164.0,164.0,145.0,109.0
2,943.0,587.0,584.0,394.0,134.0
3,0.063,0.119,0.108,0.02,0.019
4,583.0,329.0,305.0,30.0,10.0
5,27.0,43.5,39.0,5.0,2.0
6,59.453,69.982,62.988,7.91,2.569
7,1.0,1.0,1.0,1.0,1.0
8,1152.0,146.0,143.0,45.0,1.0
9,603.0,78.0,73.0,0.0,0.0


In [15]:
df_results.to_csv('results_analysis.csv', index=False)