In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

from utils import get_dfs, concat_output_filename, load_head_items
from prep_organized_boycotts import (
    group_by_age, group_by_gender, group_by_genre,
    group_by_occupation, group_by_power, group_by_state, group_by_genre_strict
)

In [2]:
# Load the dataset tables. `dfs` is looked up by table name further down
# ('users', 'ratings' and 'movies').
DATASET_NAME = 'ml-1m'
dfs = get_dfs(DATASET_NAME)

Path to ratings file is: /Users/nick/.surprise_data/ml-1m/ml-1m/ratings.dat
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip...
Done! Dataset ml-1m has been saved to /Users/nick/.surprise_data/ml-1m


In [3]:
# Pull out the two tables used throughout the notebook and preview each one.
users_df, ratings_df = dfs['users'], dfs['ratings']
for frame in (users_df, ratings_df):
    print(frame.head())

# Number of distinct users, then distinct movies, appearing in the ratings table.
for id_column in ('user_id', 'movie_id'):
    print(ratings_df[id_column].nunique())

   user_id gender  age  occupation zip_code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455
   user_id  movie_id  rating  unix_timestamp
0        1      1193       5       978300760
1        1       661       3       978302109
2        1       914       3       978301968
3        1      3408       4       978300275
4        1      2355       5       978824291
6040
3706


# Warning:
The following cell is very slow, so it saves its outputs to `some_distances.json` and `group_to_num_ratings.json`.

You may therefore wish to skip running this cell unless something major has changed.

In [4]:
from collections import defaultdict
from scipy.spatial.distance import cosine

# Assemble every boycott group whose centroid we want to compare. Each entry is
# a dict with a 'name' and a 'df' holding the users of that group; the final
# 'all' entry covers every user.
groups = group_by_gender(users_df) + group_by_genre(users_df, ratings_df, dfs['movies'], 'ml-1m')
groups += group_by_age(users_df) + group_by_occupation(users_df)
groups += group_by_power(users_df, ratings_df, 'ml-1m')
groups += [{'name': 'all', 'df': users_df}]

# Fixed movie ordering: position i of every vector refers to movie_set[i].
movie_set = sorted(set(ratings_df.movie_id))
movie_to_col = {movie_id: col for col, movie_id in enumerate(movie_set)}
num_movies = len(movie_set)

# Value stored in the explicit vector for movies a user never rated.
# NOTE(review): presumably the midpoint of a 1-5 rating scale -- confirm.
UNRATED_RATING = 3

# Split the ratings table by user once, instead of re-scanning the whole table
# for every user of every group.
ratings_by_user = {user_id: rows for user_id, rows in ratings_df.groupby('user_id')}

# user_id -> (implicit_vec, explicit_vec). A user usually belongs to several
# groups, so each user's vectors are built once and then shared between groups.
# The arrays are shared, not copied: do not modify them in place.
user_to_vec = {}
group_to_implicit_vecs = defaultdict(list)
group_to_explicit_vecs = defaultdict(list)

group_to_num_ratings = defaultdict(int)

for group in groups:
    group_name = group['name']
    print(group_name)

    # All user ids in this boycott group (e.g. all Comedy fans or all users of a certain age range)
    for user_id in set(group['df'].user_id):
        if user_id not in user_to_vec:
            # implicit: 1 where the user rated the movie, 0 elsewhere
            # explicit: the rating where one exists, UNRATED_RATING elsewhere
            implicit_vec = np.zeros(num_movies, dtype=int)
            explicit_vec = np.full(num_movies, UNRATED_RATING, dtype=int)
            user_ratings = ratings_by_user.get(user_id)
            if user_ratings is not None:  # users without any rating keep the defaults
                cols = user_ratings.movie_id.map(movie_to_col).to_numpy()
                implicit_vec[cols] = 1
                explicit_vec[cols] = user_ratings.rating.to_numpy().astype(int)
            user_to_vec[user_id] = (implicit_vec, explicit_vec)

        implicit_vec, explicit_vec = user_to_vec[user_id]
        group_to_implicit_vecs[group_name].append(implicit_vec)
        group_to_explicit_vecs[group_name].append(explicit_vec)
        # Count of distinct movies the user rated; cast to a plain int so the
        # totals stay JSON-serializable.
        group_to_num_ratings[group_name] += int(implicit_vec.sum())

# Average the per-user vectors of each group into one centroid per vector type.
# Layout: {group name: {'implicit': centroid, 'explicit': centroid}}
group_to_type_to_centroid = defaultdict(dict)
vecs_by_type = (
    ('implicit', group_to_implicit_vecs),
    ('explicit', group_to_explicit_vecs),
)
for vec_type, vecs_by_group in vecs_by_type:
    for group_name in vecs_by_group:
        member_vecs = vecs_by_group[group_name]
        group_to_type_to_centroid[group_name][vec_type] = np.mean(member_vecs, axis=0)

# Pairwise distances between the centroids of every ordered pair of distinct
# groups, nested as group -> other group -> vector type -> distance type:
#
# {
#     comedy:
#         western:
#             explicit:
#                 cosine: 0.5
#                 euclidean: 0.4
#             implicit:
#                 cosine: 0.3
#                 euclidean: 0.2
#         drama: ...
#     western:
#         ...
# }
group_to_group_to_vectype_to_distancetype_to_ = defaultdict(
    lambda: defaultdict(lambda: defaultdict(dict))
)
for group, type_to_centroid in group_to_type_to_centroid.items():
    for type_, centroid in type_to_centroid.items():
        for group2, type_to_centroid2 in group_to_type_to_centroid.items():
            # Skip self-pairs and any other group lacking a centroid of this type.
            if group2 == group or type_ not in type_to_centroid2:
                continue
            centroid2 = type_to_centroid2[type_]
            distances = group_to_group_to_vectype_to_distancetype_to_[group][group2][type_]
            distances['euclidean'] = np.linalg.norm(centroid - centroid2)
            distances['cosine'] = cosine(centroid, centroid2)
        
def default_to_regular(d):
    """Recursively turn nested defaultdicts into plain dicts.

    Any value that is not a defaultdict (including a plain dict) is returned
    unchanged and is not descended into.
    """
    if not isinstance(d, defaultdict):
        return d
    plain = {}
    for key, val in d.items():
        plain[key] = default_to_regular(val)
    return plain

import json

# Replace the nested defaultdicts with plain dicts, then show the result.
group_to_group_to_vectype_to_distancetype_to_ = default_to_regular(
    group_to_group_to_vectype_to_distancetype_to_
)
pprint(group_to_group_to_vectype_to_distancetype_to_)

# Persist both results so this slow cell does not have to be re-run.
outputs = (
    ('some_distances.json', group_to_group_to_vectype_to_distancetype_to_),
    ('group_to_num_ratings.json', dict(group_to_num_ratings)),
)
for out_path, payload in outputs:
    with open(out_path, 'w') as f:
        json.dump(payload, f)

male users excluded
female users excluded
Fans of Drama excluded using threshold 4
Fans of Animation excluded using threshold 4
Fans of Children's excluded using threshold 4
Fans of Musical excluded using threshold 4
Fans of Comedy excluded using threshold 4
Fans of Sci-Fi excluded using threshold 4
Fans of War excluded using threshold 4
Fans of Thriller excluded using threshold 4
Fans of Action excluded using threshold 4
Fans of Adventure excluded using threshold 4
Fans of Crime excluded using threshold 4
Fans of Romance excluded using threshold 4
Fans of Film-Noir excluded using threshold 4
Fans of Fantasy excluded using threshold 4
Fans of Mystery excluded using threshold 4
Fans of Horror excluded using threshold 4
Fans of Documentary excluded using threshold 4
Fans of Western excluded using threshold 4
Under 18 excluded
18-24 excluded
25-34 excluded
35-44 excluded
45-49 excluded
50-55 excluded
56+ excluded
other excluded
academic/educator excluded
artist excluded
clerical/admin exc

                                                                                   'euclidean': 2.975434474716342}},
                    'Fans of Comedy excluded using threshold 4': {'explicit': {'cosine': 3.1874399733289316e-05,
                                                                               'euclidean': 1.6616385339978559},
                                                                  'implicit': {'cosine': 0.025908253727351505,
                                                                               'euclidean': 1.1817976328347493}},
                    'Fans of Crime excluded using threshold 4': {'explicit': {'cosine': 0.00025956957230566235,
                                                                              'euclidean': 4.901353140538247},
                                                                 'implicit': {'cosine': 0.020531761350970013,
                                                                              'euclidean': 2.763151

                                                                                'euclidean': 3.3461530342407095},
                                                                   'implicit': {'cosine': 0.024682563497533838,
                                                                                'euclidean': 1.5930610553807405}},
                    'Fans of Sci-Fi excluded using threshold 4': {'explicit': {'cosine': 0.0001822210556949866,
                                                                               'euclidean': 3.7910942145192372},
                                                                  'implicit': {'cosine': 0.058629714099721064,
                                                                               'euclidean': 1.8281132161643636}},
                    'Fans of Thriller excluded using threshold 4': {'explicit': {'cosine': 9.65913841494137e-05,
                                                                                 'euclidean': 2.

                                       'implicit': {'cosine': 0.07123098555936003,
                                                    'euclidean': 1.8818882699035453}},
                    'programmer excluded': {'explicit': {'cosine': 8.574112193993066e-05,
                                                         'euclidean': 2.4168029248907716},
                                            'implicit': {'cosine': 0.09350473788369573,
                                                         'euclidean': 2.023211427915048}},
                    'retired excluded': {'explicit': {'cosine': 3.807848862757979e-05,
                                                      'euclidean': 1.679088833935848},
                                         'implicit': {'cosine': 0.05091056289970186,
                                                      'euclidean': 1.6347825882598854}},
                    'sales/marketing excluded': {'explicit': {'cosine': 7.070448775448046e-05,
                           

                                                                                                'euclidean': 6.461063775643736}},
                                      'Fans of Documentary excluded using threshold 4': {'explicit': {'cosine': 0.0018775337180522156,
                                                                                                      'euclidean': 13.031500228952913},
                                                                                         'implicit': {'cosine': 0.33864993716406155,
                                                                                                      'euclidean': 11.90424417661425}},
                                      'Fans of Drama excluded using threshold 4': {'explicit': {'cosine': 0.0002448272320337841,
                                                                                                'euclidean': 4.578234114977914},
                                                                        

                                               'doctor/health care excluded': {'explicit': {'cosine': 7.867430596397718e-05,
                                                                                            'euclidean': 2.4226617655094125},
                                                                               'implicit': {'cosine': 0.048095144803919854,
                                                                                            'euclidean': 1.3902560113733797}},
                                               'executive/managerial excluded': {'explicit': {'cosine': 5.404022940458386e-05,
                                                                                              'euclidean': 2.1177444273433523},
                                                                                 'implicit': {'cosine': 0.02647099515460305,
                                                                                              'euclidean': 1.049381488

                                                  '50-55 excluded': {'explicit': {'cosine': 0.000537168567971924,
                                                                                  'euclidean': 6.834154540401077},
                                                                     'implicit': {'cosine': 0.09355849534215521,
                                                                                  'euclidean': 5.405747646981549}},
                                                  '56+ excluded': {'explicit': {'cosine': 0.0006853555729297023,
                                                                                'euclidean': 7.732430677889434},
                                                                   'implicit': {'cosine': 0.14889513523013143,
                                                                                'euclidean': 6.474726761484839}},
                                                  'Bottom 10% contributors excluded': {'exp

                                                                                                  'euclidean': 5.088712899627757},
                                                                                     'implicit': {'cosine': 0.05390197601477842,
                                                                                                  'euclidean': 8.696444568101082}},
                                                   'Under 18 excluded': {'explicit': {'cosine': 0.00044110486386017556,
                                                                                      'euclidean': 6.36299477755979},
                                                                         'implicit': {'cosine': 0.12643091799713901,
                                                                                      'euclidean': 4.393739267855904}},
                                                   'academic/educator excluded': {'explicit': {'cosine': 0.00031839972363523206,
    

                                                                                'implicit': {'cosine': 0.05422934787949041,
                                                                                             'euclidean': 1.487787456328161}},
                                               'tradesman/craftsman excluded': {'explicit': {'cosine': 0.00010839401252860092,
                                                                                             'euclidean': 2.8812172578119846},
                                                                                'implicit': {'cosine': 0.11654689175420718,
                                                                                             'euclidean': 2.3017648231068533}},
                                               'unemployed excluded': {'explicit': {'cosine': 9.659115081150915e-05,
                                                                                    'euclidean': 2.765978457353779},
         

                                                                                                   'implicit': {'cosine': 0.09148476873256772,
                                                                                                                'euclidean': 5.951216612356249}},
                                                    'Fans of Mystery excluded using threshold 4': {'explicit': {'cosine': 0.0005591089155022599,
                                                                                                                'euclidean': 6.415252162369474},
                                                                                                   'implicit': {'cosine': 0.059695119388852635,
                                                                                                                'euclidean': 5.124539544487662}},
                                                    'Fans of Romance excluded using threshold 4': {'explicit': {'cosine': 0.0009164

                                              'male users excluded': {'explicit': {'cosine': 1.8219353549842054e-05,
                                                                                   'euclidean': 1.320507161973423},
                                                                      'implicit': {'cosine': 0.019948133371286092,
                                                                                   'euclidean': 1.1560088120032033}},
                                              'other excluded': {'explicit': {'cosine': 2.679691314078081e-05,
                                                                              'euclidean': 1.532090028822918},
                                                                 'implicit': {'cosine': 0.029590702255481327,
                                                                              'euclidean': 1.2919558910801152}},
                                              'programmer excluded': {'explicit': {'cosin

                                                  "Fans of Children's excluded using threshold 4": {'explicit': {'cosine': 0.0006501486346445207,
                                                                                                                 'euclidean': 6.821766712620101},
                                                                                                    'implicit': {'cosine': 0.07758293606012967,
                                                                                                                 'euclidean': 5.159895124783064}},
                                                  'Fans of Comedy excluded using threshold 4': {'explicit': {'cosine': 0.0007192270372171627,
                                                                                                             'euclidean': 7.588391107152577},
                                                                                                'implicit': {'cosine': 0.080498294294

                                                                                              'euclidean': 6.020377594017013},
                                                                                 'implicit': {'cosine': 0.08894553721791376,
                                                                                              'euclidean': 3.343470808179558}},
                                               'customer service excluded': {'explicit': {'cosine': 0.0003123780612878102,
                                                                                          'euclidean': 5.54084328803665},
                                                                             'implicit': {'cosine': 0.057252684066241954,
                                                                                          'euclidean': 2.782351521722888}},
                                               'doctor/health care excluded': {'explicit': {'cosine': 0.00037262882868782654,
   

                                                                                'euclidean': 6.752724382542989},
                                                                   'implicit': {'cosine': 0.02573232722384744,
                                                                                'euclidean': 4.504415542580987}},
                                                '45-49 excluded': {'explicit': {'cosine': 0.0005159531760674518,
                                                                                'euclidean': 7.087593441878298},
                                                                   'implicit': {'cosine': 0.027036560123601894,
                                                                                'euclidean': 4.940487022838466}},
                                                '50-55 excluded': {'explicit': {'cosine': 0.0005074997574489037,
                                                                                'euclidean': 6.98

                                                                                               'implicit': {'cosine': 0.06571616526710089,
                                                                                                            'euclidean': 5.444291143480846}},
                                                'K-12 student excluded': {'explicit': {'cosine': 0.0002527593280475804,
                                                                                       'euclidean': 4.72174853109384},
                                                                          'implicit': {'cosine': 0.18102382192120792,
                                                                                       'euclidean': 3.1937166007317184}},
                                                'Top 10% contributors excluded': {'explicit': {'cosine': 0.00045289964106076663,
                                                                                               'euclidean': 5.7

                                                                                   'euclidean': 1.1976234503355467}},
                                               'self-employed excluded': {'explicit': {'cosine': 0.00012237253242086688,
                                                                                       'euclidean': 3.092810316479099},
                                                                          'implicit': {'cosine': 0.04782111395222077,
                                                                                       'euclidean': 1.628597654856666}},
                                               'technician/engineer excluded': {'explicit': {'cosine': 9.792425313048803e-05,
                                                                                             'euclidean': 2.993941863332455},
                                                                                'implicit': {'cosine': 0.020880922620835185,
                         

                                            'Fans of Film-Noir excluded using threshold 4': {'explicit': {'cosine': 0.00044809992640748586,
                                                                                                          'euclidean': 5.7245557373074405},
                                                                                             'implicit': {'cosine': 0.05379362328984283,
                                                                                                          'euclidean': 4.526132583860798}},
                                            'Fans of Horror excluded using threshold 4': {'explicit': {'cosine': 0.00011566570273169319,
                                                                                                       'euclidean': 2.9297873838921435},
                                                                                          'implicit': {'cosine': 0.024862575684093446,
                                  

                                                'homemaker excluded': {'explicit': {'cosine': 0.0011036143182111546,
                                                                                    'euclidean': 9.91743830529103},
                                                                       'implicit': {'cosine': 0.18014898716136551,
                                                                                    'euclidean': 7.338411042520588}},
                                                'lawyer excluded': {'explicit': {'cosine': 0.0006870867823741689,
                                                                                 'euclidean': 8.16433752131349},
                                                                    'implicit': {'cosine': 0.07465111043555916,
                                                                                 'euclidean': 5.972370321979227}},
                                                'male users excluded': {'explici

                                                                                                 'euclidean': 6.471527101003529},
                                                                                    'implicit': {'cosine': 0.09542398266268781,
                                                                                                 'euclidean': 10.678125196533314}},
                                   'Fans of Animation excluded using threshold 4': {'explicit': {'cosine': 0.0002773865850702606,
                                                                                                 'euclidean': 4.4218268860222505},
                                                                                    'implicit': {'cosine': 0.03142205763082795,
                                                                                                 'euclidean': 7.0474742568114745}},
                                   "Fans of Children's excluded using threshold 4": {'exp

                                           'implicit': {'cosine': 0.1464014717047578,
                                                        'euclidean': 2.738823476643563}},
                       'clerical/admin excluded': {'explicit': {'cosine': 9.474394637454253e-05,
                                                                'euclidean': 2.687869890200842},
                                                   'implicit': {'cosine': 0.13797967231294073,
                                                                'euclidean': 2.6032083904166914}},
                       'college/grad student excluded': {'explicit': {'cosine': 7.196562419109487e-05,
                                                                      'euclidean': 2.2516812907787513},
                                                         'implicit': {'cosine': 0.07454373452300589,
                                                                      'euclidean': 2.168546552086627}},
                       '

                                         'euclidean': 1.4555823031064983}},
         '25-34 excluded': {'explicit': {'cosine': 1.3784773739922151e-05,
                                         'euclidean': 0.9757381658074656},
                            'implicit': {'cosine': 0.007190399733747244,
                                         'euclidean': 0.9637605025442166}},
         '35-44 excluded': {'explicit': {'cosine': 1.2616739610837868e-05,
                                         'euclidean': 0.9329099676775646},
                            'implicit': {'cosine': 0.015830655004137384,
                                         'euclidean': 0.8514744700147893}},
         '45-49 excluded': {'explicit': {'cosine': 3.175669638000311e-05,
                                         'euclidean': 1.468498851404147},
                            'implicit': {'cosine': 0.039450308485456054,
                                         'euclidean': 1.3325856974172752}},
         '50-55 excluded': {'

                     'Fans of Thriller excluded using threshold 4': {'explicit': {'cosine': 8.3414222020739e-05,
                                                                                  'euclidean': 2.613324230894253},
                                                                     'implicit': {'cosine': 0.038190437198332816,
                                                                                  'euclidean': 1.4177238485447912}},
                     'Fans of War excluded using threshold 4': {'explicit': {'cosine': 0.00026534819324353087,
                                                                             'euclidean': 4.847213733068988},
                                                                'implicit': {'cosine': 0.025932016690779602,
                                                                             'euclidean': 2.97480612743697}},
                     'Fans of Western excluded using threshold 4': {'explicit': {'cosine': 0.00076431

                             'sales/marketing excluded': {'explicit': {'cosine': 4.647473219698828e-05,
                                                                       'euclidean': 1.8075392460143886},
                                                          'implicit': {'cosine': 0.03718704350207225,
                                                                       'euclidean': 1.3692591737175668}},
                             'scientist excluded': {'explicit': {'cosine': 6.954060994091016e-05,
                                                                 'euclidean': 2.1824268698544933},
                                                    'implicit': {'cosine': 0.05698065958830534,
                                                                 'euclidean': 1.7190689645771646}},
                             'self-employed excluded': {'explicit': {'cosine': 5.174911910299507e-05,
                                                                     'euclidean': 1.8806

                                                                                         'euclidean': 2.1701096128034654},
                                                                            'implicit': {'cosine': 0.07639429412848209,
                                                                                         'euclidean': 2.2689650866700437}},
                               'Fans of Fantasy excluded using threshold 4': {'explicit': {'cosine': 0.0005533073456475135,
                                                                                           'euclidean': 7.411331945000143},
                                                                              'implicit': {'cosine': 0.051631835093283285,
                                                                                           'euclidean': 4.287394016063789}},
                               'Fans of Film-Noir excluded using threshold 4': {'explicit': {'cosine': 0.0009371318444860721,
           

                                                     'implicit': {'cosine': 0.18445473432278703,
                                                                  'euclidean': 2.9797802533971463}},
                                 'female users excluded': {'explicit': {'cosine': 4.0947914356825876e-05,
                                                                        'euclidean': 1.6875717597143498},
                                                           'implicit': {'cosine': 0.03244647475454976,
                                                                        'euclidean': 1.163156383802332}},
                                 'homemaker excluded': {'explicit': {'cosine': 0.00010428488609381947,
                                                                     'euclidean': 2.690249340386402},
                                                        'implicit': {'cosine': 0.08575106374827057,
                                                                     'euclid

                                                          'implicit': {'cosine': 0.3752086353825024,
                                                                       'euclidean': 4.505213371378655}},
                     'Fans of Action excluded using threshold 4': {'explicit': {'cosine': 0.00024670474076582405,
                                                                                'euclidean': 4.267172964445208},
                                                                   'implicit': {'cosine': 0.1902810750700208,
                                                                                'euclidean': 3.0136494387897725}},
                     'Fans of Adventure excluded using threshold 4': {'explicit': {'cosine': 0.0003268980147673739,
                                                                                   'euclidean': 5.071425307293071},
                                                                      'implicit': {'cosine': 0.161324356559269

                           'academic/educator excluded': {'explicit': {'cosine': 2.7004066504798807e-05,
                                                                       'euclidean': 1.355438031455279},
                                                          'implicit': {'cosine': 0.026349941864166393,
                                                                       'euclidean': 1.1294984530367584}},
                           'all': {'explicit': {'cosine': 3.329914543181989e-05,
                                                'euclidean': 1.5082740334497096},
                                   'implicit': {'cosine': 0.036824362908533304,
                                                'euclidean': 1.3335071375347736}},
                           'artist excluded': {'explicit': {'cosine': 5.680681654107733e-05,
                                                            'euclidean': 1.9892712341326417},
                                               'implicit': {'cosine':

                        'writer excluded': {'explicit': {'cosine': 0.00020398634645635294,
                                                         'euclidean': 3.7494470210121627},
                                            'implicit': {'cosine': 0.11899170763976086,
                                                         'euclidean': 2.8962948495570893}}},
 'lawyer excluded': {'18-24 excluded': {'explicit': {'cosine': 0.00012068949925758687,
                                                     'euclidean': 2.871616034703443},
                                        'implicit': {'cosine': 0.12337415397440232,
                                                     'euclidean': 2.494073267233132}},
                     '25-34 excluded': {'explicit': {'cosine': 6.112927589052575e-05,
                                                     'euclidean': 2.0396056346654903},
                                        'implicit': {'cosine': 0.05447513305549323,
                                    

                                                                                     'euclidean': 2.9917200115579234},
                                                                        'implicit': {'cosine': 0.028286726393928152,
                                                                                     'euclidean': 1.3343080910740963}},
                         'Fans of Sci-Fi excluded using threshold 4': {'explicit': {'cosine': 9.425316636668679e-05,
                                                                                    'euclidean': 2.8692255550756536},
                                                                       'implicit': {'cosine': 0.021495161403741192,
                                                                                    'euclidean': 1.0678175318872314}},
                         'Fans of Thriller excluded using threshold 4': {'explicit': {'cosine': 4.534475828921192e-05,
                                                       

                                            'implicit': {'cosine': 0.035317497297880474,
                                                         'euclidean': 1.3116139193023086}},
                    'retired excluded': {'explicit': {'cosine': 9.082246800151683e-05,
                                                      'euclidean': 2.512538259251398},
                                         'implicit': {'cosine': 0.1426606283699332,
                                                      'euclidean': 2.8152367550390918}},
                    'sales/marketing excluded': {'explicit': {'cosine': 2.087878851231295e-05,
                                                              'euclidean': 1.191693061331727},
                                                 'implicit': {'cosine': 0.020103182290785115,
                                                              'euclidean': 0.9933343803302469}},
                    'scientist excluded': {'explicit': {'cosine': 4.247987493888328e-05,
  

                                                                   'implicit': {'cosine': 0.11193849718571147,
                                                                                'euclidean': 4.756307939660847}},
                      'Fans of Documentary excluded using threshold 4': {'explicit': {'cosine': 0.0014667655632113474,
                                                                                      'euclidean': 11.425080021636347},
                                                                         'implicit': {'cosine': 0.17590829302999167,
                                                                                      'euclidean': 10.202367093542174}},
                      'Fans of Drama excluded using threshold 4': {'explicit': {'cosine': 0.0001224801746083637,
                                                                                'euclidean': 3.095373845435195},
                                                                   'impl

                                                                         'euclidean': 1.7618343088209978}},
                              'doctor/health care excluded': {'explicit': {'cosine': 3.135226764572341e-05,
                                                                           'euclidean': 1.4610574628956363},
                                                              'implicit': {'cosine': 0.02991131972742933,
                                                                           'euclidean': 1.2053391774169655}},
                              'executive/managerial excluded': {'explicit': {'cosine': 1.958527675649524e-05,
                                                                             'euclidean': 1.1606352370498674},
                                                                'implicit': {'cosine': 0.02007382591606166,
                                                                             'euclidean': 0.9939085091513169}},
                  

                            '56+ excluded': {'explicit': {'cosine': 8.179660727902771e-05,
                                                          'euclidean': 2.4358509961593753},
                                             'implicit': {'cosine': 0.086997018728733,
                                                          'euclidean': 2.7459955797434823}},
                            'Bottom 10% contributors excluded': {'explicit': {'cosine': 0.00020352574043436267,
                                                                              'euclidean': 4.051903074210635},
                                                                 'implicit': {'cosine': 0.2190891519177679,
                                                                              'euclidean': 4.506111094631293}},
                            'Fans of Action excluded using threshold 4': {'explicit': {'cosine': 6.369857106214294e-05,
                                                                          

                                  'Top 10% contributors excluded': {'explicit': {'cosine': 0.0008023514819709421,
                                                                                 'euclidean': 8.040417128895262},
                                                                    'implicit': {'cosine': 0.07698155960719466,
                                                                                 'euclidean': 11.281647424272794}},
                                  'Under 18 excluded': {'explicit': {'cosine': 6.536457078054614e-05,
                                                                     'euclidean': 2.1361053529464074},
                                                        'implicit': {'cosine': 0.1252151695762702,
                                                                     'euclidean': 2.21404298137113}},
                                  'academic/educator excluded': {'explicit': {'cosine': 5.854965429508141e-05,
                           

                                                                   'implicit': {'cosine': 0.07286883731524096,
                                                                                'euclidean': 1.8386498254832648}},
                                  'unemployed excluded': {'explicit': {'cosine': 0.0001338396710977019,
                                                                       'euclidean': 3.013816628704168},
                                                          'implicit': {'cosine': 0.08046972833795285,
                                                                       'euclidean': 2.2883970697639002}},
                                  'writer excluded': {'explicit': {'cosine': 0.000139738291979552,
                                                                   'euclidean': 3.09310364822198},
                                                      'implicit': {'cosine': 0.10004700462233918,
                                                              

                                                                    'implicit': {'cosine': 0.04089687788469343,
                                                                                 'euclidean': 3.607378090983703}},
                     'Fans of Mystery excluded using threshold 4': {'explicit': {'cosine': 0.0003947035309812552,
                                                                                 'euclidean': 6.360971112377979},
                                                                    'implicit': {'cosine': 0.029490614731073506,
                                                                                 'euclidean': 3.7874659480535615}},
                     'Fans of Romance excluded using threshold 4': {'explicit': {'cosine': 0.00011399229474096106,
                                                                                 'euclidean': 3.1709324344811542},
                                                                    'implicit': {'cosi

# Start here for Dist. vs. Like-User Damage

In [5]:
import json

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the cached distances that the slow cell above saves to disk.
# Nesting: group -> other group -> vector type -> distance type -> value.
with open('some_distances.json', 'r') as f:
    group_to_group_to_vectype_to_distancetype_to_ = json.load(f)

# Build one row per group, keeping only that group's distances to the 'all'
# entry. The 'all' group itself gets no row.
row_dicts = []
for group_name, distances_by_other in group_to_group_to_vectype_to_distancetype_to_.items():
    if group_name == 'all':
        continue
    row = {'name': group_name}
    # A group with no 'all' entry contributes a row holding just its name.
    for vectype, by_distancetype in distances_by_other.get('all', {}).items():
        for distancetype, value in by_distancetype.items():
            row['{}_{}'.format(vectype, distancetype)] = value
    row_dicts.append(row)

df = pd.DataFrame(row_dicts)

# Shorten the group names: strip the boilerplate fragments, rename the top
# contributors to 'power users', then trim and lower-case. The replacements
# are applied in this order.
name_replacements = [
    ('excluded', ''),
    ('users from', ''),
    ('using threshold 4', ''),
    ('Top 10% contributors', 'power users'),
]
short_names = []
for raw_name in list(df.name):
    short = raw_name
    for old, new in name_replacements:
        short = short.replace(old, new)
    short_names.append(short.strip().lower())
df.name = short_names

In [6]:
# Load the per-group effect ratios, effect differences and rating counts, then
# join them onto the distance table `df` by group name.
with open('all_ratios.json', 'r') as f:
    all_ratios = json.load(f)

with open('all_diffs.json', 'r') as f:
    all_diffs = json.load(f)

# NOTE(review): the recorded run of this cell stopped here with
# FileNotFoundError. Unlike the two inputs above, this path has no '.json'
# suffix -- confirm the real file name.
with open('group_to_num_ratings', 'r') as f:
    group_to_num_ratings = json.load(f)

# Apply the same name shortening used for `df.name`, so the keys line up with
# the names the merge below joins on.
temp_df = pd.DataFrame.from_dict(group_to_num_ratings, orient='index')
temp_df.index = [
    x.replace('excluded', '')
    .replace('users from', '')
    .replace('using threshold 4', '')
    .replace('Top 10% contributors', 'power users')
    .strip()
    .lower()
    for x in list(temp_df.index)
]
print(temp_df)
# from_dict(orient='index') stores the scalar values under column 0.
group_to_num_ratings = temp_df.to_dict()[0]

# 'lb' / 'nb' are the like-boycott and non-boycott variants of each measure.
lb_ratios = all_ratios['lb']
nb_ratios = all_ratios['nb']
lb_diffs = all_diffs['lb']
# Assumes all_diffs mirrors all_ratios and has an 'nb' entry -- TODO confirm.
nb_diffs = all_diffs['nb']

row_dicts = []
for key, val in lb_ratios.items():
    row_dict = {
        'name': key,
        'like-boycott-ratio': val,
        'non-boycott-ratio': nb_ratios[key],
        'like-boycott-diff': lb_diffs[key],
        # Fixed: this column used to be filled from all_diffs['lb'], which
        # made it a duplicate of 'like-boycott-diff'.
        'non-boycott-diff': nb_diffs[key],
        'num_ratings': group_to_num_ratings[key],
    }
    row_dicts.append(row_dict)
ratios_df = pd.DataFrame(row_dicts)

# Drop ratio columns left over from a previous run of this cell, so that
# re-running it does not produce suffixed `_x` / `_y` duplicates.
stale_columns = [
    col for col in ratios_df.columns
    if col != 'name' and col in df.columns
]
df = df.drop(columns=stale_columns).merge(right=ratios_df, on='name', how='inner')
print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'group_to_num_ratings'

In [None]:
# Preview the first rows of `df` (rich display of the last expression).
df.head()

In [None]:
# Export a short table of selected groups: the named demographic and
# contributor groups plus every "fans of ..." group.
selected_mask = (
    df.name.isin([
        'male users', 'female users', 'power users', 'bottom 10% contributors'
    ])
    | df.name.str.contains('fans')
)
# NOTE(review): the original mask ended with a bare `df.num_ratings > 150000`
# line that no operator joined to the expression above, which is a
# SyntaxError. It is left disabled here; to apply the size floor, use
#     selected_mask &= df.num_ratings > 150000
# TODO confirm the intent -- 'bottom 10% contributors' is requested by name
# and a ratings floor may drop it.

df.loc[selected_mask, ['name', 'num_ratings', 'like-boycott-ratio']].to_csv(
    'selected_groups.csv',
    index=False,
    header=['Group Name', '# Ratings', 'Similar Users Effect Ratio'],
)

In [None]:
# Hand-picked groups to show in the summary table.
table_groups = [
    'male users', 'female users', 'power users',
    'fans of drama', 'fans of horror', '25-34', '56+', 'lawyer', 'scientist',
]
# Source column -> display header, in output column order.
table_headers = {
    'name': 'Name',
    'num_ratings': '# Ratings',
    'like-boycott-ratio': 'Similar User Effect Ratio',
}
# No size filter is applied here (a `df.num_ratings > 150000` cut is disabled).
table_df = (
    df.loc[df.name.isin(table_groups), list(table_headers)]
    .rename(index=str, columns=table_headers)
)

In [None]:
# Inline style overrides placed after the stylesheet link in the output page.
css = """
<style>
body {column-count: 1 !important;}
table {
    width: 2.5in; height:3in;
        font-size: x-small;
}
th, td, table {
    border-left: none;
    border-right: none;
    padding-left: 10px;
}
</style>
"""

# Render the table without the index, with floats at two decimals.
html = table_df.to_html(
    columns=['Name', '# Ratings', 'Similar User Effect Ratio'],
    index=False,
    float_format='%.2f',
)

# Page = stylesheet link + inline overrides + table markup.
stylesheet_link = '<link rel="stylesheet" href="pubcss-acm-sigchi.css">'
page = stylesheet_link + css + html

with open('table.html', 'w') as out_file:
    out_file.write(page)

In [None]:
# Implicit Cosine vs. LB Ratio
# Scatter of each group's implicit-vector cosine distance against its
# like-boycott ratio, restricted to groups with at least 20000 ratings.
sns.set(style="darkgrid")
sns.set_color_codes("dark")

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 3.5))

# The summary statistics describe all groups, not only the filtered ones.
all_num_ratings = df.num_ratings
print(
    'len, std, mean',
    len(all_num_ratings), np.std(all_num_ratings), np.mean(all_num_ratings),
)

filtered = df.loc[df.num_ratings >= 20000]

# Column plotted on the vertical axis.
y = 'like-boycott-ratio'

sns.scatterplot(
    data=filtered,
    x='implicit_cosine',
    y=y,
    alpha=0.3,
    ax=ax,
)

def label_point(x, y, val, ax, names):
    """Annotate and mark the points whose label appears in `names`.

    `x`, `y` and `val` are combined column-wise into one frame. For every row
    whose stringified `val` is in `names`, the label is written 0.2 above the
    point with `ax.text`, and the point is drawn with the 'bx' format string
    via `ax.plot`.

    Args:
        x: horizontal coordinates (pandas Series).
        y: vertical coordinates (pandas Series).
        val: labels of the points (pandas Series).
        ax: axes-like object providing `text` and `plot`.
        names: container of labels that should be annotated.
    """
    points = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, row in points.iterrows():
        label = str(row['val'])
        if label not in names:
            continue
        px, py = row['x'], row['y']
        ax.text(px, py + 0.2, label)
        ax.plot(px, py, 'bx')

# `pearsonr` is only imported in a later cell, so on a fresh kernel this cell
# used to fail with NameError; import it here so the cell runs in order.
from scipy.stats import pearsonr

# Annotate a handful of named groups on the scatter plot drawn on `ax`.
label_point(
    filtered.implicit_cosine, filtered[y], filtered.name, ax,
    names=[
        'male users', 'female users', 'fans of film-noir', 'power users', 'under 18', 'artist',
        'fans of documentary', 'fans of horror',]
)

ax.set_xlabel('Group Implicit Rating Cosine Distance from Centroid')
ax.set_title('Ratio of Strike Effect on Similar Users to Effect on Not Participating Users')
ax.set_ylabel('Effect Ratio')

# Dashed reference line at a ratio of 1.
ax.axhline(1, color='0.3', linestyle='--')
fig.savefig('implicitcosine_vs_lbratio.png', bbox_inches='tight', dpi=300)
plt.show()

# Correlation between distance and ratio over the filtered groups, followed by
# the underlying rows ordered by distance.
print(pearsonr(filtered['implicit_cosine'], filtered[y]))
print(filtered[['implicit_cosine', 'num_ratings', y, 'name']].sort_values('implicit_cosine'))

In [None]:
# 2x2 grid of horizontal bar charts, one per distance column, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(10,15))
distance_columns = [
    'implicit_euclidean', 'explicit_euclidean',
    'implicit_cosine', 'explicit_cosine',
]
for panel, column in zip(axes.flat, distance_columns):
    sns.barplot(y='name', x=column, data=df, ax=panel)
plt.show()

In [None]:
# Print the overall user count, then the number of distinct users in each
# genre group.
total = users_df.shape[0]
print('total users', total)

genre_groups = group_by_genre(users_df, ratings_df, dfs['movies'], 'ml-1m')
for group in genre_groups:
    users = sorted(set(group['df'].user_id))
    print(group['name'], len(users))

In [None]:
from scipy.stats import pearsonr

# Pearson correlation of every distance column with every ratio column.
# (A direct like-boycott vs non-boycott ratio correlation is left disabled:)
#print(pearsonr(df['like-boycott-ratio'], df['non-boycott-ratio']))

distance_columns = [
    'explicit_cosine', 'explicit_euclidean',
    'implicit_cosine', 'implicit_euclidean',
]
ratio_columns = ['like-boycott-ratio', 'non-boycott-ratio']

print('\n===')
# Distances vary in the outer position, ratios in the inner one.
column_pairs = [
    (distance, ratio)
    for distance in distance_columns
    for ratio in ratio_columns
]
for distance, ratio in column_pairs:
    correlation = pearsonr(df[distance], df[ratio])
    print(distance, ratio)
    print(correlation)
    print('\n')

In [None]:
# NUM RATINGS VS LB RATIO
# Regression plot of group size (number of ratings) against the like-boycott
# ratio, followed by the Pearson correlation of that same pair.

sns.regplot(x='num_ratings', y='like-boycott-ratio', data=df)
plt.show()
# Fixed: the correlation used to be computed on 'implicit_cosine', which is
# not the variable plotted above.
print(pearsonr(df['num_ratings'], df['like-boycott-ratio']))
#print(df[['num_ratings', 'like-boycott-ratio', 'name']].sort_values('num_ratings'))

In [None]:
# Explicit cosine distance vs. non-boycott ratio over every group.
# (A size cut such as `df[df.num_ratings > 1e5]` is not applied.)
filtered = df
x_column, y_column = 'explicit_cosine', 'non-boycott-ratio'

sns.regplot(data=filtered, x=x_column, y=y_column)
plt.show()
print(pearsonr(filtered[x_column], filtered[y_column]))

# NOTE(review): this listing shows the implicit-cosine / like-boycott columns
# rather than the pair plotted above -- confirm that is intended.
listing_columns = ['implicit_cosine', 'num_ratings', 'like-boycott-ratio', 'name']
print(filtered[listing_columns].sort_values('implicit_cosine'))