In [1]:
import numpy as np
import pandas as pd

## Clustering users according to genre preferences

In [3]:
bk = pd.read_csv("../data/_clean_books.csv", index_col=0)
rt = pd.read_csv("../data/_clean_ratings.csv", index_col=0)

In [4]:
# merging - each row will represent one (user, book, rating) tuple

ds = pd.merge(bk, rt, on="book_id", how="inner")

cols_wanted = ['user_id', 'genres', 'rating']
ds = ds[cols_wanted]
ds.head()

Unnamed: 0,user_id,genres,rating
0,15,Literature and fiction|History|Education and T...,3
1,25,Literature and fiction|History|Education and T...,4
2,40,Literature and fiction|History|Education and T...,5
3,69,Literature and fiction|History|Education and T...,5
4,72,Literature and fiction|History|Education and T...,5


In [5]:
# splitting genres

ds["genres"] = ds["genres"].str.split("|")
ds.head()

Unnamed: 0,user_id,genres,rating
0,15,"[Literature and fiction, History, Education an...",3
1,25,"[Literature and fiction, History, Education an...",4
2,40,"[Literature and fiction, History, Education an...",5
3,69,"[Literature and fiction, History, Education an...",5
4,72,"[Literature and fiction, History, Education an...",5


In [7]:
# i need one genre per row

ds2 = ds.genres.apply(pd.Series) \
    .merge(ds, right_index=True, left_index=True) \
    .drop(["genres"], axis=1) \
    .melt(id_vars=['user_id', 'rating'], value_name="genre") \
    .drop("variable", axis=1) \
    .dropna() \
    .sort_values(by=["user_id", "rating"])

In [8]:
ds2.shape

(17251829, 3)

In [9]:
ds2.head()

Unnamed: 0,user_id,rating,genre
839676,1,1,Literature and fiction
845168,1,1,Literature and fiction
850660,1,1,Literature and fiction
856152,1,1,Literature and fiction
861644,1,1,Literature and fiction


In [44]:
ds3 = ds2.groupby(by=["user_id", "genre"], as_index=False).agg({"rating":"count"})
ds3

Unnamed: 0,user_id,genre,rating
0,1,Action and adventure,4
1,1,Anthology,1
2,1,Art,3
3,1,Autobiography and biography,15
4,1,Business and money,2
...,...,...,...
1019839,53424,Religion and spirituality,5
1019840,53424,Romance,14
1019841,53424,Science fiction and fantasy,25
1019842,53424,Teen and young adult,25


In [45]:
ds3.sort_values(by=["user_id", "rating"], ascending=False, inplace=True)
ds3

Unnamed: 0,user_id,genre,rating
1019836,53424,Literature and fiction,78
1019830,53424,Childrens,41
1019833,53424,History,32
1019841,53424,Science fiction and fantasy,25
1019842,53424,Teen and young adult,25
...,...,...,...
1,1,Anthology,1
6,1,"Cookbooks, Food and Wine",1
11,1,Humor and Entertainment,1
13,1,Maths and science,1


In [46]:
ds3["genres_per_user"] = ds3["user_id"].map(ds3["user_id"].value_counts())
ds3

Unnamed: 0,user_id,genre,rating,genres_per_user
1019836,53424,Literature and fiction,78,16
1019830,53424,Childrens,41,16
1019833,53424,History,32,16
1019841,53424,Science fiction and fantasy,25,16
1019842,53424,Teen and young adult,25,16
...,...,...,...,...
1,1,Anthology,1,23
6,1,"Cookbooks, Food and Wine",1,23
11,1,Humor and Entertainment,1,23
13,1,Maths and science,1,23


In [47]:
reads = ds3.groupby(by="user_id").agg({"rating":"sum"})
ds3["total_reads"] = ds3["user_id"].map(reads["rating"])

In [48]:
ds3.sort_values(by="total_reads", inplace=True)
ds3

Unnamed: 0,user_id,genre,rating,genres_per_user,total_reads
304624,14870,"Mystery, thriller, suspense and horror",1,2,2
641804,32472,Teen and young adult,1,2,2
899361,46590,Teen and young adult,1,2,2
899360,46590,Science fiction and fantasy,1,2,2
824166,42401,Literature and fiction,1,2,2
...,...,...,...,...,...
154322,7225,Horror and mystery,1,28,1029
154324,7225,LGBTQ and gender studies,1,28,1029
154328,7225,Parenting and relationships,1,28,1029
154325,7225,Literature and fiction,315,28,1029


In [50]:
# data collected since 2007
# 10 books per year = 120 total
# get data from frequent readers
ds4 = ds3[ds3["total_reads"]>=120][["user_id", "genre", "rating"]].copy()
ds4.shape

(953061, 3)

In [54]:
ds4.sort_values(by=["user_id", "rating"], ascending=False, inplace=True)
ds5 = ds4.groupby(by="user_id").head(4)
ds5

Unnamed: 0,user_id,genre,rating
1019836,53424,Literature and fiction,78
1019830,53424,Childrens,41
1019833,53424,History,32
1019842,53424,Teen and young adult,25
1019819,53423,Literature and fiction,75
...,...,...,...
41,2,Romance,23
12,1,Literature and fiction,161
9,1,History,118
15,1,Politics and social sciences,57


In [69]:
ds5["order"] = list(range(1,5))*int(ds5.shape[0]/4)
ds5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,genre,rating,order
1019836,53424,Literature and fiction,78,1
1019830,53424,Childrens,41,2
1019833,53424,History,32,3
1019842,53424,Teen and young adult,25,4
1019819,53423,Literature and fiction,75,1
...,...,...,...,...
41,2,Romance,23,4
12,1,Literature and fiction,161,1
9,1,History,118,2
15,1,Politics and social sciences,57,3


In [82]:
ds6 = ds5[['user_id', 'genre', 'order']].set_index(['user_id', 'order']).unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)
ds6.columns = ds6.columns.droplevel(1)
ds6['user_id'] = ds6.index
ds6

order,1,2,3,4,user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Literature and fiction,History,Politics and social sciences,Travel,1
2,Literature and fiction,Politics and social sciences,History,Romance,2
3,Literature and fiction,Politics and social sciences,History,Science fiction and fantasy,3
4,Literature and fiction,History,Science fiction and fantasy,Teen and young adult,4
5,Literature and fiction,"Mystery, thriller, suspense and horror",History,Science fiction and fantasy,5
...,...,...,...,...,...
53420,Literature and fiction,History,Romance,"Mystery, thriller, suspense and horror",53420
53421,Literature and fiction,History,Science fiction and fantasy,Education and Teaching,53421
53422,Literature and fiction,Science fiction and fantasy,History,Teen and young adult,53422
53423,Literature and fiction,Science fiction and fantasy,"Mystery, thriller, suspense and horror",Teen and young adult,53423


In [94]:
ds7 = ds6.groupby(by=[1, 2, 3, 4], as_index=False).agg({"user_id":"count"})
ds7.columns=["Favourite genre", "2nd fav. genre", "3rd fav. genre", "4th fav. genre", "Number of readers"]
ds7.sort_values(by="Number of readers", ascending=False).head(15)

Unnamed: 0,Favourite genre,2nd fav. genre,3rd fav. genre,4th fav. genre,Number of readers
244,Literature and fiction,History,Teen and young adult,Romance,2412
205,Literature and fiction,History,Politics and social sciences,Science fiction and fantasy,1890
222,Literature and fiction,History,Romance,Teen and young adult,1791
231,Literature and fiction,History,Science fiction and fantasy,Politics and social sciences,1687
234,Literature and fiction,History,Science fiction and fantasy,Teen and young adult,1481
208,Literature and fiction,History,Politics and social sciences,Travel,1403
245,Literature and fiction,History,Teen and young adult,Science fiction and fantasy,1343
188,Literature and fiction,History,"Mystery, thriller, suspense and horror",Romance,1308
202,Literature and fiction,History,Politics and social sciences,"Mystery, thriller, suspense and horror",1189
217,Literature and fiction,History,Romance,"Mystery, thriller, suspense and horror",1154
