# CSE 158 Assignment 2 

Authors: Lucas Tindall and Kyle Smurlo

[Render notebook in browser](http://nbviewer.jupyter.org/github/ltindall/cse158_anime_recommender/blob/master/cse158assignment2.ipynb)
(the plotly plots don't load in github)

Dataset: [Anime recommendations](https://www.kaggle.com/CooperUnion/anime-recommendations-database)



## Imports

In [23]:
import csv
import random
from collections import defaultdict

import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools

import numpy as np

import plotly
plotly.tools.set_credentials_file(username='ltindall', api_key='KdrHKAMyc6KdVcSvPMdN')

## Get animes from csv

In [4]:
animes = []
with open('anime.csv', 'rb') as file: 
    reader = csv.DictReader(file, delimiter=',')
    for row in reader: 
        animes.append(row)
        


In [5]:
# get dictionary of lists of genre ratings 
genreRatings = defaultdict(list)


# for each anime 
for anime in animes: 
    # split all the genres listed for that review 
    for genre in anime['genre'].split(','): 
        # add rating to list for that genre
        if len(anime['rating']) > 0:
            genreRatings[genre.strip()].append(float(anime['rating']))

topGenres = []
for genre in sorted(genreRatings, key=lambda k: len(genreRatings[k]), reverse=True):
    topGenres.append((genre,genreRatings[genre]))


## Density plot of top 10 genre ratings 

In [6]:
# Group data together
hist_data = [genre[1] for genre in topGenres[:10]]
#hist_data = [topGenres[0][1], topGenres[1][1], topGenres[2][1], topGenres[3][1]]

group_labels = [genre[0] for genre in topGenres[:10]]
#group_labels = [topGenres[0][0], topGenres[1][0], topGenres[2][0], topGenres[3][0]]

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)

# Plot!
py.iplot(fig, filename='Density plot of top genres')

In [9]:
# get dictionary of lists of type ratings 
typeAnimes = defaultdict(list)


# for each anime 
for anime in animes: 
    if len(anime['rating']) > 0: 
        typeAnimes[anime['type']].append(float(anime['rating']))
       

# Group data together
hist_data = [typeAnimes[typeAnime] for typeAnime in typeAnimes]
#hist_data = [topGenres[0][1], topGenres[1][1], topGenres[2][1], topGenres[3][1]]

group_labels = [typeAnime for typeAnime in typeAnimes]
#group_labels = [topGenres[0][0], topGenres[1][0], topGenres[2][0], topGenres[3][0]]

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)

# Plot!
py.iplot(fig, filename='Density plot of ratings of animes types')


In [56]:
# get dictionary of lists of type ratings 
typeAnimes = defaultdict(list)

for anime in animes: 
    if len(anime['rating']) > 0: 
        if anime['episodes'] != 'Unknown':
            typeAnimes[anime['type']].append((float(anime['episodes']),float(anime['rating'])))
    
traces = []
for animeTyp in typeAnimes: 
    trace = go.Scatter(
        x = [anime[1] for anime in typeAnimes[animeTyp] ],
        y = [anime[0] for anime in typeAnimes[animeTyp] ],
        mode = 'markers', 
        name = animeTyp
    )
    traces.append(trace)



#data = [trace0, trace1, trace2]
data = [traces[1]]

layout = go.Layout(
    autosize=False,
    width=500,
    height=500
)


#fig = go.Figure(data=data, layout=layout)
plot_titles = [animeTyp for animeTyp in typeAnimes]

fig = tools.make_subplots(rows=2, cols=3, subplot_titles=(tuple(plot_titles)))

fig.append_trace(traces[0], 1, 1)
fig.append_trace(traces[1], 1, 2)
fig.append_trace(traces[2], 1, 3)
fig.append_trace(traces[3], 2, 1)
fig.append_trace(traces[4], 2, 2)
fig.append_trace(traces[5], 2, 3)

fig['layout']['xaxis1'].update(title='rating')
fig['layout']['xaxis2'].update(title='rating')
fig['layout']['xaxis3'].update(title='rating')
fig['layout']['xaxis4'].update(title='rating')
fig['layout']['xaxis5'].update(title='rating')
fig['layout']['xaxis6'].update(title='rating')

fig['layout']['yaxis1'].update(title='episodes')
fig['layout']['yaxis2'].update(title='episodes')
fig['layout']['yaxis3'].update(title='episodes')
fig['layout']['yaxis4'].update(title='episodes')
fig['layout']['yaxis5'].update(title='episodes')
fig['layout']['yaxis6'].update(title='episodes')
fig['layout'].update(height=900, width=1000, title='Ratings vs. # of episodes for each anime type')


py.iplot(fig, filename='scatter-mode')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]



In [36]:
print [traces[1]]

[{'y': [9.37, 9.1, 9.05, 8.93, 8.84, 8.81, 8.81, 8.75, 8.74, 8.73, 8.68, 8.64, 8.61, 8.61, 8.6, 8.59, 8.58, 8.58, 8.57, 8.55, 8.53, 8.53, 8.5, 8.49, 8.48, 8.47, 8.45, 8.45, 8.44, 8.43, 8.42, 8.42, 8.4, 8.39, 8.38, 8.38, 8.37, 8.35, 8.35, 8.35, 8.34, 8.34, 8.34, 8.34, 8.33, 8.33, 8.32, 8.32, 8.32, 8.32, 8.32, 8.31, 8.3, 8.29, 8.28, 8.27, 8.27, 8.26, 8.25, 8.24, 8.23, 8.23, 8.23, 8.22, 8.21, 8.21, 8.21, 8.21, 8.2, 8.2, 8.19, 8.18, 8.17, 8.17, 8.16, 8.16, 8.16, 8.16, 8.15, 8.15, 8.15, 8.14, 8.13, 8.13, 8.13, 8.13, 8.12, 8.12, 8.12, 8.1, 8.11, 8.1, 8.1, 8.09, 8.09, 8.09, 8.08, 8.06, 8.06, 8.05, 8.05, 8.05, 8.04, 8.04, 8.04, 8.03, 8.03, 8.03, 8.03, 8.02, 8.01, 8.0, 8.0, 7.99, 7.99, 7.99, 7.98, 7.98, 7.98, 7.98, 7.97, 7.97, 7.96, 7.96, 7.95, 7.95, 7.94, 7.94, 7.94, 7.94, 7.94, 7.94, 7.94, 7.93, 7.93, 7.94, 7.93, 7.93, 7.93, 7.91, 7.92, 7.92, 7.92, 7.91, 7.91, 7.91, 7.92, 7.9, 7.9, 7.9, 7.9, 7.89, 7.89, 7.89, 7.89, 7.88, 7.88, 7.88, 7.88, 7.87, 7.87, 7.87, 7.87, 7.87, 7.86, 7.86, 7.86, 7.85, 

## Get ratings from csv

In [7]:
# shuffle ratings since original file is sorted by user id 
'''
with open('rating.csv', 'rb') as file: 
    reader = csv.reader(file)
    header, rows = next(reader), list(reader)
    random.shuffle(rows)
    
with open('ratings_shuffled.csv', 'wb') as file: 
    csv.writer(file).writerows([header] + rows)
'''     


"\nwith open('rating.csv', 'rb') as file: \n    reader = csv.reader(file)\n    header, rows = next(reader), list(reader)\n    random.shuffle(rows)\n    \nwith open('ratings_shuffled.csv', 'wb') as file: \n    csv.writer(file).writerows([header] + rows)\n"

In [8]:
ratings = []
i = 0
with open('ratings_shuffled.csv', 'rb') as file: 
    reader = csv.DictReader(file, delimiter=',')
    for row in reader: 
        if i >= 2000000: 
            break
        ratings.append(row)
        i = i + 1
        

IOError: [Errno 2] No such file or directory: 'ratings_shuffled.csv'