In [1]:
# Importa o pacote train_test_split
from sklearn.model_selection import train_test_split

# Cria classificador
from sklearn import tree                    # Importa o pacote de arvore de decisao
clf = tree.DecisionTreeClassifier()         # Cria classificador

#Importa o crawler
import json
import crawler
import pandas as pd

In [2]:
class_names = ["No Score", "Appalling", "Horrible", "Very Bad", "Bad", "Average", "Fine", "Good", "Very Good", "Great", "Masterpiece"]

# Listas com todos os atributos disponíveis
atributos_anime_todos = ["title", "duration", "episodes", "genres", "popularity", "public_score", "rank", "rating", "source", "studios", "type", "year"]
atributos_avaliacao_todos = ["num_watched_episodes", "user_score", "status"]

# Listas com alguns atributos mais relevantes
atributos_anime_padrao = ["duration", "episodes", "genres", "type", "year"]
atributos_avaliacao_padrao = ["user_score"]

In [3]:
def filtro(usuario, f_selecao, atributos_anime = atributos_anime_padrao, atributos_avaliacao = atributos_avaliacao_padrao, agrupar_episodios = False, force_update = False):
	if "user_score" not in atributos_avaliacao:
		atributos_avaliacao.append("user_score")

	dados = []
	lista_animes = crawler.get_lista(usuario)
    
	print lista_animes

	generos = []
	estudios = []
	for anime in lista_animes:
		if anime.genres != None:
			generos.extend(anime.genres)
		if anime.studios != None:
			estudios.extend(anime.studios)

	generos = list(set(generos))
	estudios = list(set(estudios))

	for anime in lista_animes:
		dado = {}
		dado_filtrado = {}

		# Pegando os dados do anime
		for atributo in atributos_anime_todos:
			dado[atributo] = anime.__dict__[atributo]

		# Pegando os dados da avaliação
		with open("data/users/" + usuario + "/" + str(anime.id) + ".json") as f:
			avaliacao = json.loads(f.read())

			for atributo in atributos_avaliacao_todos:
				dado[atributo] = avaliacao[atributo]

		if f_selecao == None or f_selecao(dado):
			dado_filtrado = {}

			for atributo in atributos_anime:
				if atributo == "genres":
					for genero in generos:
						if dado["genres"] != None and genero in dado["genres"]:
							dado_filtrado["Genre:" + genero] = 1
						else:
							dado_filtrado["Genre:" + genero] = -1
					continue

				if atributo == "studios":
					for estudio in estudios:
						if dado["studios"] != None and estudio in dado["studios"]:
							dado_filtrado["Studio:" + estudio] = 1
						else:
							dado_filtrado["Studio:" + estudio] = -1
					continue

				if agrupar_episodios and "episodes" in atributo:
					try:
						epis = int(dado[atributo])
						if epis <= 6:
							dado_filtrado[atributo] = 1
						elif epis <= 14:
							dado_filtrado[atributo] = 2
						elif epis <= 26:
							dado_filtrado[atributo] = 3
						elif epis <= 70:
							dado_filtrado[atributo] = 4
						else:
							dado_filtrado[atributo] = 5
						continue
					except:
						dado_filtrado[atributo] = dado[atributo]
						continue

				if dado[atributo] == None:
					dado_filtrado[atributo] = 0
				else:
					dado_filtrado[atributo] = dado[atributo]

			for atributo in atributos_avaliacao:
				if dado[atributo] == None:
					dado_filtrado[atributo] = 0
				else:
					dado_filtrado[atributo] = dado[atributo]

			dados.append(dado_filtrado)

	return dados

In [4]:
def carregar_dataset(usuario, f_selecao, atributos_anime = atributos_anime_padrao, atributos_avaliacao = atributos_avaliacao_padrao, agrupar_episodios = False, force_update = False):
	lista_final = filtro(usuario, f_selecao, atributos_anime, atributos_avaliacao, agrupar_episodios, force_update)

	df = pd.read_json(json.dumps(lista_final))

	target = df['user_score']
	data = df.drop('user_score', axis = 1)

	from sklearn import preprocessing
	le = preprocessing.LabelEncoder()
	for attr in data.columns:
		if data[attr].dtype == object:
			data[attr] = le.fit_transform(data[attr])
	
	return data, target

In [5]:
x, y = carregar_dataset("Master_Exploder", lambda d: d["status"] == 2 or d["status"] == "2", force_update=False)

[<crawler.Anime instance at 0x7fa76518c290>, <crawler.Anime instance at 0x7fa72a444950>, <crawler.Anime instance at 0x7fa72a444518>, <crawler.Anime instance at 0x7fa72a444680>, <crawler.Anime instance at 0x7fa72a444758>, <crawler.Anime instance at 0x7fa72a4446c8>, <crawler.Anime instance at 0x7fa72a4445f0>, <crawler.Anime instance at 0x7fa72a444830>, <crawler.Anime instance at 0x7fa72a444ef0>, <crawler.Anime instance at 0x7fa72a444d88>, <crawler.Anime instance at 0x7fa72a444ea8>, <crawler.Anime instance at 0x7fa72a444e60>, <crawler.Anime instance at 0x7fa72a444dd0>, <crawler.Anime instance at 0x7fa72a444e18>, <crawler.Anime instance at 0x7fa72a444fc8>, <crawler.Anime instance at 0x7fa72a444f38>, <crawler.Anime instance at 0x7fa72a444f80>, <crawler.Anime instance at 0x7fa72a1cb5a8>, <crawler.Anime instance at 0x7fa72a1cb1b8>, <crawler.Anime instance at 0x7fa72a1cb170>, <crawler.Anime instance at 0x7fa72a4500e0>, <crawler.Anime instance at 0x7fa72a450290>, <crawler.Anime instance at 0x7f

In [6]:
cont = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

for c in y.values:
    cont[c] += 1

for i in range(11):
    print i, " - ", class_names[i] + ": ", cont[i]


0  -  No Score:  0
1  -  Appalling:  0
2  -  Horrible:  0
3  -  Very Bad:  0
4  -  Bad:  0
5  -  Average:  0
6  -  Fine:  8
7  -  Good:  4
8  -  Very Good:  11
9  -  Great:  8
10  -  Masterpiece:  2


In [7]:
# MODELO 1

# Dividir conjuntos de treinamento e teste
x_train, x_test, y_train, y_test = train_test_split(    # Divide conjuntos nao-aleatoriamente
    x, y, shuffle=False)

# Treinamento da Arvore de Decisao
from sklearn import tree                    # Importa o pacote de arvore de decisao
clf = tree.DecisionTreeClassifier()         # Cria classificador
clf = clf.fit(x_train, y_train)             # Treina o classificador

# Avaliacao dos resultados
clf.score(x_test, y_test)

0.22222222222222221

In [8]:
# Calcula validacao cruzada
from sklearn.model_selection import cross_val_score             # Importa o pacote de validacao cruzada
scores = cross_val_score(clf, x, y, cv=3)     # Calcula os scores de 5-folds estratificados
print scores

[ 0.23076923  0.41666667  0.625     ]




In [9]:
# Apresentacao dos resultados
print 'Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2)

Accuracy: 0.42 (+/- 0.32)


In [10]:
# Visualizacao da Arvore do Modelo 1
import graphviz

cn = []
for v in sorted(set(y_train.values)):
    cn.append(str(v) + ": " + class_names[v])

dot_data = tree.export_graphviz(clf, out_file=None,
     feature_names=x.columns,
     class_names=cn,
     filled=True, rounded=True,
     special_characters=True)
graph = graphviz.Source(dot_data, format='png')
graph.render('modelo_1', view=True)

'modelo_1.png'

In [11]:
x.__dict__

{'_data': BlockManager
 Items: Index([u'Genre:Action', u'Genre:Adventure', u'Genre:Cars', u'Genre:Comedy',
        u'Genre:Demons', u'Genre:Drama', u'Genre:Ecchi', u'Genre:Fantasy',
        u'Genre:Game', u'Genre:Harem', u'Genre:Horror', u'Genre:Magic',
        u'Genre:Martial Arts', u'Genre:Mecha', u'Genre:Military',
        u'Genre:Mystery', u'Genre:Parody', u'Genre:Police',
        u'Genre:Psychological', u'Genre:Romance', u'Genre:School',
        u'Genre:Sci-Fi', u'Genre:Seinen', u'Genre:Shounen',
        u'Genre:Slice of Life', u'Genre:Sports', u'Genre:Super Power',
        u'Genre:Supernatural', u'Genre:Thriller', u'duration', u'episodes',
        u'type', u'year'],
       dtype='object')
 Axis 1: RangeIndex(start=0, stop=33, step=1)
 IntBlock: slice(0, 33, 1), 33 x 33, dtype: int64,
 '_is_copy': None,
 '_item_cache': {}}

In [12]:
x

Unnamed: 0,Genre:Action,Genre:Adventure,Genre:Cars,Genre:Comedy,Genre:Demons,Genre:Drama,Genre:Ecchi,Genre:Fantasy,Genre:Game,Genre:Harem,...,Genre:Shounen,Genre:Slice of Life,Genre:Sports,Genre:Super Power,Genre:Supernatural,Genre:Thriller,duration,episodes,type,year
0,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,1,-1,-1,-1,-1,3,4,1,2014
1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,-1,-1,-1,1,1,23,26,4,2011
2,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,1,24,1,3,2012
3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,1,24,12,4,2012
4,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,130,1,0,2013
5,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,23,22,4,2012
6,1,1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,-1,-1,1,-1,-1,149,1,0,2012
7,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,-1,29,4,2,2013
8,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,-1,-1,-1,1,1,23,37,4,2006
9,1,-1,-1,-1,-1,1,-1,-1,-1,-1,...,-1,-1,-1,1,-1,-1,24,25,4,2006


In [13]:
#x['episodes'][634]

In [14]:
#print x.columns[0]
#for i in range(len(x.values[634])):
#    if x[x.columns[i]][634] == 1:
#        print x.columns[i]

In [None]:
#print x.values[634]

In [16]:
for i in range(len(y.values)):
    if y.values[i] > 8:
        nc, np = y.values[i], clf.predict([x.values[i]])[0]
        print i, ": ", nc, np
        if y.values[i] == clf.predict([x.values[i]])[0]:
            print i, y.values[i]
            print x.values[i]
            print "*" * 20

2 :  9 9
2 9
[  -1   -1   -1    1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1
   -1   -1   -1   -1   -1   -1    1   -1   -1   -1   -1   -1   -1    1   24
    1    3 2012]
********************
5 :  9 9
5 9
[   1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1
   -1   -1    1    1   -1   -1    1   -1   -1   -1   -1   -1   -1   -1   23
   22    4 2012]
********************
8 :  10 10
8 10
[  -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1
    1   -1    1    1   -1   -1   -1   -1    1   -1   -1   -1    1    1   23
   37    4 2006]
********************
9 :  9 9
9 9
[   1   -1   -1   -1   -1    1   -1   -1   -1   -1   -1   -1   -1    1    1
   -1   -1   -1   -1   -1    1    1   -1   -1   -1   -1    1   -1   -1   24
   25    4 2006]
********************
18 :  9 9
18 9
[   1   -1   -1   -1   -1    1   -1   -1   -1   -1   -1   -1   -1    1    1
   -1   -1   -1   -1   -1   -1    1   -1   -1   -1   -1    1   -1   -1   24
   25    4 2008]
*

In [17]:
x.columns

Index([u'Genre:Action', u'Genre:Adventure', u'Genre:Cars', u'Genre:Comedy',
       u'Genre:Demons', u'Genre:Drama', u'Genre:Ecchi', u'Genre:Fantasy',
       u'Genre:Game', u'Genre:Harem', u'Genre:Horror', u'Genre:Magic',
       u'Genre:Martial Arts', u'Genre:Mecha', u'Genre:Military',
       u'Genre:Mystery', u'Genre:Parody', u'Genre:Police',
       u'Genre:Psychological', u'Genre:Romance', u'Genre:School',
       u'Genre:Sci-Fi', u'Genre:Seinen', u'Genre:Shounen',
       u'Genre:Slice of Life', u'Genre:Sports', u'Genre:Super Power',
       u'Genre:Supernatural', u'Genre:Thriller', u'duration', u'episodes',
       u'type', u'year'],
      dtype='object')

In [18]:
x.values[8]

array([  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
         -1,   -1,   -1,   -1,    1,   -1,    1,    1,   -1,   -1,   -1,
         -1,    1,   -1,   -1,   -1,    1,    1,   23,   37,    4, 2006])

In [19]:
for i in range(len(x.values[8])):
    print x.columns[i], ": ", x.values[8][i]

Genre:Action :  -1
Genre:Adventure :  -1
Genre:Cars :  -1
Genre:Comedy :  -1
Genre:Demons :  -1
Genre:Drama :  -1
Genre:Ecchi :  -1
Genre:Fantasy :  -1
Genre:Game :  -1
Genre:Harem :  -1
Genre:Horror :  -1
Genre:Magic :  -1
Genre:Martial Arts :  -1
Genre:Mecha :  -1
Genre:Military :  -1
Genre:Mystery :  1
Genre:Parody :  -1
Genre:Police :  1
Genre:Psychological :  1
Genre:Romance :  -1
Genre:School :  -1
Genre:Sci-Fi :  -1
Genre:Seinen :  -1
Genre:Shounen :  1
Genre:Slice of Life :  -1
Genre:Sports :  -1
Genre:Super Power :  -1
Genre:Supernatural :  1
Genre:Thriller :  1
duration :  23
episodes :  37
type :  4
year :  2006
