In [1]:
# Importa o pacote train_test_split
from sklearn.model_selection import train_test_split

# Cria classificador
from sklearn import tree                    # Importa o pacote de arvore de decisao
clf = tree.DecisionTreeClassifier()         # Cria classificador

#Importa o crawler
import json
import crawler
import pandas as pd

In [2]:
class_names = ["No Score", "Appalling", "Horrible", "Very Bad", "Bad", "Average", "Fine", "Good", "Very Good", "Great", "Masterpiece"]

# Listas com todos os atributos disponíveis
atributos_anime_todos = ["title", "duration", "episodes", "genres", "popularity", "public_score", "rank", "rating", "source", "studios", "type", "year"]
atributos_avaliacao_todos = ["num_watched_episodes", "user_score", "status"]

# Listas com alguns atributos mais relevantes
atributos_anime_padrao = atributos_anime_todos#["duration", "episodes", "genres", "type", "year"]
atributos_avaliacao_padrao = atributos_avaliacao_todos#["user_score"]

In [3]:
def filtro(usuario, f_selecao, atributos_anime = atributos_anime_padrao, atributos_avaliacao = atributos_avaliacao_padrao, agrupar_episodios = False, force_update = False):
	if "user_score" not in atributos_avaliacao:
		atributos_avaliacao.append("user_score")

	dados = []
	lista_animes = crawler.get_lista(usuario)
    
	print lista_animes

	generos = []
	estudios = []
	for anime in lista_animes:
		if anime.genres != None:
			generos.extend(anime.genres)
		if anime.studios != None:
			estudios.extend(anime.studios)

	generos = list(set(generos))
	estudios = list(set(estudios))

	for anime in lista_animes:
		dado = {}
		dado_filtrado = {}

		# Pegando os dados do anime
		for atributo in atributos_anime_todos:
			dado[atributo] = anime.__dict__[atributo]

		# Pegando os dados da avaliação
		with open("data/users/" + usuario + "/" + str(anime.id) + ".json") as f:
			avaliacao = json.loads(f.read())

			for atributo in atributos_avaliacao_todos:
				dado[atributo] = avaliacao[atributo]

		if f_selecao == None or f_selecao(dado):
			dado_filtrado = {}

			for atributo in atributos_anime:
				if atributo == "genres":
					for genero in generos:
						if dado["genres"] != None and genero in dado["genres"]:
							dado_filtrado["Genre:" + genero] = 1
						else:
							dado_filtrado["Genre:" + genero] = -1
					continue

				if atributo == "studios":
					for estudio in estudios:
						if dado["studios"] != None and estudio in dado["studios"]:
							dado_filtrado["Studio:" + estudio] = 1
						else:
							dado_filtrado["Studio:" + estudio] = -1
					continue

				if agrupar_episodios and "episodes" in atributo:
					try:
						epis = int(dado[atributo])
						if epis <= 6:
							dado_filtrado[atributo] = 1
						elif epis <= 14:
							dado_filtrado[atributo] = 2
						elif epis <= 26:
							dado_filtrado[atributo] = 3
						elif epis <= 70:
							dado_filtrado[atributo] = 4
						else:
							dado_filtrado[atributo] = 5
					except:
						dado_filtrado[atributo] = dado[atributo]

			for atributo in atributos_avaliacao:
				dado_filtrado[atributo] = dado[atributo]

			dados.append(dado_filtrado)
	
	return dados

In [4]:
def carregar_dataset(usuario, f_selecao, atributos_anime = atributos_anime_padrao, atributos_avaliacao = atributos_avaliacao_padrao, agrupar_episodios = False, force_update = False):
	lista_final = filtro(usuario, f_selecao, atributos_anime, atributos_avaliacao, agrupar_episodios, force_update)

	df = pd.read_json(json.dumps(lista_final))

	target = df['user_score']
	data = df.drop('user_score', axis = 1)

	from sklearn import preprocessing
	le = preprocessing.LabelEncoder()
	for attr in data.columns:
		if data[attr].dtype == object:
			data[attr] = le.fit_transform(data[attr])
	
	return data, target

In [5]:
x, y = carregar_dataset("swordhermit", lambda d: d["status"] == 2 or d["status"] == "2", force_update=False)

[<crawler.Anime instance at 0x7f8b2b9a6368>, <crawler.Anime instance at 0x7f8ae9d45b00>, <crawler.Anime instance at 0x7f8ae9d455a8>, <crawler.Anime instance at 0x7f8ae9d45638>, <crawler.Anime instance at 0x7f8ae9d455f0>, <crawler.Anime instance at 0x7f8ae9d45f38>, <crawler.Anime instance at 0x7f8ae9d45bd8>, <crawler.Anime instance at 0x7f8ae9d45c68>, <crawler.Anime instance at 0x7f8ae9d45cf8>, <crawler.Anime instance at 0x7f8ae9d45cb0>, <crawler.Anime instance at 0x7f8ae9d45c20>, <crawler.Anime instance at 0x7f8ae9d45b90>, <crawler.Anime instance at 0x7f8ae9d45ef0>, <crawler.Anime instance at 0x7f8ae9d45e18>, <crawler.Anime instance at 0x7f8ae9d45ea8>, <crawler.Anime instance at 0x7f8ae9d45e60>, <crawler.Anime instance at 0x7f8ae9d45d88>, <crawler.Anime instance at 0x7f8ae9d45fc8>, <crawler.Anime instance at 0x7f8ae9acdfc8>, <crawler.Anime instance at 0x7f8ae9d52ea8>, <crawler.Anime instance at 0x7f8ae9d52e60>, <crawler.Anime instance at 0x7f8ae9d52f38>, <crawler.Anime instance at 0x7f

In [6]:
cont = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

for c in y.values:
    cont[c] += 1

for i in range(11):
    print i, " - ", class_names[i] + ": ", cont[i]


0  -  No Score:  0
1  -  Appalling:  138
2  -  Horrible:  308
3  -  Very Bad:  567
4  -  Bad:  830
5  -  Average:  1096
6  -  Fine:  535
7  -  Good:  260
8  -  Very Good:  79
9  -  Great:  25
10  -  Masterpiece:  3


In [17]:
# MODELO 1

# Dividir conjuntos de treinamento e teste
x_train, x_test, y_train, y_test = train_test_split(    # Divide conjuntos nao-aleatoriamente
    x, y, shuffle=False)

# Treinamento da Arvore de Decisao
from sklearn import tree                    # Importa o pacote de arvore de decisao
clf = tree.DecisionTreeClassifier()         # Cria classificador
clf = clf.fit(x_train, y_train)             # Treina o classificador

# Avaliacao dos resultados
clf.score(x_test, y_test)

0.26534859521331944

In [9]:
# Calcula validacao cruzada
from sklearn.model_selection import cross_val_score             # Importa o pacote de validacao cruzada
scores = cross_val_score(clf, x, y, cv=4)     # Calcula os scores de 5-folds estratificados
print scores



[ 0.3219107   0.27650728  0.31145833  0.2876569 ]


In [8]:
# Apresentacao dos resultados
print 'Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2)

Accuracy: 0.29 (+/- 0.02)
