In [1]:
# Importa o pacote train_test_split
from sklearn.model_selection import train_test_split

#Importa o crawler
import json
import crawler
import pandas as pd

In [2]:
class_names = ["No Score", "Appalling", "Horrible", "Very Bad", "Bad", "Average", "Fine", "Good", "Very Good", "Great", "Masterpiece"]

# Listas com todos os atributos disponíveis
atributos_anime_todos = ["title", "duration", "episodes", "genres", "popularity", "public_score", "rank", "rating", "source", "studios", "type", "year"]
atributos_avaliacao_todos = ["num_watched_episodes", "user_score", "status"]

# Listas com alguns atributos mais relevantes
atributos_anime_padrao = atributos_anime_todos#["duration", "episodes", "genres", "type", "year"]
atributos_avaliacao_padrao = atributos_avaliacao_todos#["user_score"]

In [3]:
def filtro(usuario, f_selecao, atributos_anime = atributos_anime_padrao, atributos_avaliacao = atributos_avaliacao_padrao, agrupar_episodios = False, force_update = False):
	if "user_score" not in atributos_avaliacao:
		atributos_avaliacao.append("user_score")

	dados = []
	lista_animes = crawler.get_lista(usuario)
    
	print lista_animes

	generos = []
	estudios = []
	for anime in lista_animes:
		if anime.genres != None:
			generos.extend(anime.genres)
		if anime.studios != None:
			estudios.extend(anime.studios)

	generos = list(set(generos))
	estudios = list(set(estudios))

	for anime in lista_animes:
		dado = {}
		dado_filtrado = {}

		# Pegando os dados do anime
		for atributo in atributos_anime_todos:
			dado[atributo] = anime.__dict__[atributo]

		# Pegando os dados da avaliação
		with open("data/users/" + usuario + "/" + str(anime.id) + ".json") as f:
			avaliacao = json.loads(f.read())

			for atributo in atributos_avaliacao_todos:
				dado[atributo] = avaliacao[atributo]

		if f_selecao == None or f_selecao(dado):
			dado_filtrado = {}

			for atributo in atributos_anime:
				if atributo == "genres":
					for genero in generos:
						if dado["genres"] != None and genero in dado["genres"]:
							dado_filtrado["Genre:" + genero] = 1
						else:
							dado_filtrado["Genre:" + genero] = -1
					continue

				if atributo == "studios":
					for estudio in estudios:
						if dado["studios"] != None and estudio in dado["studios"]:
							dado_filtrado["Studio:" + estudio] = 1
						else:
							dado_filtrado["Studio:" + estudio] = -1
					continue

				if agrupar_episodios and "episodes" in atributo:
					try:
						epis = int(dado[atributo])
						if epis <= 6:
							dado_filtrado[atributo] = 1
						elif epis <= 14:
							dado_filtrado[atributo] = 2
						elif epis <= 26:
							dado_filtrado[atributo] = 3
						elif epis <= 70:
							dado_filtrado[atributo] = 4
						else:
							dado_filtrado[atributo] = 5
					except:
						dado_filtrado[atributo] = dado[atributo]

			for atributo in atributos_avaliacao:
				dado_filtrado[atributo] = dado[atributo]

			dados.append(dado_filtrado)
	
	return dados

In [4]:
def carregar_dataset(usuario, f_selecao, atributos_anime = atributos_anime_padrao, atributos_avaliacao = atributos_avaliacao_padrao, agrupar_episodios = False, force_update = False):
	lista_final = filtro(usuario, f_selecao, atributos_anime, atributos_avaliacao, agrupar_episodios, force_update)

	df = pd.read_json(json.dumps(lista_final))

	target = df['user_score']
	data = df.drop('user_score', axis = 1)

	from sklearn import preprocessing
	le = preprocessing.LabelEncoder()
	for attr in data.columns:
		if data[attr].dtype == object:
			data[attr] = le.fit_transform(data[attr])
	
	return data, target

In [5]:
x, y = carregar_dataset("swordhermit", lambda d: d["status"] == 2 or d["status"] == "2", force_update=False)

[<crawler.Anime instance at 0x7f6c61e36bd8>, <crawler.Anime instance at 0x7f6c5b091cf8>, <crawler.Anime instance at 0x7f6c5b091dd0>, <crawler.Anime instance at 0x7f6c5b091d88>, <crawler.Anime instance at 0x7f6c5b091cb0>, <crawler.Anime instance at 0x7f6c5b091e18>, <crawler.Anime instance at 0x7f6c5b091e60>, <crawler.Anime instance at 0x7f6c5b0946c8>, <crawler.Anime instance at 0x7f6c5b0944d0>, <crawler.Anime instance at 0x7f6c5b0943b0>, <crawler.Anime instance at 0x7f6c5b094488>, <crawler.Anime instance at 0x7f6c5b094440>, <crawler.Anime instance at 0x7f6c5b0943f8>, <crawler.Anime instance at 0x7f6c5b094680>, <crawler.Anime instance at 0x7f6c5b0945a8>, <crawler.Anime instance at 0x7f6c5b094638>, <crawler.Anime instance at 0x7f6c5b094560>, <crawler.Anime instance at 0x7f6c5b094518>, <crawler.Anime instance at 0x7f6c5b094758>, <crawler.Anime instance at 0x7f6c5b0947a0>, <crawler.Anime instance at 0x7f6c5b094710>, <crawler.Anime instance at 0x7f6c5b094998>, <crawler.Anime instance at 0x7f

In [6]:
# MODELO 1

# Dividir conjuntos de treinamento e teste
x_train, x_test, y_train, y_test = train_test_split(    # Divide conjuntos nao-aleatoriamente
    x, y, shuffle=False)

# Treinamento da Arvore de Decisao
from sklearn import tree                    # Importa o pacote de arvore de decisao
clf = tree.DecisionTreeClassifier()         # Cria classificador
clf = clf.fit(x_train, y_train)             # Treina o classificador

# Avaliacao dos resultados
clf.score(x_test, y_test)

0.27887617065556713

In [7]:
# Visualizacao da Arvore do Modelo 1
import graphviz

cn = []
for v in sorted(set(y_train.values)):
    cn.append(str(v) + ": " + class_names[v])

dot_data = tree.export_graphviz(clf, out_file=None,
     feature_names=x.columns,
     class_names=cn,
     filled=True, rounded=True,
     special_characters=True)
graph = graphviz.Source(dot_data, format='png')
graph.render('modelo_1', view=True)

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.382707 to fit


'modelo_1.png'

In [8]:
# MODELO 2

# Dividir novamente os conjuntos de treinamento e teste
x_train, x_test, y_train, y_test = train_test_split(    # Divide conjuntos aleatoriamente
    x, y, shuffle=True)                                 # Por padrao em 75% treino e 25% teste

# Re-treina o classificador
clf = clf.fit(x_train, y_train)

# Avaliacao dos resultados
clf.score(x_test, y_test)

0.27991675338189387

In [9]:
dot_data = tree.export_graphviz(clf, out_file=None,
     feature_names=x.columns,
     class_names=cn,
     filled=True, rounded=True,
     special_characters=True)
graph = graphviz.Source(dot_data, format='png')
graph.render('modelo_2', view=True)

IndexError: list index out of range

In [None]:
# MODELO 3

# Dividir novamente os conjuntos de treinamento e teste
x_train, x_test, y_train, y_test = train_test_split(    # Divide aleatoria e estratificadamente,
    x, y, stratify=y)                                   # i.e., mantem proporcionalidade das classes

# Re-treina o classificador
clf = clf.fit(x_train, y_train)

# Avaliacao dos resultados
clf.score(x_test, y_test)

In [None]:
dot_data = tree.export_graphviz(clf, out_file=None,
     feature_names=x.columns,
     class_names=cn,
     filled=True, rounded=True,
     special_characters=True)
graph = graphviz.Source(dot_data, format='png')
graph.render('modelo_3', view=True)