## Imports

In [1]:
import pandas as pd
import re

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

# Site root, and the three forum topics that get scraped
# (one per sign-up type: anime, manga and image).
index_link = "https://myanimelist.net/"
topic_link_anime = index_link + "forum/?topicid=1812367"
topic_link_manga = index_link + "forum/?topicid=1812368"
topic_link_image = index_link + "forum/?topicid=1812369"

## HTML Request Functions

In [2]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If `is_good_response` accepts the response (status 200 with an HTML
    content type), return the raw response body (`resp.content`),
    otherwise return None.

    `timeout` (seconds) is handed to `requests.get` so that a server which
    never answers cannot block the caller forever.

    Any `RequestException` raised while requesting or reading the body is
    reported through `log_error` and turned into a None return value.
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # the response is rejected or reading the body fails.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains "html" (case-insensitive). A response without a
    Content-Type header is treated as not HTML instead of raising KeyError.
    """
    # .get() instead of [...]: the header is optional, and the None check
    # has to happen before .lower() is called on the value.
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())


def log_error(e):
    """
    Report an error.

    The error is simply printed to standard output; swap the body for
    real logging if something more durable is needed.
    """
    message = str(e)
    print(message)

## Scraper

In [3]:
def scraper(url, version, santa_list):
    """
    Scrapes one page of a forum topic and appends one row per sign-up post
    to `santa_list`.

    Parameters:
        url        -- address of the topic page to fetch.
        version    -- which form the posts contain: "img", "anime" or "manga".
        santa_list -- list of rows; it is extended in place and also returned.

    Row layout:
        "img"           -- [post_num, post_link, user_name, user_link,
                            img_type, user_skill_lvl, gift_skill_lvl, prev_imgs]
        "anime"/"manga" -- [post_num, post_link, user_name, user_link,
                            likes, dislikes, watching, completed, on_hold,
                            dropped, plan_to_watch]

    Form fields that cannot be extracted are left as empty strings.
    The post numbered '1' is skipped.

    Raises ValueError for any other `version`. If the page cannot be
    fetched, the problem is logged and `santa_list` is returned unchanged.
    """
    # Reject unknown versions up front. Previously the branch test
    # `version == "anime" or "manga"` was always true, so a misspelt
    # version silently produced rows with the wrong number of columns.
    if version not in ("img", "anime", "manga"):
        raise ValueError(
            'Unknown version {0!r}; expected "img", "anime" or "manga"'.format(version))

    # Get posts from url
    raw_html = simple_get(url)
    if raw_html is None:
        # simple_get returns None on request errors and non-HTML responses;
        # BeautifulSoup cannot parse None.
        log_error('No HTML returned for {0}; nothing scraped'.format(url))
        return santa_list
    html = BeautifulSoup(raw_html, 'html.parser')

    posts = html.find_all("div", class_="forum_border_around")

    # Parse each post for info
    for post in posts:

        links = post.find_all('a')

        post_num = links[2].get_text()
        if post_num == '1':
            continue

        post_id = links[2].attrs["href"]
        post_link = url + post_id

        user_name = links[3].strong.get_text()
        user_link = index_link + "profile/" + user_name

        # The message div id is "message" followed by the href with its
        # first four characters removed.
        post_content = post.find("div", id="message" + post_id[4:]).get_text()
        # Every "<label>: <answer>" line of the form yields one answer.
        result = re.findall(r':(.*?)\n', post_content)

        # Img specific parsing
        if version == "img":
            if len(result) < 6:
                img_type       = ""
                user_skill_lvl = ""
                gift_skill_lvl = ""
                prev_imgs      = ""
            else:
                img_type       = result[0]
                user_skill_lvl = result[3]
                gift_skill_lvl = result[4]
                prev_imgs      = result[5]
            user_form = [post_num, post_link, user_name, user_link,
                         img_type, user_skill_lvl, gift_skill_lvl, prev_imgs]

        else:  # "anime" or "manga"
            # Profile scraping is currently switched off: the request is
            # commented out, so the list statistics stay empty.
            #profile_html = simple_get(user_link)
            profile_html = None
            if profile_html is None:
                watching      = ""
                completed     = ""
                on_hold       = ""
                dropped       = ""
                plan_to_watch = ""
            else:
                profile = BeautifulSoup(profile_html, 'html.parser')

                stats_container = profile.find_all("ul", class_="stats-status")

                # The first stats list is read for anime, the second for manga.
                stats_index = 0 if version == "anime" else 1
                stats = stats_container[stats_index].find_all("span", class_="di-ib")
                watching      = stats[0].get_text()
                completed     = stats[1].get_text()
                on_hold       = stats[2].get_text()
                dropped       = stats[3].get_text()
                plan_to_watch = stats[4].get_text()

            if len(result) < 7:
                likes    = ""
                dislikes = ""
            else:
                likes    = result[1]
                dislikes = result[2]
                # The completed count stated in the form takes precedence
                # over the value read from the profile.
                completed = result[5]
            user_form = [post_num, post_link, user_name, user_link,
                         likes, dislikes, watching, completed, on_hold,
                         dropped, plan_to_watch]

        santa_list.append(user_form)
    return santa_list

## Image

In [4]:
# Image Format
#   Type of image (profile pic, about me, forum avi, sig, profile set, forum set, or surprise me): 
#   Likes (artists, characters, tags, etc.): 
#   Dislikes (artists, characters, tags, etc.): 
#   Your image editing skill level (Beginner, Intermediate, or Advanced): 
#   Would you like to receive an image of the same level? (Yes or Surprise Me): 
#   Link(s) to some previous images you've used: 
#   Link to your profile: 

# Column names, in the same order as the rows scraper() builds for "img".
header = ['post_num', 'post_link', 'user_name', 'user_link',
          'img_type', 'user_skill_lvl', 'gift_skill_lvl', 'prev_imgs']
# Seed the list with a placeholder row; it is dropped again below, which
# leaves the scraped rows labelled from 1.
santa_list_img = [[None] * len(header)]

# Pages are selected with the "show" offset, which advances by 50 per page.
page_num = 1
url = topic_link_image + "&show=" + str(50 * (page_num - 1))
scraper(url, "img", santa_list_img)

df_img = pd.DataFrame(santa_list_img, columns=header)
df_img = df_img.drop([0])
# Plain relative file name instead of r'.\...': the backslash form only means
# "current directory" on Windows and becomes part of the file name elsewhere.
# to_csv returns None when given a path; the export_csv name is kept only so
# that anything referring to it still finds it.
export_csv = df_img.to_csv('secret_santa_list_image.csv', index=False, header=True)

df_img.iloc[0]

post_num                                                          2
post_link         https://myanimelist.net/forum/?topicid=1812369...
user_name                                                   Lashkjx
user_link                   https://myanimelist.net/profile/Lashkjx
img_type                                                Profile set
user_skill_lvl                                             Advanced
gift_skill_lvl                                                  Yes
prev_imgs          My collection of images I used and my gifts f...
Name: 1, dtype: object

## Recommendation (Anime)

In [5]:
# Recommendation Format
#   Min-Max length (episodes):
#   Likes (genres, demographics, themes):
#   Dislikes (genres, demographics, themes): 
#   Any sub/stream restrictions? (e.g. must it be available on CR):
#   Can the recommendation be on your Plan to Watch list? (Yes or No):
#   Number of Completed entries on your anime list:
#   Does your list include everything you've seen (except childhood anime)?:
#   Link to anime list:

# Column names, in the same order as the rows scraper() builds for "anime".
header = ['post_num', 'post_link', 'user_name', 'user_link',
          'likes', 'dislikes', 'watching', 'completed', 'on_hold', 'dropped', 'plan_to_watch']
# Seed the list with a placeholder row; it is dropped again below, which
# leaves the scraped rows labelled from 1.
santa_list_anime = [[None] * len(header)]

# Pages are selected with the "show" offset, which advances by 50 per page.
page_num = 2
url = topic_link_anime + "&show=" + str(50 * (page_num - 1))
scraper(url, "anime", santa_list_anime)

df_anime = pd.DataFrame(santa_list_anime, columns=header)
df_anime = df_anime.drop([0])
# Plain relative file name instead of r'.\...': the backslash form only means
# "current directory" on Windows and becomes part of the file name elsewhere.
# to_csv returns None when given a path; the export_csv name is kept only so
# that anything referring to it still finds it.
export_csv = df_anime.to_csv('secret_santa_list_anime.csv', index=False, header=True)

df_anime.iloc[0]
#df_anime

post_num                                                        51
post_link        https://myanimelist.net/forum/?topicid=1812367...
user_name                                                   marzus
user_link                   https://myanimelist.net/profile/marzus
likes                                                           \r
dislikes                                             Music, Top 20
watching                                                          
completed                                                      109
on_hold                                                           
dropped                                                           
plan_to_watch                                                     
Name: 1, dtype: object

## Recommendation (Manga)

In [6]:
# Recommendation Format
#   Min-Max length (episodes):
#   Likes (genres, demographics, themes):
#   Dislikes (genres, demographics, themes): 
#   Any sub/stream restrictions? (e.g. must it be available on CR):
#   Can the recommendation be on your Plan to Watch list? (Yes or No):
#   Number of Completed entries on your anime list:
#   Does your list include everything you've seen (except childhood anime)?:
#   Link to anime list:

# Column names, in the same order as the rows scraper() builds for "manga".
header = ['post_num', 'post_link', 'user_name', 'user_link',
          'likes', 'dislikes', 'watching', 'completed', 'on_hold', 'dropped', 'plan_to_read']
# Seed the list with a placeholder row; it is dropped again below, which
# leaves the scraped rows labelled from 1.
santa_list_manga = [[None] * len(header)]

# Pages are selected with the "show" offset, which advances by 50 per page.
page_num = 1
url = topic_link_manga + "&show=" + str(50 * (page_num - 1))
scraper(url, "manga", santa_list_manga)

df_manga = pd.DataFrame(santa_list_manga, columns=header)
df_manga = df_manga.drop([0])
# Plain relative file name instead of r'.\...': the backslash form only means
# "current directory" on Windows and becomes part of the file name elsewhere.
# to_csv returns None when given a path; the export_csv name is kept only so
# that anything referring to it still finds it.
export_csv = df_manga.to_csv('secret_santa_list_manga.csv', index=False, header=True)

df_manga.iloc[0]
#df_manga

AssertionError: 7 columns passed, passed data had 11 columns