## Imports

In [1]:
import pandas as pd
import re

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

# Links to forum posts
index_link = "https://myanimelist.net/"
topic_link_img = "https://myanimelist.net/forum/?topicid=1753247"
topic_link_rec = "https://myanimelist.net/forum/?topicid=1753246"

pages_img = 3
pages_rec = 6

## HTML Request Functions

In [2]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

## Scraper

In [6]:
def scraper(url, version, santa_list):
    
    # Get posts from url
    raw_html = simple_get(url)
    html = BeautifulSoup(raw_html, 'html.parser')
    
    posts = html.find_all("div", class_="forum_border_around ")
    
    # Parse each post for info
    for i in posts: 
        
        links = i.find_all('a')

        post_num = links[2].get_text()
        if post_num == '1': continue
        
        post_id = links[2].attrs["href"]
        post_link = topic_link_img + post_id

        user_name = links[3].strong.get_text()
        user_link = index_link + "profile/" + user_name

        post_content = i.find("div", id="message"+post_id[4:]).get_text()
        result = re.findall(r':(.*?)\n', post_content)
        
        # Img specific parsing
        if version == "img":
            if len(result) < 6:
                img_type       = ""
                user_skill_lvl = ""
                gift_skill_lvl = ""
                prev_imgs      = ""
            else:
                img_type       = result[0]
                user_skill_lvl = result[3]
                gift_skill_lvl = result[4]
                prev_imgs      = result[5]
            user_form = [post_num, post_link, user_name, user_link, 
                         img_type, user_skill_lvl, gift_skill_lvl, prev_imgs]
            
        # Rec specific parsing
        if version == "rec":
            if len(result) < 8:
                rec_type      = ""
                likes         = ""
                dislikes      = ""
                num_completed = ""
            else:
                rec_type      = result[0]
                likes         = result[2]
                dislikes      = result[3]
                num_completed = result[6]
            user_form = [post_num, post_link, user_name, user_link, 
                         rec_type, likes, dislikes, num_completed]
            
        santa_list.append(user_form)
        
    return santa_list

## Image

In [7]:
# Image Format
#   Type of image (profile pic, about me, forum avi, sig, profile set, forum set, or surprise me): 
#   Likes (artists, characters, tags, etc.): 
#   Dislikes (artists, characters, tags, etc.): 
#   Your image editing skill level (Beginner, Intermediate, or Advanced): 
#   Would you like to receive an image of the same level? (Yes or Surprise Me): 
#   Link(s) to some previous images you've used: 
#   Link to your profile: 

header = ['post_num', 'post_link', 'user_name', 'user_link', 'img_type', 'user_skill_lvl', 'gift_skill_lvl', 'prev_imgs']
santa_list_img = [[None] * len(header)]

for i in range(pages_img):
    url = topic_link_img + "&show=" + str(50 * i)
    scraper(url, "img", santa_list_img)

df_img = pd.DataFrame(santa_list_img, columns=header)
df_img = df_img.drop([0])
# export_csv = df_img.to_csv (r'.\secret_santa_list.csv', index = None, header=True) 

df_img.iloc[0]

post_num                                                          2
post_link         https://myanimelist.net/forum/?topicid=1753247...
user_name                                                   Dar9586
user_link                   https://myanimelist.net/profile/Dar9586
img_type                                               Profile pic 
user_skill_lvl                                             Advanced
gift_skill_lvl                                                  Yes
prev_imgs                           https://imgur.com/gallery/gltq9
Name: 1, dtype: object

## Recommendation

In [8]:
# Recommendation Format
#   Rec type (anime, manga, either, or both):
#   Min-Max length (episodes/volumes): 
#   Likes (genres, demographics, themes): 
#   Dislikes (genres, demographics, themes):
#   Any fansub/fanscan restrictions? (i.e. must it be available on CR, in print, etc.): 
#   Can the recommendation be on your Plan to Watch list? (Yes or No): 
#   Number of Completed entries on your anime/manga list: 
#   Link to anime/manga list: 

header = ['post_num', 'post_link', 'user_name', 'user_link', 'rec_type', 'likes', 'dislikes', 'num_completed']
santa_list_rec = [[None] * len(header)]

for i in range(pages_rec):
    url = topic_link_rec + "&show=" + str(50 * i)
    scraper(url, "rec", santa_list_rec)

df_rec = pd.DataFrame(santa_list_rec, columns=header)
df_rec = df_rec.drop([0])
# export_csv = df_rec.to_csv (r'.\secret_santa_list.csv', index = None, header=True) 

df_rec.iloc[0]

post_num                                                         2
post_link        https://myanimelist.net/forum/?topicid=1753247...
user_name                                          ScarredSceptile
user_link          https://myanimelist.net/profile/ScarredSceptile
rec_type                                                          
likes                                                             
dislikes                                                          
num_completed                                                     
Name: 1, dtype: object