# **Extract MyAnimeList Data**

In [1]:
import numpy as np
import pandas as pd

#for api
import requests
import time
import csv
import os

## Check Features Extracted

see the columns that can be extracted out using one anime first

In [2]:
url = f"https://api.jikan.moe/v4/anime/?q=Naruto&limit=1"
response = requests.get(url)
data = response.json()

#print the data out
data

{'pagination': {'last_visible_page': 30,
  'has_next_page': True,
  'current_page': 1,
  'items': {'count': 1, 'total': 30, 'per_page': 1}},
 'data': [{'mal_id': 20,
   'url': 'https://myanimelist.net/anime/20/Naruto',
   'images': {'jpg': {'image_url': 'https://cdn.myanimelist.net/images/anime/1141/142503.jpg',
     'small_image_url': 'https://cdn.myanimelist.net/images/anime/1141/142503t.jpg',
     'large_image_url': 'https://cdn.myanimelist.net/images/anime/1141/142503l.jpg'},
    'webp': {'image_url': 'https://cdn.myanimelist.net/images/anime/1141/142503.webp',
     'small_image_url': 'https://cdn.myanimelist.net/images/anime/1141/142503t.webp',
     'large_image_url': 'https://cdn.myanimelist.net/images/anime/1141/142503l.webp'}},
   'trailer': {'youtube_id': None,
    'url': None,
    'embed_url': None,
    'images': {'image_url': None,
     'small_image_url': None,
     'medium_image_url': None,
     'large_image_url': None,
     'maximum_image_url': None}},
   'approved': True,

In [13]:
#get the anime id
def get_anime_id(anime_name):
    url = f"https://api.jikan.moe/v4/anime/?q={anime_name}&limit=1"
    response = requests.get(url)
    data = response.json()
    
    if 'data' in data and data['data']:
        anime_id = data['data'][0]['mal_id']
        return anime_id
    else:
        print("Anime not found.")
        return None

# Example usage:
anime_name = "Naruto"
anime_id = get_anime_id(anime_name)
print("Anime ID:", anime_id)

Anime ID: 20


In [16]:
def get_anime_features(anime_id):
    url = f"https://api.jikan.moe/v4/anime/{anime_id}/full"
    response = requests.get(url)
    details = response.json()

    if(response.status_code == 200):
        for col in details['data'].keys():
            print(col)
    else:
        print("Anime Not Found")
    

# Example usage:
print("All the features available:")
anime_features = get_anime_features(anime_id)

All the features available:
mal_id
url
images
trailer
approved
titles
title
title_english
title_japanese
title_synonyms
type
source
episodes
status
airing
aired
duration
rating
score
scored_by
rank
popularity
members
favorites
synopsis
background
season
year
broadcast
producers
licensors
studios
genres
explicit_genres
themes
demographics
relations
theme
external
streaming


## Extract Anime Data Using Jikan API

In [17]:
# Global variables for rate limiting
REQUESTS_PER_MINUTE = 60
REQUESTS_PER_SECOND = 3
MINUTE = 60

# Track the time of last request
last_request_time = time.time()

def get_anime_details(start_id, end_id):
    anime_data = []

    global last_request_time  # Accessing global variable

    for anime_id in range(start_id, end_id):
        # Check if we need to wait before making the next request
        elapsed_time = time.time() - last_request_time
        if elapsed_time < 1 / REQUESTS_PER_SECOND:
            time.sleep((1 / REQUESTS_PER_SECOND) - elapsed_time)
        
        url = f"https://api.jikan.moe/v4/anime/{anime_id}/full"
        response = requests.get(url)

        print(f'Status ({anime_id}):', response.status_code)

        if response.status_code == 200:
            anime_details = response.json()['data']
            anime_data.append(anime_details)
            last_request_time = time.time()  # Update last request time
        else:
            print(f"Anime ID {anime_id} Not Found")

    return anime_data

def write_anime_to_csv(anime_data, file_name):
    with open(file_name, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = [
            "mal_id", "url", "images", "trailer", "approved", "titles", "title", 
            "title_english", "title_japanese", "title_synonyms", "type", "source", 
            "episodes", "status", "airing", "aired", "duration", "rating", "score", 
            "scored_by", "rank", "popularity", "members", "favorites", "synopsis", 
            "background", "season", "year", "broadcast", "producers", "licensors", 
            "studios", "genres", "explicit_genres", "themes", "demographics", 
            "relations", "theme", "external", "streaming"
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for anime in anime_data:
            row = {}
            for field in fieldnames:
                if field in anime:
                    if isinstance(anime[field], dict):
                        row[field] = anime[field] if anime[field] else ''
                    elif isinstance(anime[field], list):
                        if all(isinstance(item, dict) for item in anime[field]):
                            row[field] = ','.join([item.get("name", "") for item in anime[field]]) if anime[field] else ''
                        else:
                            row[field] = anime[field]
                    else:
                        row[field] = anime[field]
                else:
                    row[field] = ''

            writer.writerow(row)  # Writing each anime as a row in the CSV

# Example usage:
chunk_size = 5000
total_anime_ids = 60000
num_chunks = total_anime_ids // chunk_size

for i in range(num_chunks):
    start_id = i * chunk_size
    end_id = min((i + 1) * chunk_size, total_anime_ids)
    anime_data = get_anime_details(start_id, end_id)
    if anime_data:
        file_name = f'anime_data_{start_id}_{end_id}.csv'
        write_anime_to_csv(anime_data, file_name)
        print(f"Anime data for IDs {start_id} to {end_id} written to {file_name}.")


Status (0): 400
Anime ID 0 Not Found
Status (1): 200
Status (2): 404
Anime ID 2 Not Found
Status (3): 404
Anime ID 3 Not Found
Status (4): 404
Anime ID 4 Not Found
Status (5): 200
Status (6): 200
Status (7): 200
Status (8): 200
Status (9): 404
Anime ID 9 Not Found
Status (10): 404
Anime ID 10 Not Found
Status (11): 404
Anime ID 11 Not Found
Status (12): 404
Anime ID 12 Not Found
Status (13): 404
Anime ID 13 Not Found
Status (14): 404
Anime ID 14 Not Found
Status (15): 200
Status (16): 200
Status (17): 200
Status (18): 200
Status (19): 200
Status (20): 200
Status (21): 200
Status (22): 200
Status (23): 200
Status (24): 200
Status (25): 200
Status (26): 200
Status (27): 200
Status (28): 200
Status (29): 200
Status (30): 200
Status (31): 200
Status (32): 200
Status (33): 200
Status (34): 404
Anime ID 34 Not Found
Status (35): 404
Anime ID 35 Not Found
Status (36): 404
Anime ID 36 Not Found
Status (37): 404
Anime ID 37 Not Found
Status (38): 404
Anime ID 38 Not Found
Status (39): 404
Anime

## Merge All The Files

In [25]:
# Path to the folder containing CSV files
folder_path = 'data'

# List of CSV files to merge in the specified order
csv_files = ['anime_data_0_5000.csv', 'anime_data_5000_10000.csv', 'anime_data_10000_15000.csv', 'anime_data_15000_20000.csv', 'anime_data_20000_25000.csv', 'anime_data_25000_30000.csv', 'anime_data_30000_35000.csv', 'anime_data_35000_40000.csv', 'anime_data_40000_45000.csv', 'anime_data_45000_50000.csv', 'anime_data_50000_55000.csv', 'anime_data_55000_60000.csv']

# Initialize an empty DataFrame to store merged data
merged_df = pd.DataFrame()

# Loop through each CSV file and merge its data into the final DataFrame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    merged_df = pd.concat([merged_df, df], ignore_index=True)

# Write the merged DataFrame to a new CSV file
merged_df.to_csv('data/anime_final.csv', index=False)

print("Merged CSV file 'anime_final.csv' has been created.")


Merged CSV file 'anime_final.csv' has been created.


## Ensure The Files Is Correctly Merged

In [26]:
# Path to the folder containing CSV files
folder_path = 'data'

# List of CSV files to check the length
csv_files = ['anime_data_0_5000.csv', 'anime_data_5000_10000.csv', 'anime_data_10000_15000.csv', 'anime_data_15000_20000.csv', 'anime_data_20000_25000.csv', 'anime_data_25000_30000.csv', 'anime_data_30000_35000.csv', 'anime_data_35000_40000.csv', 'anime_data_40000_45000.csv', 'anime_data_45000_50000.csv', 'anime_data_50000_55000.csv', 'anime_data_55000_60000.csv']

total_length = 0

# Loop through each CSV file and print its length
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    file_length = len(df)
    print(f"Length of {file}: {file_length}")
    total_length += file_length

print(f"Total length of all CSV files: {total_length}")

Length of anime_data_0_5000.csv: 3901
Length of anime_data_5000_10000.csv: 2105
Length of anime_data_10000_15000.csv: 1177
Length of anime_data_15000_20000.csv: 993
Length of anime_data_20000_25000.csv: 1045
Length of anime_data_25000_30000.csv: 1024
Length of anime_data_30000_35000.csv: 2430
Length of anime_data_35000_40000.csv: 3081
Length of anime_data_40000_45000.csv: 2699
Length of anime_data_45000_50000.csv: 3052
Length of anime_data_50000_55000.csv: 2751
Length of anime_data_55000_60000.csv: 2625
Total length of all CSV files: 26883


In [27]:
# Path to the merged CSV file
merged_file_path = 'data/anime_final.csv'

# Read the merged CSV file
merged_df = pd.read_csv(merged_file_path)

# Print the length of the merged DataFrame
print(f"Length of merged file: {len(merged_df)}")


Length of merged file: 26883


#### Extract Other Info From Another API (MAL API)

In [30]:
#get all the anime ids
def read_anime_ids(file_name):
    anime_ids = []
    try:
        with open(file_name, mode='r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            # Extract mal_id from each row
            anime_ids = [row['mal_id'] for row in reader if 'mal_id' in row]
    except FileNotFoundError:
        print(f"File not found: {file_name}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return anime_ids

# Example usage
file_name = 'data/anime_final.csv'
anime_ids = read_anime_ids(file_name)
print(anime_ids)  # This prints the list of all mal_ids


['1', '5', '6', '7', '8', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '71', '72', '73', '74', '75', '76', '77', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '170', '171', '173', '174', '175', '176', '177'

In [31]:
# Global variables for rate limiting
REQUESTS_PER_MINUTE = 60
REQUESTS_PER_SECOND = 3

# Track the time of last request
last_request_time = time.time()

def get_anime_details(anime_ids):
    anime_data = []
    global last_request_time  # Accessing global variable

    for mal_id in anime_ids:
        attempts = 0
        max_attempts = 5
        while attempts < max_attempts:
            # Check if we need to wait before making the next request
            elapsed_time = time.time() - last_request_time
            if elapsed_time < 1 / REQUESTS_PER_SECOND:
                time.sleep((1 / REQUESTS_PER_SECOND) - elapsed_time)

            url = f'https://api.myanimelist.net/v2/anime/{mal_id}'
            fields = 'id,title,start_date,end_date,statistics'
            headers = {'X-MAL-CLIENT-ID': 'ddc9a53b7d2f6c371c6c10f1934fe657'}
            response = requests.get(url, headers=headers, params={'fields': fields})
            print(f'Status ({mal_id}):', response.status_code)

            if response.status_code == 200:
                anime_details = response.json()
                anime_data.append(anime_details)
                last_request_time = time.time()  # Update last request time
                break  # Exit the retry loop since the request was successful
            elif response.status_code == 504:
                attempts += 1
                time.sleep(10)  # Wait for 10 seconds before retrying
                print(f"Retrying ({attempts}/{max_attempts}) after 504 Gateway Timeout for Anime ID {mal_id}.")
            else:
                print(f"Anime ID {mal_id} Not Found or Error with status code {response.status_code}")
                break  # Stop trying if the error is not a 504

    return anime_data

def write_anime_to_csv(anime_data, file_name):
    fieldnames = ['id', 'title', 'start_date', 'end_date', 'statistics']
    with open(file_name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for anime in anime_data:
            writer.writerow({field: anime.get(field, '') for field in fieldnames})


chunk_size = 2000  # Set chunk size to 2000
num_chunks = len(anime_ids) // chunk_size + (1 if len(anime_ids) % chunk_size else 0)

for i in range(num_chunks):
    start_index = i * chunk_size
    end_index = min((i + 1) * chunk_size, len(anime_ids))
    current_ids = anime_ids[start_index:end_index]
    anime_data = get_anime_details(current_ids)
    if anime_data:
        file_name = f'data/mal_data_{start_index}_{end_index}.csv'
        write_anime_to_csv(anime_data, file_name)
        print(f"Anime data for IDs {start_index} to {end_index} written to {file_name}.")


Status (1): 200
Status (5): 200
Status (6): 200
Status (7): 200
Status (8): 200
Status (15): 200
Status (16): 200
Status (17): 200
Status (18): 200
Status (19): 200
Status (20): 200
Status (21): 200
Status (22): 200
Status (23): 200
Status (24): 200
Status (25): 200
Status (26): 200
Status (27): 200
Status (28): 200
Status (29): 200
Status (30): 200
Status (31): 200
Status (32): 200
Status (33): 200
Status (43): 200
Status (44): 200
Status (45): 200
Status (46): 200
Status (47): 200
Status (48): 200
Status (49): 200
Status (50): 200
Status (51): 200
Status (52): 200
Status (53): 200
Status (54): 200
Status (55): 200
Status (56): 200
Status (57): 200
Status (58): 200
Status (59): 200
Status (60): 200
Status (61): 200
Status (62): 200
Status (63): 200
Status (64): 200
Status (65): 200
Status (66): 200
Status (67): 200
Status (68): 200
Status (69): 200
Status (71): 200
Status (72): 200
Status (73): 200
Status (74): 200
Status (75): 200
Status (76): 200
Status (77): 200
Status (79): 200
St

#### Merge All The Files For MAL

In [92]:
# Path to the folder containing CSV files
folder_path = 'data'

# List of CSV files to merge in the specified order
csv_files = ['mal_data_0_2000.csv', 'mal_data_2000_4000.csv', 'mal_data_4000_6000.csv', 'mal_data_6000_8000.csv', 'mal_data_8000_10000.csv', 'mal_data_10000_12000.csv', 'mal_data_12000_14000.csv', 'mal_data_14000_16000.csv', 'mal_data_16000_18000.csv', 'mal_data_18000_20000.csv', 'mal_data_20000_22000.csv', 'mal_data_22000_24000.csv', 'mal_data_24000_26000.csv', 'mal_data_26000_26883.csv']

# Initialize an empty DataFrame to store merged data
merged_df = pd.DataFrame()

# Loop through each CSV file and merge its data into the final DataFrame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    merged_df = pd.concat([merged_df, df], ignore_index=True)

# Write the merged DataFrame to a new CSV file
merged_df.to_csv('data/mal_final.csv', index=False)

print("Merged CSV file 'mal_final.csv' has been created.")



Merged CSV file 'mal_final.csv' has been created.


#### Print the length of the mal dataset

In [95]:
# Path to the merged CSV file
merged_file_path = 'data/mal_final.csv'

# Read the merged CSV file
merged_df = pd.read_csv(merged_file_path)

# Print the length of the merged DataFrame
print(f"Length of merged file: {len(merged_df)}")

Length of merged file: 26883


#### Combine the MAL and Jikan Data Together

In [97]:
import pandas as pd

# Load the datasets
anime_final_df = pd.read_csv('data/anime_final.csv')
mal_final_df = pd.read_csv('data/mal_final.csv')

# Merge the datasets on the specified ID columns
# Assuming 'mal_id' in anime_final.csv corresponds to 'id' in mal_final.csv
combined_df = pd.merge(anime_final_df, mal_final_df, left_on='mal_id', right_on='id', how='inner')

# Save the combined dataframe to a new CSV file
combined_df.to_csv('final_data/anime_merge_final.csv', index=False)

Now we have the final merged dataset called anime_merge_final.csv., we can now finally do data cleaning on it