### DSC 478 - Programming Machine Learning A
### Team Project: Code Sages

### June 9, 2024

### Dataset: Anime Recommendations Database
### This source contains 2 datasets: anime.csv, rating.csv

#### https://www.kaggle.com/code/hasibalmuzdadid/anime-ratings-analysis-recommender-system/input

### Preprocessing performed by: Ken Vellian
### This Notebook explores, cleans, preprocesses, and combines the datasets
### Instructions on how to run this Notebook below

In [1]:
# Import statements up to Week 9

# Numerical Operations and Data Manipulation 
import pandas as pd
import numpy as np
import numpy.linalg as la
from collections import Counter
import math
import random
 
# Linear Algebra and Matrix Operations
from scipy.sparse import csr_matrix
from scipy.linalg import svd
from scipy.sparse.linalg import svds

# Data Preprocessing and Machine Learning 
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import silhouette_samples, confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, completeness_score, homogeneity_score
from sklearn.feature_extraction import DictVectorizer, text
from sklearn import preprocessing, model_selection, decomposition, feature_selection
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Plotting and Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
import graphviz
from IPython.display import Image

# Time-Related Functions
import time

# Miscellaneous
from operator import itemgetter, attrgetter
import importlib
import re

In [2]:
# # Setting pandas to display all rows and  columns values
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)

# Instructions on how to run this Notebook:
## 1. Download the anime.csv, and rating.csv files from the Kaggle source listed above.
## 2. In the cell below, comment/uncomment the 4 'anime_pathname', and 'rating_pathname' lines and add the file location of the 2 datasets.
## 3. In the last cell of this Notebook, comment/uncomment the 2 'write_pathname_full' lines and add the file location to write the fully merged/cleaned dataset to be used for other parts of this project.
## 4. Run the Notebook.
## 5. Note: Due to the large dataset, this Notebook has an extremely long runtime.

In [3]:
# Loading data
# Reads the two Kaggle CSV files into the DataFrames `anime` and `rating`
# that every later cell in this Notebook works from.

# FULL DATASET
# NOTE(review): these are absolute paths on the original author's machine; point them at
# your own copies of the files (or use the placeholder lines below) before running.
anime_pathname = '/Users/kvellian/Desktop/DSC_478_Group_Project/anime.csv'
rating_pathname = '/Users/kvellian/Desktop/DSC_478_Group_Project/rating.csv'

# anime_pathname = 'ADD PATHNAME HERE'
# rating_pathname = 'ADD PATHNAME HERE'

# anime.csv: one row per anime title; rating.csv: one row per (user_id, anime_id) rating
anime = pd.read_csv(anime_pathname)
rating = pd.read_csv(rating_pathname)

## Data Exploration: anime.csv

In [4]:
# Display basic data information
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
# Count missing values in each column
missing_values_anime = anime.isna().sum()
print(missing_values_anime)

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [6]:
anime.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [7]:
anime['type'].value_counts()

type
TV         3787
OVA        3311
Movie      2348
Special    1676
ONA         659
Music       488
Name: count, dtype: int64

In [8]:
anime['rating'].describe()

count    12064.000000
mean         6.473902
std          1.026746
min          1.670000
25%          5.880000
50%          6.570000
75%          7.180000
max         10.000000
Name: rating, dtype: float64

In [9]:
anime['members'].describe()

count    1.229400e+04
mean     1.807134e+04
std      5.482068e+04
min      5.000000e+00
25%      2.250000e+02
50%      1.550000e+03
75%      9.437000e+03
max      1.013917e+06
Name: members, dtype: float64

## Data Preprocessing: anime.csv
## Cleaning the 335 anime names that contain non-English (non-ASCII) characters with regular expressions

In [10]:
# Flagging every title that contains at least one character outside the ASCII range,
# using the vectorised string matcher with the same regular expression
name_has_non_ascii = anime['name'].str.contains(r'[^\x00-\x7F]', regex=True)
non_english_char_names = anime[name_has_non_ascii]

non_english_char_names

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
41,32366,Gintama°: Aizome Kaori-hen,"Comedy, Parody",OVA,2,8.69,16947
59,11577,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,"Sci-Fi, Thriller",Movie,1,8.61,192424
96,9756,Mahou Shoujo Madoka★Magica,"Drama, Magic, Psychological, Thriller",TV,12,8.51,462974
102,11981,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,"Drama, Magic, Psychological, Thriller",Movie,1,8.50,135735
...,...,...,...,...,...,...,...
12106,4034,Sailor Senshi Venus♥Five,"Hentai, Parody, Super Power",OVA,2,5.53,909
12170,4818,Houkago Renai Club: Koi no Étude,Hentai,OVA,2,5.39,605
12179,13917,Star☆Jewel Gaiden: Natsumi Oblivion,"Hentai, Yuri",OVA,1,5.35,883
12232,3541,Kouin Tenshi: Haitoku no Lycéenne,Hentai,OVA,1,4.99,652


In [11]:
len(non_english_char_names)

335

In [12]:
# Helper that returns the set of all non English (non-ASCII) characters found in a string
def find_non_english_chars(text):
    """Return the set of characters in `text` whose code point is above 0x7F.

    Parameters
    ----------
    text : str
        The string to scan (here: one value of the 'name' column).

    Returns
    -------
    set of str
        Every distinct non-ASCII character in `text`; empty if there are none.
    """
    return {char for char in text if ord(char) > 0x7F}

In [13]:
# Running find_non_english_chars() on every value of the 'name' column and taking the
# union of the per-title sets, which yields all non English characters in the dataset
per_title_chars = anime['name'].map(find_non_english_chars)
non_english_chars = set().union(*per_title_chars)

# Displaying the set of unique non English characters
non_english_chars

{'®',
 '°',
 '²',
 '³',
 '½',
 'Ä',
 'É',
 'Ü',
 'ß',
 'à',
 'â',
 'ä',
 'è',
 'é',
 'ö',
 'ü',
 'ă',
 'ē',
 'š',
 'Δ',
 'Ψ',
 'μ',
 '“',
 '”',
 '†',
 '…',
 '℃',
 '←',
 '→',
 '√',
 '∞',
 '␣',
 '◎',
 '◯',
 '★',
 '☆',
 '♡',
 '♥',
 '♪',
 '♭',
 '＊'}

In [14]:
# Creating a dictionary to store the replacements for the non English characters
# Keys are the substrings to look for; values are the ASCII text that replaces them.
# The first six entries are multi-character patterns (HTML entities and the '.hack//' prefix);
# the remaining entries are the single non English characters found in the 'name' column.
# NOTE(review): clean_text() applies these entries in dictionary (insertion) order, so
# '&#039;' is replaced before 'A&#039;s' and 'I&#039;' are tried. Those two later keys can
# therefore never match. Confirm whether they were meant to take effect (then they must
# come first) or can be removed.
replacements = {
    '&quot;': '',
    '&#039;': "'",
    '&amp;': 'and',
    '.hack//': '',
    'A&#039;s': '',
    'I&#039;': "I'",
    '®': '',
    '°': '',
    '²': '',
    '³': '',
    '½': '',
    'Ä': 'A',
    'É': 'E',
    'Ü': 'U',
    'ß': 'ss',
    'à': 'a',
    'â': 'a',
    'ä': 'a',
    'è': 'e',
    'é': 'e',
    'ö': 'o',
    'ü': 'u',
    'ă': 'a',
    'ē': 'e',
    'š': 's',
    'Δ': 'Delta',
    'Ψ': 'Psi',
    'μ': 'mu',
    '“': '"',
    '”': '"',
    '†': '',
    '…': '...',
    '℃': 'C',
    '←': '<-',
    '→': '->',
    '√': 'sqrt',
    '∞': 'infinity',
    '␣': ' ',
    '◎': '',
    '◯': '',
    '★': '',
    '☆': '',
    '♡': '',
    '♥': '',
    '♪': '',
    '♭': '',
    '＊': '*'
}

In [15]:
# The replacements dictionary has 47 entries: the 41 unique non English characters found above
# plus 6 multi-character patterns (HTML entities such as '&quot;' and the '.hack//' prefix)
len(replacements)

47

In [16]:
# clean_text() swaps each known non English character / pattern for its assigned replacement
def clean_text(text, replacements):
    """Apply every replacement to `text`, then strip any remaining non-ASCII characters.

    Parameters
    ----------
    text : str
        The string to clean.
    replacements : dict of str -> str
        Maps a substring to the text that replaces it. Entries are applied one after
        another in the dictionary's iteration order, so an earlier entry can change
        what a later entry sees.

    Returns
    -------
    str
        The cleaned string, containing only characters in the range \\x00-\\x7F.
    """
    cleaned = text
    for target, substitute in replacements.items():
        cleaned = cleaned.replace(target, substitute)
    # Safety net: drop any non-ASCII character that had no entry in `replacements`
    return re.sub(r'[^\x00-\x7F]+', '', cleaned)

In [17]:
# Applying clean_text() to the 'name' column
anime['name'] = anime['name'].apply(lambda x: clean_text(x, replacements))

In [18]:
# Re-running the non-ASCII check on the cleaned 'name' column
still_has_non_ascii = anime['name'].str.contains(r'[^\x00-\x7F]', regex=True)
cleaned_non_english_names = anime[still_has_non_ascii]

# An empty result means every name is now free of non English characters
cleaned_non_english_names

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members


In [19]:
# Collecting the non English characters that are still present in the cleaned 'name' column
remaining_per_title = anime['name'].map(find_non_english_chars)
cleaned_non_english_chars = set().union(*remaining_per_title)

# The empty set confirms our results
cleaned_non_english_chars

set()

## Data Exploration: anime.csv
## Identifying all unique genres

In [20]:
# Many shows have many genres.
anime['genre'].value_counts()

genre
Hentai                                                  823
Comedy                                                  523
Music                                                   301
Kids                                                    199
Comedy, Slice of Life                                   179
                                                       ... 
Adventure, Drama, Fantasy, Game, Sci-Fi                   1
Adventure, Demons, Fantasy, Historical                    1
Action, Comedy, Drama, Mecha, Music, Sci-Fi, Shounen      1
Action, Comedy, Fantasy, Mecha, Sci-Fi, Shounen           1
Hentai, Slice of Life                                     1
Name: count, Length: 3264, dtype: int64

In [21]:
# Each 'genre' value is a ', '-separated list; splitting it and exploding the lists
# gives one genre label per row, from which the unique labels are collected
genre_labels = anime['genre'].dropna().str.split(', ').explode()
all_genres = set(genre_labels)

# Alphabetical order for easier reading
sorted_genres = sorted(all_genres)
sorted_genres

['Action',
 'Adventure',
 'Cars',
 'Comedy',
 'Dementia',
 'Demons',
 'Drama',
 'Ecchi',
 'Fantasy',
 'Game',
 'Harem',
 'Hentai',
 'Historical',
 'Horror',
 'Josei',
 'Kids',
 'Magic',
 'Martial Arts',
 'Mecha',
 'Military',
 'Music',
 'Mystery',
 'Parody',
 'Police',
 'Psychological',
 'Romance',
 'Samurai',
 'School',
 'Sci-Fi',
 'Seinen',
 'Shoujo',
 'Shoujo Ai',
 'Shounen',
 'Shounen Ai',
 'Slice of Life',
 'Space',
 'Sports',
 'Super Power',
 'Supernatural',
 'Thriller',
 'Vampire',
 'Yaoi',
 'Yuri']

In [22]:
# There are a total of 43 unique genres from the anime.csv
len(sorted_genres)

43

## Data Preprocessing: anime.csv
## Creating dummy variables for 'genre' and 'type'

In [23]:
# Replacing missing values in the 'genre' column with "Unknown".
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on anime['genre']: that form is a chained in-place update,
# which raises a FutureWarning in pandas 2.x and does not modify `anime` at all once
# Copy-on-Write is active. The 'genre_Unknown' column would then silently disappear.
anime['genre'] = anime['genre'].fillna('Unknown')

# Creating one 0/1 dummy column per genre label (labels are separated by ', '),
# each named with a 'genre_' prefix
genre_dummies = anime['genre'].str.get_dummies(sep = ', ').add_prefix('genre_')

# Merging dummy variables (column-wise; both frames share the same index)
anime_with_genres = pd.concat([anime, genre_dummies], axis = 1)

# Displaying
anime_with_genres.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,...,genre_Slice of Life,genre_Space,genre_Sports,genre_Super Power,genre_Supernatural,genre_Thriller,genre_Unknown,genre_Vampire,genre_Yaoi,genre_Yuri
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,9969,Gintama',"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Filling in missing values in the 'type' column with 'Unknown'.
# Assigned back to the column rather than using fillna(..., inplace=True) on the column
# selection, which is a chained in-place update that pandas Copy-on-Write does not
# propagate to anime_with_genres (FutureWarning in pandas 2.x).
anime_with_genres['type'] = anime_with_genres['type'].fillna('Unknown')

# Creating dummy variables for the 'type' column including 'Unknown'
# (.astype(int) turns the boolean dummies into 0/1 integers)
type_dummies = pd.get_dummies(anime_with_genres['type'], prefix='type').astype(int)

# Merging dummy variables with anime_with_genres
anime_with_types = pd.concat([anime_with_genres, type_dummies], axis = 1)

## Data Cleaning: anime.csv
### Dropping genre and type columns now that dummy variables are made.

In [25]:
# The original text columns are redundant now that their dummy columns exist
genre_type_columns = ['genre', 'type']

# Removing them column-wise; the result is a new DataFrame
anime_cleaned = anime_with_types.drop(genre_type_columns, axis = 'columns')


## Data Cleaning: anime.csv
### Dropping empty rows in rating column
### Replacing 'Unknown' in the episodes column with -1

In [26]:
# Dropping rows where rating is missing (NaN)
anime_cleaned = anime_cleaned.dropna(subset=['rating'])


In [27]:
# Replacing 'Unknown' with -1 and converting the 'episodes' column to int64.
# Done as one assign() that returns a new DataFrame instead of calling
# replace(..., inplace=True) on anime_cleaned['episodes']: that chained in-place update
# is not propagated under pandas Copy-on-Write, so 'Unknown' would survive and the
# int64 conversion would raise.
anime_cleaned = anime_cleaned.assign(
    episodes = anime_cleaned['episodes'].replace('Unknown', -1).astype('int64')
)


In [28]:
# Count missing values in each column
missing_values_anime = anime_cleaned.isna().sum()
missing_values_anime

anime_id               0
name                   0
episodes               0
rating                 0
members                0
genre_Action           0
genre_Adventure        0
genre_Cars             0
genre_Comedy           0
genre_Dementia         0
genre_Demons           0
genre_Drama            0
genre_Ecchi            0
genre_Fantasy          0
genre_Game             0
genre_Harem            0
genre_Hentai           0
genre_Historical       0
genre_Horror           0
genre_Josei            0
genre_Kids             0
genre_Magic            0
genre_Martial Arts     0
genre_Mecha            0
genre_Military         0
genre_Music            0
genre_Mystery          0
genre_Parody           0
genre_Police           0
genre_Psychological    0
genre_Romance          0
genre_Samurai          0
genre_School           0
genre_Sci-Fi           0
genre_Seinen           0
genre_Shoujo           0
genre_Shoujo Ai        0
genre_Shounen          0
genre_Shounen Ai       0
genre_Slice of Life    0


In [29]:
# Display basic data information
anime_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12064 entries, 0 to 12293
Data columns (total 56 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   anime_id             12064 non-null  int64  
 1   name                 12064 non-null  object 
 2   episodes             12064 non-null  int64  
 3   rating               12064 non-null  float64
 4   members              12064 non-null  int64  
 5   genre_Action         12064 non-null  int64  
 6   genre_Adventure      12064 non-null  int64  
 7   genre_Cars           12064 non-null  int64  
 8   genre_Comedy         12064 non-null  int64  
 9   genre_Dementia       12064 non-null  int64  
 10  genre_Demons         12064 non-null  int64  
 11  genre_Drama          12064 non-null  int64  
 12  genre_Ecchi          12064 non-null  int64  
 13  genre_Fantasy        12064 non-null  int64  
 14  genre_Game           12064 non-null  int64  
 15  genre_Harem          12064 non-null  int6

## Data Exploration: rating.csv


In [30]:
missing_values_rating = rating.isna().sum()
missing_values_rating

user_id     0
anime_id    0
rating      0
dtype: int64

In [31]:
# Display basic data information
# 7.8 million rows
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [32]:
# Display summary statistics
rating.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [33]:
# There are over 1.4 million -1s.
rating['rating'].value_counts()

rating
 8     1646019
-1     1476496
 7     1375287
 9     1254096
 10     955715
 6      637775
 5      282806
 4      104291
 3       41453
 2       23150
 1       16649
Name: count, dtype: int64

## Data Cleaning: rating.csv
### Dropping duplicate rows and rows where the rating is -1

In [34]:
# Counting the rows that are exact duplicates of an earlier row
dup_rating = int(rating.duplicated().sum())
dup_rating

1

In [35]:
# Keeping the first occurrence of each duplicated row, then checking the row count
rating = rating.drop_duplicates(keep='first')
len(rating)

7813736

In [36]:
# Keeping only the rows whose rating is not -1
has_real_rating = rating['rating'].ne(-1)
rating = rating.loc[has_real_rating]

rating.describe()

Unnamed: 0,user_id,anime_id,rating
count,6337240.0,6337240.0,6337240.0
mean,36747.91,8902.865,7.808497
std,21013.4,8882.0,1.572496
min,1.0,1.0,1.0
25%,18984.0,1239.0,7.0
50%,36815.0,6213.0,8.0
75%,54873.0,14075.0,9.0
max,73516.0,34475.0,10.0


In [37]:
# -1s have been removed
rating['rating'].value_counts()

rating
8     1646018
7     1375287
9     1254096
10     955715
6      637775
5      282806
4      104291
3       41453
2       23150
1       16649
Name: count, dtype: int64

In [38]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10


## Data Preprocessing: anime.csv, rating.csv.
### Joining tables and dropping inappropriate genres

In [39]:
# Both datasets have a column called 'rating'; giving each a distinct name before merging
# so the per-anime average and the per-user score stay distinguishable.

anime_cleaned = anime_cleaned.rename(columns = {'rating': 'average_rating'})
rating = rating.rename(columns = {'rating': 'user_rating'})


In [40]:
# Inner join on 'anime_id': keeps only ratings whose anime is present in anime_cleaned
anime_rating_merged = anime_cleaned.merge(rating, on = 'anime_id', how = 'inner')

In [41]:
anime_rating_merged.head()

Unnamed: 0,anime_id,name,episodes,average_rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,...,genre_Yuri,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,type_Unknown,user_id,user_rating
0,32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,...,0,1,0,0,0,0,0,0,99,5
1,32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,...,0,1,0,0,0,0,0,0,152,10
2,32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,...,0,1,0,0,0,0,0,0,244,10
3,32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,...,0,1,0,0,0,0,0,0,271,10
4,32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,...,0,1,0,0,0,0,0,0,322,10


## Data Cleaning: df_anime_rating
### Removing rows with unwanted genres

In [42]:
# Share of merged rows flagged with at least one of the five unwanted genres
unwanted_flags = anime_rating_merged[['genre_Ecchi', 'genre_Harem', 'genre_Hentai', 'genre_Yaoi', 'genre_Yuri']]
inappropriate_rows = anime_rating_merged[unwanted_flags.eq(1).any(axis = 1)]
percentage = (len(inappropriate_rows) / len(anime_rating_merged)) * 100

print(f"Percentage of rows with inappropriate genre : {percentage:.2f}%")

Percentage of rows with inappropriate genre : 17.20%


### About 17.2% of the data consists of unwanted genres that we are dropping.

In [43]:
# Keeping only the rows where none of the five unwanted genre flags equals 1
has_no_unwanted_genre = anime_rating_merged[['genre_Ecchi', 'genre_Harem', 'genre_Hentai', 'genre_Yaoi', 'genre_Yuri']].ne(1).all(axis = 1)
df_anime_rating = anime_rating_merged[has_no_unwanted_genre]


In [44]:
# The flag columns of the removed genres carry no information any more, so they are dropped
unwanted_genre_columns = ['genre_Ecchi', 'genre_Harem', 'genre_Hentai', 'genre_Yaoi', 'genre_Yuri']
df_anime_rating = df_anime_rating.drop(unwanted_genre_columns, axis = 'columns')

In [45]:
df_anime_rating.columns

Index(['anime_id', 'name', 'episodes', 'average_rating', 'members',
       'genre_Action', 'genre_Adventure', 'genre_Cars', 'genre_Comedy',
       'genre_Dementia', 'genre_Demons', 'genre_Drama', 'genre_Fantasy',
       'genre_Game', 'genre_Historical', 'genre_Horror', 'genre_Josei',
       'genre_Kids', 'genre_Magic', 'genre_Martial Arts', 'genre_Mecha',
       'genre_Military', 'genre_Music', 'genre_Mystery', 'genre_Parody',
       'genre_Police', 'genre_Psychological', 'genre_Romance', 'genre_Samurai',
       'genre_School', 'genre_Sci-Fi', 'genre_Seinen', 'genre_Shoujo',
       'genre_Shoujo Ai', 'genre_Shounen', 'genre_Shounen Ai',
       'genre_Slice of Life', 'genre_Space', 'genre_Sports',
       'genre_Super Power', 'genre_Supernatural', 'genre_Thriller',
       'genre_Unknown', 'genre_Vampire', 'type_Movie', 'type_Music',
       'type_ONA', 'type_OVA', 'type_Special', 'type_TV', 'type_Unknown',
       'user_id', 'user_rating'],
      dtype='object')

In [46]:
df_anime_rating.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5247447 entries, 0 to 6249517
Data columns (total 53 columns):
 #   Column               Dtype  
---  ------               -----  
 0   anime_id             int64  
 1   name                 object 
 2   episodes             int64  
 3   average_rating       float64
 4   members              int64  
 5   genre_Action         int64  
 6   genre_Adventure      int64  
 7   genre_Cars           int64  
 8   genre_Comedy         int64  
 9   genre_Dementia       int64  
 10  genre_Demons         int64  
 11  genre_Drama          int64  
 12  genre_Fantasy        int64  
 13  genre_Game           int64  
 14  genre_Historical     int64  
 15  genre_Horror         int64  
 16  genre_Josei          int64  
 17  genre_Kids           int64  
 18  genre_Magic          int64  
 19  genre_Martial Arts   int64  
 20  genre_Mecha          int64  
 21  genre_Military       int64  
 22  genre_Music          int64  
 23  genre_Mystery        int64  
 24  gen

In [47]:
# Count missing values in each column
missing_values_anime = df_anime_rating.isna().sum()
missing_values_anime

anime_id               0
name                   0
episodes               0
average_rating         0
members                0
genre_Action           0
genre_Adventure        0
genre_Cars             0
genre_Comedy           0
genre_Dementia         0
genre_Demons           0
genre_Drama            0
genre_Fantasy          0
genre_Game             0
genre_Historical       0
genre_Horror           0
genre_Josei            0
genre_Kids             0
genre_Magic            0
genre_Martial Arts     0
genre_Mecha            0
genre_Military         0
genre_Music            0
genre_Mystery          0
genre_Parody           0
genre_Police           0
genre_Psychological    0
genre_Romance          0
genre_Samurai          0
genre_School           0
genre_Sci-Fi           0
genre_Seinen           0
genre_Shoujo           0
genre_Shoujo Ai        0
genre_Shounen          0
genre_Shounen Ai       0
genre_Slice of Life    0
genre_Space            0
genre_Sports           0
genre_Super Power      0


In [48]:
len(df_anime_rating)

5247447

### Creating subset of df_anime_rating to reduce runtime for testing code

In [49]:
# Subset of the first 1,000,000 rows (positional slice, not a random sample). Change the value as needed
df_anime_rating_subset = df_anime_rating.iloc[:1000000]

### Saving full dataset and subset to .csv files

In [50]:
# # Saving to SUBSET

# write_pathname_subset = '/Users/kvellian/Desktop/DSC_478_Group_Project/df_anime_rating_subset.csv'

# # write_pathname_subset = 'ADD PATHNAME HERE'

# df_anime_rating_subset.to_csv(write_pathname_subset, index = False)

In [51]:
# Saving to FULL MERGED
# Writes the cleaned, merged DataFrame (df_anime_rating) to a single CSV file;
# index = False leaves the DataFrame index out of the file.
# NOTE(review): the path below is an absolute path on the original author's machine;
# replace it (or use the placeholder line) before running elsewhere.

write_pathname_full = '/Users/kvellian/Desktop/DSC_478_Group_Project/df_anime_rating.csv'

# write_pathname_full = 'ADD PATHNAME HERE'

df_anime_rating.to_csv(write_pathname_full, index = False)