In [7]:
import duckdb
import time

In [8]:
%%time
# Open the persistent DuckDB database file `my-db.duckdb` in read-write mode.
# NOTE(review): the other cells visible in this notebook call the module-level
# duckdb.read_csv / duckdb.sql helpers instead of methods on `con`, so they do
# not appear to go through this persistent connection -- confirm the intent.
con = duckdb.connect("my-db.duckdb", read_only=False)

CPU times: total: 15.6 ms
Wall time: 67.8 ms


In [5]:
%%time
# Reading the user details anime dataset
# The resulting object is previewed with .show() in the next cell, and the
# variable name `user_details` is referenced directly inside the SQL text of
# the join cell further down, so it must keep this exact name.
user_details = duckdb.read_csv('data/users-details-2023.csv')

CPU times: total: 62.5 ms
Wall time: 135 ms


In [8]:
%%time
# Print a text preview of the user details (column names, inferred types and
# the first rows; wide tables are elided with "…" columns in the rendering).
user_details.show()

┌────────┬─────────────────┬─────────┬─────────────────────┬───┬───────────────┬───────────┬──────────────────┐
│ Mal ID │    Username     │ Gender  │      Birthday       │ … │ Total Entries │ Rewatched │ Episodes Watched │
│ int64  │     varchar     │ varchar │      timestamp      │   │    double     │  double   │      double      │
├────────┼─────────────────┼─────────┼─────────────────────┼───┼───────────────┼───────────┼──────────────────┤
│      1 │ Xinil           │ Male    │ 1985-03-04 00:00:00 │ … │         399.0 │      60.0 │           8458.0 │
│      3 │ Aokaado         │ Male    │ NULL                │ … │         343.0 │      15.0 │           4072.0 │
│      4 │ Crystal         │ Female  │ NULL                │ … │        1000.0 │      10.0 │          12781.0 │
│      9 │ Arcane          │ NULL    │ NULL                │ … │          66.0 │       0.0 │           1817.0 │
│     18 │ Mad             │ NULL    │ NULL                │ … │         153.0 │      42.0 │           3

In [6]:
%%time
# Load the two remaining CSV inputs. Both variable names are referenced by
# name inside the SQL text of the join cell, so they must not be renamed.

# Anime catalogue (one row per anime_id).
anime_details = duckdb.read_csv('data/anime-dataset-2023.csv')

# Individual user -> anime ratings.
user_ratings = duckdb.read_csv('data/users-score-2023.csv')

CPU times: total: 93.8 ms
Wall time: 168 ms


In [10]:
%%time
# Print a text preview of the ratings data (user_id, Username, anime_id,
# Anime Title, rating) together with the inferred column types.
user_ratings.show()

┌─────────┬──────────┬──────────┬─────────────────────────────┬────────┐
│ user_id │ Username │ anime_id │         Anime Title         │ rating │
│  int64  │ varchar  │  int64   │           varchar           │ int64  │
├─────────┼──────────┼──────────┼─────────────────────────────┼────────┤
│       1 │ Xinil    │       21 │ One Piece                   │      9 │
│       1 │ Xinil    │       48 │ .hack//Sign                 │      7 │
│       1 │ Xinil    │      320 │ A Kite                      │      5 │
│       1 │ Xinil    │       49 │ Aa! Megami-sama!            │      8 │
│       1 │ Xinil    │      304 │ Aa! Megami-sama! Movie      │      8 │
│       1 │ Xinil    │      306 │ Abenobashi Mahou☆Shoutengai │      8 │
│       1 │ Xinil    │       53 │ Ai Yori Aoshi               │      7 │
│       1 │ Xinil    │       47 │ Akira                       │      5 │
│       1 │ Xinil    │      591 │ Amaenaide yo!!              │      6 │
│       1 │ Xinil    │       54 │ Appleseed (Movie)

In [15]:
%%time
# Print a text preview of the anime catalogue; long values and middle columns
# are truncated with "…" in the rendered table.
anime_details.show()

┌──────────┬──────────────────────┬──────────────────────┬───┬───────────┬─────────┬──────────────────────┐
│ anime_id │         Name         │     English name     │ … │ Scored By │ Members │      Image URL       │
│  int64   │       varchar        │       varchar        │   │  varchar  │  int64  │       varchar        │
├──────────┼──────────────────────┼──────────────────────┼───┼───────────┼─────────┼──────────────────────┤
│        1 │ Cowboy Bebop         │ Cowboy Bebop         │ … │ 914193.0  │ 1771505 │ https://cdn.myanim…  │
│        5 │ Cowboy Bebop: Teng…  │ Cowboy Bebop: The …  │ … │ 206248.0  │  360978 │ https://cdn.myanim…  │
│        6 │ Trigun               │ Trigun               │ … │ 356739.0  │  727252 │ https://cdn.myanim…  │
│        7 │ Witch Hunter Robin   │ Witch Hunter Robin   │ … │ 42829.0   │  111931 │ https://cdn.myanim…  │
│        8 │ Bouken Ou Beet       │ Beet the Vandel Bu…  │ … │ 6413.0    │   15001 │ https://cdn.myanim…  │
│       15 │ Eyeshield 21   

In [57]:
%%time
# Join anime_details, user_ratings and user_details, keeping only the required
# columns, re-ordered and renamed.
#
# - anime_details is the left-most table of two LEFT JOINs, so anime without
#   any rating row are kept (their user_* columns and user_rating are NULL).
# - genres_list is Genres split on ','; elements keep the leading space that
#   follows each comma, which the genre query further down strips with trim().
# - Every column reference is qualified with its table alias (including the
#   "Mal ID" join key) so the query cannot become ambiguous if another joined
#   table carries a column with the same name.
# - NOTE(review): total_animes_user_watched is populated from
#   "Episodes Watched" (an episode count, not an anime count). The alias is
#   left unchanged because other code may rely on the column name -- confirm
#   and rename if possible.
anime_ratings = duckdb.sql('''
    SELECT ad.anime_id          AS anime_id,
           ad.Name              AS anime_name,
           ad.Genres            AS genres,
           split(ad.Genres, ',')::VARCHAR[] AS genres_list,
           ad.Type              AS type,
           ad.Episodes          AS episodes,
           ad.Duration          AS episode_duration,
           ad.Aired             AS aired_dates,
           ud."Mal ID"          AS user_id,
           ud.Username          AS user_name,
           ud.Gender            AS user_gender,
           ud.Birthday          AS user_birthday,
           ud.Location          AS user_location,
           ud."Mean Score"      AS user_mean_score,
           ud.Watching          AS animes_watching_user,
           ud.Completed         AS animes_completed_user,
           ud."On Hold"         AS animes_onHold_user,
           ud.Dropped           AS animes_dropped_user,
           ud."Episodes Watched" AS total_animes_user_watched,
           ur.rating            AS user_rating
    FROM anime_details ad
    LEFT JOIN user_ratings ur
        ON ad.anime_id = ur.anime_id
    LEFT JOIN user_details ud
        ON ur.user_id = ud."Mal ID"
''')

CPU times: total: 0 ns
Wall time: 0 ns


In [59]:
%%time
# Print a text preview of the joined result.
# NOTE(review): the duckdb.sql(...) cell that defines anime_ratings reported
# ~0 ns wall time, which suggests the join is only evaluated when results are
# requested (as here) -- confirm against the DuckDB relational API docs.
anime_ratings.show()

┌──────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬─────────────┐
│ anime_id │      anime_name      │        genres        │ … │ total_animes_user_…  │ user_rating │
│  int64   │       varchar        │       varchar        │   │        double        │    int64    │
├──────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼─────────────┤
│    30376 │ Isshuukan Friends.…  │ Comedy, Slice of L…  │ … │              19843.0 │           7 │
│    30378 │ Sanzoku Diary        │ Slice of Life        │ … │              27349.0 │           1 │
│    30379 │ Nagato Yuki-chan n…  │ Comedy               │ … │               3995.0 │           6 │
│    30381 │ Sousei no Aquarion…  │ Romance, Sci-Fi      │ … │              14202.0 │           8 │
│    30382 │ Aquarion Logos       │ Action, Fantasy, R…  │ … │              18891.0 │           6 │
│    30384 │ Miss Monochrome Th…  │ Comedy, Slice of L…  │ … │              13208.0 │           7 │


In [38]:
%%time
# Total rows in the table
# Elapsed time is measured with time.perf_counter(): a monotonic,
# high-resolution clock intended for intervals. time.time() follows the system
# wall clock, which can be adjusted mid-measurement and may be coarse.
start_time = time.perf_counter()
duckdb.sql('SELECT COUNT(*) FROM anime_ratings').show()
end_time = time.perf_counter()
print(f'Total time taken: {end_time - start_time}')

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     24333071 │
└──────────────┘

Total time taken: 6.538724422454834
CPU times: total: 11 s
Wall time: 6.54 s


In [39]:
%%time
# Total anime in the table (number of distinct anime_id values)
# Elapsed time is measured with time.perf_counter(): a monotonic,
# high-resolution clock intended for intervals. time.time() follows the system
# wall clock, which can be adjusted mid-measurement and may be coarse.
start_time = time.perf_counter()
duckdb.sql('SELECT COUNT(DISTINCT anime_id) FROM anime_ratings').show()
end_time = time.perf_counter()
print(f'Total time taken: {end_time - start_time}')

┌──────────────────────────┐
│ count(DISTINCT anime_id) │
│          int64           │
├──────────────────────────┤
│                    24905 │
└──────────────────────────┘

Total time taken: 6.865374326705933
CPU times: total: 15.5 s
Wall time: 6.87 s


In [40]:
%%time
# Get the distinct anime in the file and their average ratings, sorted by
# average rating descending.
# NOTE(review): rows are grouped by anime_name only, so two different anime_id
# values sharing the same title would be merged into one row -- confirm titles
# are unique, or group by anime_id as well.
# Elapsed time is measured with time.perf_counter(): a monotonic,
# high-resolution clock intended for intervals. time.time() follows the system
# wall clock, which can be adjusted mid-measurement and may be coarse.
start_time = time.perf_counter()
duckdb.sql('''SELECT anime_name, 
                    AVG(user_rating) AS avg_rating,
                    COUNT(DISTINCT user_id) AS total_user_ratings
                FROM anime_ratings
                GROUP BY anime_name
                ORDER BY AVG(user_rating) DESC''').show()
end_time = time.perf_counter()
print(f'Total time taken: {end_time - start_time}')

┌───────────────────────────────────────────────┬────────────┬────────────────────┐
│                  anime_name                   │ avg_rating │ total_user_ratings │
│                    varchar                    │   double   │       int64        │
├───────────────────────────────────────────────┼────────────┼────────────────────┤
│ Chibi Maruko-chan: Hi no Youjin               │       10.0 │                  1 │
│ Virtua Fighter: Costomize Clip                │       10.0 │                  1 │
│ Lose It                                       │       10.0 │                  1 │
│ Setsugetsuka                                  │       10.0 │                  1 │
│ Iiwake Love Song                              │       10.0 │                  1 │
│ Hajimemashite no Kimochi wo                   │       10.0 │                  1 │
│ Kanchigaisei Han Kiboushou feat. Hatsune Miku │       10.0 │                  1 │
│ Play "Tag"                                    │       10.0 │              

The output above shows that many anime were rated by very few users: the top entries each have a rating from just one user, so their perfect averages are not representative. Let us filter to anime that were rated by more than 1000 users so we can get a more reliable list.

In [74]:
%%time
# Get the distinct anime in the file and their average ratings,
# sorted by rating descending, keeping only anime that have more than 1000
# distinct users rating them (HAVING COUNT(DISTINCT user_id) > 1000).
# Elapsed time is measured with time.perf_counter(): a monotonic,
# high-resolution clock intended for intervals. time.time() follows the system
# wall clock, which can be adjusted mid-measurement and may be coarse.
start_time = time.perf_counter()
duckdb.sql('''SELECT anime_name, 
                    AVG(user_rating) AS avg_rating,
                    COUNT(DISTINCT user_id) AS total_user_ratings
                FROM anime_ratings
                GROUP BY anime_name
                HAVING COUNT(DISTINCT user_id) > 1000
                ORDER BY AVG(user_rating) DESC''').show()
end_time = time.perf_counter()
print(f'Total time taken: {end_time - start_time}')

┌───────────────────────────────────────────────────────┬────────────────────┬────────────────────┐
│                      anime_name                       │     avg_rating     │ total_user_ratings │
│                        varchar                        │       double       │       int64        │
├───────────────────────────────────────────────────────┼────────────────────┼────────────────────┤
│ Fullmetal Alchemist: Brotherhood                      │  9.139212618954504 │              74966 │
│ Gintama°                                              │  9.115830115830116 │               6091 │
│ Gintama'                                              │  9.096198784857455 │              12551 │
│ Gintama': Enchousen                                   │  9.048557629552278 │               7490 │
│ Clannad: After Story                                  │   9.03095777429285 │              50435 │
│ Steins;Gate                                           │   9.01011105585944 │              41328 │


In [65]:
%%time
# Get the distinct anime genres in the file and their average ratings.
# The inner query expands genres_list to one row per (rating, genre) with
# unnest() and trims the space left after each comma; the outer query drops
# rows without a rating and aggregates per genre.
# Elapsed time is measured with time.perf_counter(): a monotonic,
# high-resolution clock intended for intervals. time.time() follows the system
# wall clock, which can be adjusted mid-measurement and may be coarse.
start_time = time.perf_counter()
duckdb.sql('''SELECT genres_unnested,
                        COUNT(DISTINCT anime_id) as total_animes,
                        AVG(user_rating) as avg_user_rating,
                        COUNT(DISTINCT user_id) AS total_users_watched
                FROM
                (SELECT *,
                        trim(unnest(genres_list),' ') as genres_unnested
                        
                FROM anime_ratings
                ) t
                WHERE user_rating IS NOT NULL
                GROUP BY genres_unnested
                
                ''').show()
end_time = time.perf_counter()
print(f'Total time taken: {end_time - start_time}')

┌─────────────────┬──────────────┬───────────────────┬─────────────────────┐
│ genres_unnested │ total_animes │  avg_user_rating  │ total_users_watched │
│     varchar     │    int64     │      double       │        int64        │
├─────────────────┼──────────────┼───────────────────┼─────────────────────┤
│ Horror          │          462 │ 7.507185940201596 │              173272 │
│ Mystery         │          774 │   7.8245364768336 │              195021 │
│ Comedy          │         5360 │ 7.499463181010322 │              232065 │
│ Romance         │         1878 │ 7.591861934646181 │              223936 │
│ Girls Love      │          106 │ 7.132914372098045 │               73038 │
│ Avant Garde     │          494 │ 7.857067889836932 │              105342 │
│ UNKNOWN         │         1844 │ 6.734660194174757 │               45041 │
│ Drama           │         2274 │ 7.817602417897131 │              230616 │
│ Adventure       │         2810 │ 7.682486953357381 │              228727 │