In [1]:
import pandas as pd
import matplotlib.pyplot as plt

GitHub上のCSVファイルをpd.read_csv()で直接読み込む場合は、raw.githubusercontent.comのURLを使う必要があります。github.com/…/blob/…形式のURLはCSV本体ではなくHTMLページを返すため、pd.read_csv()でエラーになります。

In [2]:
# Raw-content URLs of the two sample CSV files (rental records, customers).
url1 = 'https://raw.githubusercontent.com/makaishi2/samples/main/data/rental5-jp.csv'
url2 = 'https://raw.githubusercontent.com/makaishi2/samples/main/data/customer-jp.csv'

# Download and parse each file into its own DataFrame.
df1 = pd.read_csv(filepath_or_buffer=url1)
df2 = pd.read_csv(filepath_or_buffer=url2)

In [3]:
# Preview the first five rows of the rental data.
df1.head(n=5)

Unnamed: 0,貸出ID,DVD_ID,顧客ID,映画ID,貸出日時,タイトル,概要,公開年,レンタル代,カテゴリ名
0,361,6,587,1,2005-05-27 07:03:28,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,0.99,Documentary
1,14624,4,344,1,2005-08-21 18:32:42,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,0.99,Documentary
2,10883,4,301,1,2005-08-02 00:47:19,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,0.99,Documentary
3,12651,8,34,1,2005-08-18 18:36:16,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,0.99,Documentary
4,10141,8,8,1,2005-07-31 22:08:29,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,0.99,Documentary


In [4]:
# Preview the first five rows of the customer data.
df2.head(n=5)

Unnamed: 0,顧客ID,店舗ID,名,姓,メール,住所ID,有効,作成日,更新日
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2006-02-14 22:04:36.000,2021-03-06 15:53:36
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,1,2006-02-14 22:04:36.000,2021-03-06 15:53:36
2,3,1,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,7,1,2006-02-14 22:04:36.000,2021-03-06 15:53:36
3,4,2,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,8,1,2006-02-14 22:04:36.000,2021-03-06 15:53:36
4,5,1,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,9,1,2006-02-14 22:04:36.000,2021-03-06 15:53:36


In [5]:
# Row count of the rental table (first entry of the frame's shape).
print('貸出情報の件数：', df1.shape[0])

貸出情報の件数： 15862


In [6]:
# Row count of the customer table (first entry of the frame's shape).
print('顧客情報の件数：', df2.shape[0])

顧客情報の件数： 599


# ソート処理をする

In [7]:
# Order the rentals by the '貸出日時' column. sort_values returns a new
# frame, so df1 stays exactly as it was loaded.
df3 = df1.sort_values(by='貸出日時')

# Rows keep their original index labels after sorting.
df3.head(n=5)

Unnamed: 0,貸出ID,DVD_ID,顧客ID,映画ID,貸出日時,タイトル,概要,公開年,レンタル代,カテゴリ名
1256,1,367,130,80,2005-05-24 22:53:30,BLANKET BEVERLY,A Emotional Documentary of a Student And a Gir...,2006,2.99,Family
5271,2,1525,459,333,2005-05-24 22:54:33,FREAKY POCUS,A Fast-Paced Documentary of a Pastry Chef And ...,2006,2.99,Music
5905,3,1711,408,373,2005-05-24 23:03:39,GRADUATE LORD,A Lacklusture Epistle of a Girl And a A Shark ...,2006,2.99,Children
8489,4,2452,333,535,2005-05-24 23:04:41,LOVE SUICIDES,A Brilliant Panorama of a Hunter And a Explore...,2006,0.99,Horror
7203,5,2079,222,450,2005-05-24 23:05:21,IDOLS SNATCHERS,A Insightful Drama of a Car And a Composer who...,2006,2.99,Children


# インデックスを初期化する

In [8]:
# Discard the labels carried over from before the sort and renumber from 0.
df4 = df3.reset_index(drop=True)

df4.head(n=5)

Unnamed: 0,貸出ID,DVD_ID,顧客ID,映画ID,貸出日時,タイトル,概要,公開年,レンタル代,カテゴリ名
0,1,367,130,80,2005-05-24 22:53:30,BLANKET BEVERLY,A Emotional Documentary of a Student And a Gir...,2006,2.99,Family
1,2,1525,459,333,2005-05-24 22:54:33,FREAKY POCUS,A Fast-Paced Documentary of a Pastry Chef And ...,2006,2.99,Music
2,3,1711,408,373,2005-05-24 23:03:39,GRADUATE LORD,A Lacklusture Epistle of a Girl And a A Shark ...,2006,2.99,Children
3,4,2452,333,535,2005-05-24 23:04:41,LOVE SUICIDES,A Brilliant Panorama of a Hunter And a Explore...,2006,0.99,Horror
4,5,2079,222,450,2005-05-24 23:05:21,IDOLS SNATCHERS,A Insightful Drama of a Car And a Composer who...,2006,2.99,Children


In [9]:
# Keep only the category and rental-fee columns.
df5 = df4.loc[:, ['カテゴリ名', 'レンタル代']]

# Total the rental fee per category; the category name becomes the index.
df6 = df5.groupby(by='カテゴリ名').agg('sum')

# Show the first two categories.
display(df6.iloc[:2])

Unnamed: 0_level_0,レンタル代
カテゴリ名,Unnamed: 1_level_1
Action,2924.05
Animation,3155.55


In [10]:
# Move the category labels out of the index and back into an ordinary column.
df7 = df6.reset_index(drop=False)

# Show the first two rows of the flattened result.
display(df7.iloc[:2])

Unnamed: 0,カテゴリ名,レンタル代
0,Action,2924.05
1,Animation,3155.55


# 特定の行を参照する

In [11]:
# .loc looks a row up by its index label, not by position. df3 was sorted
# without resetting the index, so label 1 is not the second row of df3.head().
print(df3.loc[1, :])

貸出ID                                                  14624
DVD_ID                                                    4
顧客ID                                                    344
映画ID                                                      1
貸出日時                                    2005-08-21 18:32:42
タイトル                                       ACADEMY DINOSAUR
概要        A Epic Drama of a Feminist And a Mad Scientist...
公開年                                                    2006
レンタル代                                                  0.99
カテゴリ名                                           Documentary
Name: 1, dtype: object


In [12]:
# A row label together with a column name picks out a single cell.
rental_fee = df3.loc[1, 'レンタル代']
print(rental_fee)

0.99


In [14]:
# .loc with a single key selects rows by index label, so passing a column
# name raises KeyError. The error is the point of this cell, so it is caught
# and printed: "Restart & Run All" can then continue past the demonstration.
try:
    print(df3.loc['タイトル'])
except KeyError as err:
    # str(err) is the quoted missing key, giving the line: KeyError: 'タイトル'
    print(f'KeyError: {err}')

KeyError: 'タイトル'

In [15]:
# How to select a column: square brackets with a single column name.
df3['タイトル']

Unnamed: 0,タイトル
1256,BLANKET BEVERLY
5271,FREAKY POCUS
5905,GRADUATE LORD
8489,LOVE SUICIDES
7203,IDOLS SNATCHERS
...,...
2674,COMANCHEROS ENEMY
15065,VOYAGE LEGALLY
7226,ILLUSION AMELIE
6989,HUNCHBACK IMPOSSIBLE


In [16]:
# How to select several columns: pass a list of column names inside the brackets.
df3[['タイトル', '貸出日時']]

Unnamed: 0,タイトル,貸出日時
1256,BLANKET BEVERLY,2005-05-24 22:53:30
5271,FREAKY POCUS,2005-05-24 22:54:33
5905,GRADUATE LORD,2005-05-24 23:03:39
8489,LOVE SUICIDES,2005-05-24 23:04:41
7203,IDOLS SNATCHERS,2005-05-24 23:05:21
...,...,...
2674,COMANCHEROS ENEMY,2005-08-23 22:25:26
15065,VOYAGE LEGALLY,2005-08-23 22:26:47
7226,ILLUSION AMELIE,2005-08-23 22:42:48
6989,HUNCHBACK IMPOSSIBLE,2005-08-23 22:43:07
