# 1 Series

In [1]:
import os

import numpy as np
import pandas as pd
# Seed NumPy's global random generator so the np.random draws in later cells are repeatable.
np.random.seed(1)

## 1.1 Seriesの作成

In [2]:
# Series built from a NumPy array of four random integers drawn from [1, 10).
sample_arr = np.random.randint(low=1, high=10, size=4)
sample_series1 = pd.Series(data=sample_arr)

# Series built from a plain Python list that mixes ints and strings.
sample_list = [1, 2, "a", "b"]
sample_series2 = pd.Series(data=sample_list)

In [3]:
# Display the Series built from the random integer array.
print(sample_series1)

0    6
1    9
2    6
3    1
dtype: int64


In [4]:
# Display the Series built from the mixed int/str list.
print(sample_series2)

0    1
1    2
2    a
3    b
dtype: object


In [5]:
# Show the Python type of the object returned by pd.Series.
print(type(sample_series1))

<class 'pandas.core.series.Series'>


## 1.2 Seriesの構成要素

In [6]:
# Show the Series again as the starting point for inspecting its components.
print(sample_series1)

0    6
1    9
2    6
3    1
dtype: int64


In [7]:
# Label line followed by the Series' index object, one item per line.
print("index", sample_series1.index, sep="\n")

index
RangeIndex(start=0, stop=4, step=1)


In [8]:
# Replace the index with the string labels a-d, then show the Series and its new index.
sample_series1.index = list("abcd")
print(sample_series1, sample_series1.index, sep="\n")

a    6
b    9
c    6
d    1
dtype: int64
Index(['a', 'b', 'c', 'd'], dtype='object')


In [9]:
# Print the data held by the Series via .values, together with the type of that object.
print("values")
print(sample_series1.values, type(sample_series1.values))

values
[6 9 6 1] <class 'numpy.ndarray'>


In [10]:
# Element dtype of each Series: the all-integer one versus the mixed int/str one.
print("dtype", sample_series1.dtype, sample_series2.dtype, sep="\n")

dtype
int64
object


In [11]:
# Print a float-converted version; the result is not assigned, so sample_series1 itself is unchanged.
print(sample_series1.astype(float))

a    6.0
b    9.0
c    6.0
d    1.0
dtype: float64


In [12]:
# Print the Series' name attribute and its type (no name has been assigned at this point).
print("name")
print(sample_series1.name, type(sample_series1.name))

name
None <class 'NoneType'>


In [13]:
# Assign a name to the Series; it appears in the footer line of the printed output.
sample_series1.name = "Hello"
print(sample_series1)

a    6
b    9
c    6
d    1
Name: Hello, dtype: int64


In [14]:
# Read the assigned name back.
print(sample_series1.name)

Hello


## 1.3 演算と関数

In [15]:
# Two random integer Series (four values each, drawn from [1, 10)) used as operands below.
sample_series3 = pd.Series(np.random.randint(low=1, high=10, size=4))
sample_series4 = pd.Series(np.random.randint(low=1, high=10, size=4))

# NOTE: print() puts its default " " separator after the label's trailing "\n",
# so the first row of each Series is displayed shifted one column to the right.
print("sample_series3\n", sample_series3)
print("\nsample_series4\n", sample_series4)

sample_series3
 0    1
1    2
2    8
3    7
dtype: int64

sample_series4
 0    3
1    5
2    6
3    3
dtype: int64


In [16]:
# NumPy math functions applied to a Series work element-wise and keep the index.
print(
    "数学の関数",
    np.sqrt(sample_series3),
    np.log(sample_series3),
    np.power(sample_series3, 2),
    sep="\n",
)

数学の関数
0    1.000000
1    1.414214
2    2.828427
3    2.645751
dtype: float64
0    0.000000
1    0.693147
2    2.079442
3    1.945910
dtype: float64
0     1
1     4
2    64
3    49
dtype: int64


In [17]:
# Arithmetic operators between two Series combine the elements that share an index label.
print(
    "四則演算",
    sample_series3 + sample_series4,
    sample_series3 - sample_series4,
    sample_series3 * sample_series4,
    sample_series3 / sample_series4,
    sep="\n",
)

四則演算
0     4
1     7
2    14
3    10
dtype: int64
0   -2
1   -3
2    2
3    4
dtype: int64
0     3
1    10
2    48
3    21
dtype: int64
0    0.333333
1    0.400000
2    1.333333
3    2.333333
dtype: float64


In [18]:
# Summary statistics computed with NumPy reductions on a Series.
print(
    "統計量に関する関数",
    np.mean(sample_series3),
    np.std(sample_series3),
    np.sum(sample_series3),
    np.median(sample_series3),
    sep="\n",
)

# Smallest and largest value.
print(
    "\n最小値と最大値",
    np.min(sample_series3),
    np.max(sample_series3),
    sep="\n",
)

# Where the smallest / largest value sits. The Series uses the default
# 0..n-1 index, so position and index label coincide here.
print(
    "\n最小値と最大値のインデックス",
    np.argmin(sample_series3),
    np.argmax(sample_series3),
    sep="\n",
)

統計量に関する関数
4.5
3.0413812651491097
18
4.5

最小値と最大値
1
8

最小値と最大値のインデックス
0
2


In [19]:
# Distinct values and sorted values of the Series, printed one result per line.
print(
    "特殊な関数",
    np.unique(sample_series3),
    np.sort(sample_series3),
    sep="\n",
)

特殊な関数
[1 2 7 8]
[1 2 7 8]


# 2 DataFrame

## 2.1 DataFrameの作成

In [20]:
# Column name -> array mapping; every column has four entries.
sample_dict = dict(
    StudentID=np.arange(10, 14),
    Japanese=np.random.randint(1, 100, 4),
    Math=np.random.randint(1, 100, 4),
)
# Each dict key becomes a column of the DataFrame.
sample_df = pd.DataFrame(sample_dict)
print(sample_df, type(sample_df), sep="\n")

   StudentID  Japanese  Math
0         10        85    15
1         11        12    51
2         12        29    69
3         13        30    88
<class 'pandas.core.frame.DataFrame'>


## 2.2 DataFrameの構成要素

In [21]:
# Bare expression as the last line of the cell: the notebook displays the DataFrame.
sample_df

Unnamed: 0,StudentID,Japanese,Math
0,10,85,15
1,11,12,51
2,12,29,69
3,13,30,88


In [22]:
# Row labels of the DataFrame.
print(sample_df.index)

RangeIndex(start=0, stop=4, step=1)


In [23]:
# Column labels of the DataFrame.
print(sample_df.columns)

Index(['StudentID', 'Japanese', 'Math'], dtype='object')


In [24]:
# The DataFrame's data as a 2-D array (one inner row per DataFrame row).
print(sample_df.values)

[[10 85 15]
 [11 12 51]
 [12 29 69]
 [13 30 88]]


In [25]:
# dtype of each column.
print(sample_df.dtypes)

StudentID    int64
Japanese     int64
Math         int64
dtype: object


## 2.3 参照

In [26]:
# Display only the first three rows.
sample_df.head(3)

Unnamed: 0,StudentID,Japanese,Math
0,10,85,15
1,11,12,51
2,12,29,69


In [27]:
# Bracket access and attribute access select the same column.
print(sample_df["StudentID"], sample_df.StudentID, sep="\n")

0    10
1    11
2    12
3    13
Name: StudentID, dtype: int64
0    10
1    11
2    12
3    13
Name: StudentID, dtype: int64


In [28]:
print(sample_df.loc[1]) # select a single row by its index label

StudentID    11
Japanese     12
Math         51
Name: 1, dtype: int64


In [29]:
print(sample_df.loc[1, "StudentID"]) # select one cell by row label and column label

11


In [30]:
print(sample_df.loc[1, ["Japanese", "Math"]]) # select several columns of one row

Japanese    12
Math        51
Name: 1, dtype: int64


In [31]:
print(sample_df.iloc[1, 2]) # select by integer position (row 1, column 2)

51


In [32]:
print(sample_df.iloc[1, 1:3]) # positional slice: columns 1 and 2 of row 1

Japanese    12
Math        51
Name: 1, dtype: int64


In [33]:
print(sample_df[sample_df["Japanese"] > 50]) # keep only the rows that satisfy the condition

   StudentID  Japanese  Math
0         10        85    15


In [34]:
print(sample_df.loc[sample_df["Japanese"] > 50, "Math"]) # rows by condition, then a single column

0    15
Name: Math, dtype: int64


## 2.4 代入と変更

In [35]:
print("行の指定")
# A scalar assigned to a row label is written to every column of that row.
sample_df.loc[0] = 0
print(sample_df)
# A list assigns one value per column, in column order.
sample_df.loc[0] = [1, 2, 3]
print(sample_df)

行の指定
   StudentID  Japanese  Math
0          0         0     0
1         11        12    51
2         12        29    69
3         13        30    88
   StudentID  Japanese  Math
0          1         2     3
1         11        12    51
2         12        29    69
3         13        30    88


In [36]:
print("列の指定")
# Bracket assignment is used instead of `sample_df.Math = -1`: attribute-style
# assignment only updates a column that already exists, and on a new or
# misspelled name it sets a plain Python attribute rather than a column.
# The scalar is written to every row of the column.
sample_df["Math"] = -1
print(sample_df)
# An array with one entry per row assigns the values row by row.
sample_df["Math"] = np.arange(4)
print(sample_df)

列の指定
   StudentID  Japanese  Math
0          1         2    -1
1         11        12    -1
2         12        29    -1
3         13        30    -1
   StudentID  Japanese  Math
0          1         2     0
1         11        12     1
2         12        29     2
3         13        30     3


In [37]:
print("条件の指定")
# A boolean row mask with no column selector overwrites every column of the matching rows.
sample_df[sample_df["Japanese"] > 15] = 100
print(sample_df)
# The condition is evaluated again on the already-updated frame; passing a column
# label to .loc restricts the write to "Math" for the matching rows.
sample_df.loc[sample_df["Japanese"] > 15, "Math"] = -100
print(sample_df)

条件の指定
   StudentID  Japanese  Math
0          1         2     0
1         11        12     1
2        100       100   100
3        100       100   100
   StudentID  Japanese  Math
0          1         2     0
1         11        12     1
2        100       100  -100
3        100       100  -100


In [38]:
print("列の追加")
# Assigning to a column label that does not exist yet adds it as a new column.
sample_df["English"] = np.random.randint(1, 100, 4)
print(sample_df)

列の追加
   StudentID  Japanese  Math  English
0          1         2     0       88
1         11        12     1       95
2        100       100  -100       97
3        100       100  -100       87


## 2.5 関数

In [39]:
# Rebuild sample_df from freshly drawn scores, replacing the frame that the
# assignment examples above modified.
sample_dict = dict(
    StudentID=np.arange(10, 14),
    Japanese=np.random.randint(1, 100, 4),
    Math=np.random.randint(1, 100, 4),
)
sample_df = pd.DataFrame(sample_dict)
sample_df

Unnamed: 0,StudentID,Japanese,Math
0,10,14,62
1,11,10,23
2,12,8,58
3,13,64,2


In [40]:
print(np.log(sample_df)) # applied to every value in the DataFrame

   StudentID  Japanese      Math
0   2.302585  2.639057  4.127134
1   2.397895  2.302585  3.135494
2   2.484907  2.079442  4.060443
3   2.564949  4.158883  0.693147


In [41]:
print(np.log(sample_df.loc[1])) # applied to the selected row only

StudentID    2.397895
Japanese     2.302585
Math         3.135494
Name: 1, dtype: float64


In [42]:
print(np.log(sample_df["Math"])) # applied to the selected column only

0    4.127134
1    3.135494
2    4.060443
3    0.693147
Name: Math, dtype: float64


In [43]:
np.max(sample_df, axis=0) # reduce along axis 0: one maximum per column

StudentID    13
Japanese     64
Math         62
dtype: int64

In [44]:
np.max(sample_df, axis=1) # reduce along axis 1: one maximum per row

0    62
1    23
2    58
3    64
dtype: int64

## 2.6 csv

In [45]:
# 100-row student table that deliberately contains missing values: each
# Japanese score is multiplied by NaN with probability 0.1 (by 1 otherwise),
# and None is one of the four candidates drawn for Class.
sample_dict2 = dict(
    StudentID=np.arange(10, 110),
    Japanese=(
        np.random.randint(1, 100, 100)
        * np.random.choice([np.nan, 1], 100, p=[0.1, 0.9])
    ),
    Math=np.random.randint(1, 100, 100),
    Sex=np.random.choice(["Male", "Female"], 100),
    Class=np.random.choice(["A", "B", "C", None], 100),
)
sample_df2 = pd.DataFrame(sample_dict2)

In [46]:
# Preview the first rows, including the missing entries.
sample_df2.head()

Unnamed: 0,StudentID,Japanese,Math,Sex,Class
0,10,1.0,55,Female,A
1,11,61.0,1,Female,C
2,12,,87,Female,
3,13,9.0,17,Female,C
4,14,,20,Male,A


In [47]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails on a fresh checkout.
os.makedirs("csv_data", exist_ok=True)
# index_label=False leaves the index column without a header field in the file.
sample_df2.to_csv("csv_data/sample_df.csv", index_label=False)

In [48]:
# Load the CSV file written in the previous cell back into a new DataFrame.
read_df = pd.read_csv("csv_data/sample_df.csv")

In [49]:
# Preview the re-loaded data for comparison with sample_df2.head() above.
read_df.head()

Unnamed: 0,StudentID,Japanese,Math,Sex,Class
0,10,1.0,55,Female,A
1,11,61.0,1,Female,C
2,12,,87,Female,
3,13,9.0,17,Female,C
4,14,,20,Male,A
