## 3장. 시리즈와 데이터프레임 직접 만들기(51쪽)
### 3.1 시리즈 만들기

In [2]:
import pandas as pd
s = pd.Series(['banana', 42])
print(s)

0    banana
1        42
dtype: object


In [3]:
s.index

RangeIndex(start=0, stop=2, step=1)

In [4]:
s = pd.Series(['Wes McKinney', 'Creator of Pandas'])
print(s)

0         Wes McKinney
1    Creator of Pandas
dtype: object


In [5]:
s = pd.Series(['Wes McKinney', 'Creator of Pandas'], index=['Person', 'Who'])
print(s)

Person         Wes McKinney
Who       Creator of Pandas
dtype: object


#### 2. 데이터프레임 만들기

In [6]:
scientists1 = pd.DataFrame({ 
    'Name': ['Rosaline Franklin', 'William Gosset'], 
    'Occupation': ['Chemist', 'Statistician'], 
    'Born': ['1920-07-25', '1876-06-13'], 
    'Died': ['1958-04-16', '1937-10-16'], 
    'Age': [37, 61]}) 
scientists1

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [7]:
scientists1.set_index('Name')

Unnamed: 0_level_0,Occupation,Born,Died,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [8]:
scientists2 = pd.DataFrame( 
    data=[['Chemist', '1920-07-25', '1958-04-16', 37], 
          ['Statistician', '1876-06-13', '1937-10-16', 61]],
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Age', 'Died']) 
scientists2

Unnamed: 0,Occupation,Born,Age,Died
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [9]:
scientists2.reset_index()

Unnamed: 0,index,Occupation,Born,Age,Died
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [40]:
scientists2.reset_index().rename({'index':'Name'}, axis=1)

Unnamed: 0,Name,Occupation,Born,Age,Died
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [41]:
scientists2.reset_index().rename(columns={'index':'Name'})

Unnamed: 0,Name,Occupation,Born,Age,Died
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [10]:
from collections import OrderedDict

scientists = pd.DataFrame(OrderedDict([ 
    ('Name', ['Rosaline Franklin', 'William Gosset']),
    ('Occupation', ['Chemist', 'Statistician']), 
    ('Born', ['1920-07-25', '1876-06-13']), 
    ('Died', ['1958-04-16', '1937-10-16']), 
    ('Age', [37, 61])
])
) 
print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61


### 3.2 데이터프레임에서 시리즈 선택하기(54쪽)

In [11]:
scientists = pd.DataFrame(
    data={'Occupation': ['Chemist', 'Statistician'], 
          'Born': ['1920-07-25', '1876-06-13'], 
          'Died': ['1958-04-16', '1937-10-16'],
          'Age': [37, 61]},
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Died', 'Age'])
scientists

Unnamed: 0,Occupation,Born,Died,Age
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [13]:
first_row = scientists.loc['William Gosset'] 
print(type(first_row))

<class 'pandas.core.series.Series'>


In [28]:
print(first_row)

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


In [30]:
# Direct point indexing against DataFrame refer column.
# scientists['William Gosset']

### 3.3 index, values 속성과 keys 메서드 사용하기(55쪽)

#### 1. index 속성 사용하기

In [14]:
first_row

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object

In [15]:
print(first_row.index)

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


#### 2. values 속성 사용하기

In [17]:
first_row.values

array(['Statistician', '1876-06-13', '1937-10-16', 61], dtype=object)

#### 3. keys 메서드 사용하기

In [18]:
print(first_row.keys())

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


#### 4. index 속성 응용하기

In [19]:
print(first_row.index[0])

Occupation


In [20]:
first_row.index[:2]

Index(['Occupation', 'Born'], dtype='object')

#### 5. keys 메서드와 index 속성 응용하기

In [21]:
print(first_row.keys()[0])

Occupation


In [22]:
first_row[['Occupation', 'Age']]

Occupation    Statistician
Age                     61
Name: William Gosset, dtype: object

In [23]:
first_row.index[[0, 3]]

Index(['Occupation', 'Age'], dtype='object')

In [24]:
first_row[first_row.index[[0, 3]]]

Occupation    Statistician
Age                     61
Name: William Gosset, dtype: object

### 3.3 시리즈의 mean, min, max, std 메서드 사용하기(56쪽)

In [25]:
ages = scientists['Age'] 
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [26]:
print(ages.mean())

49.0


In [27]:
print(ages.min())

37


In [28]:
print(ages.max())

61


In [29]:
print(ages.std())

16.97056274847714


### 3.4 시리즈와 불린 추출 사용하기(59쪽)

In [30]:
scientists = pd.read_csv('../data/scientists.csv')
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [31]:
ages = scientists['Age'] 

print(ages.max())

90


In [32]:
print(ages.mean())

59.125


In [33]:
ages > ages.mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [34]:
print(ages[ages > ages.mean()])

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


In [25]:
print(type(ages > ages.mean()))

<class 'pandas.core.series.Series'>


index를 초기화하고자 하며, 기존 index 정보를 제거하고 싶다면:

In [38]:
ages[ages > ages.mean()].reset_index(drop=True)

0    61
1    90
2    66
3    77
Name: Age, dtype: int64

In [35]:
ages[ages > ages.mean()][0:2]

1    61
2    90
Name: Age, dtype: int64

In [39]:
manual_bool_values = [True, True, False, False, True, True, False, True] 
print(ages[manual_bool_values])

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64


boolean container와 Series의 size가 다르면, 갖도록 맞추어준 후, 색인한다.

In [58]:
manual_bool_values = [True, True, False, False, True, True, False] 
print(ages[:-1][manual_bool_values])

0    37
1    61
4    56
5    45
Name: Age, dtype: int64


### 3.5 벡터와 스칼라로 브로드캐스팅 수행하기(61쪽)

In [40]:
print(ages + ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [41]:
ages[:4] + ages[-4:]

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
Name: Age, dtype: float64

결측값 대체를 위해 add외에도 subtract, multiply, devide 등이 존재한다.

In [63]:
# add, subtract, multiply, divide 메소드의 fill_value 옵션을 사용.
ages[:4].add(ages[-4:], fill_value=0)

0    37.0
1    61.0
2    90.0
3    66.0
4    56.0
5    45.0
6    41.0
7    77.0
Name: Age, dtype: float64

In [43]:
ages[:4].multiply(ages[-4:], fill_value=1)

0    37.0
1    61.0
2    90.0
3    66.0
4    56.0
5    45.0
6    41.0
7    77.0
Name: Age, dtype: float64

In [64]:
print(ages * ages)

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


In [29]:
print(ages + 100)

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64


In [68]:
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [42]:
print(pd.Series([1, 100]))

0      1
1    100
dtype: int64


index 기준으로 연산이 이뤄진다.

In [70]:
print(ages + pd.Series([1, 100]))


0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


In [45]:
rev_ages = ages.sort_index(ascending=False) 
print(rev_ages)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64


역시 index 기준으로 연산이 이뤄진다.

In [46]:
print(ages + rev_ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [47]:
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [48]:
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [51]:
scientists.sort_values('Age').sort_index()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 3.6 불린 추출과 브로드캐스팅(65쪽)

In [52]:
scientists[scientists['Age'] > scientists['Age'].mean()]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


direct bool indexing의 length가 다르면 에러를 발생시키지만, loc을 사용하면 에러가 발생하지 않는다.

In [53]:
print(scientists[[True, True, False, True]])

ValueError: Item wrong length 4 instead of 8.

pandas 0.25 버전에서는 에러가 발생한다.

In [55]:
scientists.loc[[True, True, False, True]]

IndexError: Item wrong length 4 instead of 8.

In [57]:
[True, True, False, True]*2

[True, True, False, True, True, True, False, True]

In [58]:
print(scientists[[True, True, False, True]*2])

                Name        Born        Died  Age     Occupation
0  Rosaline Franklin  1920-07-25  1958-04-16   37        Chemist
1     William Gosset  1876-06-13  1937-10-16   61   Statistician
3        Marie Curie  1867-11-07  1934-07-04   66        Chemist
4      Rachel Carson  1907-05-27  1964-04-14   56      Biologist
5          John Snow  1813-03-15  1858-06-16   45      Physician
7       Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [59]:
[True, True, False, True] + [False]

[True, True, False, True, False]

In [80]:
[True, True, False, True].append(False)

In [62]:
bl = [True, True, False, True]
bl.extend([False])
bl
abl = bl + [True]
bl

[True, True, False, True, False]

In [65]:
'abc'*2

'abcabc'

In [39]:
print(scientists * 2)

                                       Name                  Born  \
0        Rosaline FranklinRosaline Franklin  1920-07-251920-07-25   
1              William GossetWilliam Gosset  1876-06-131876-06-13   
2  Florence NightingaleFlorence Nightingale  1820-05-121820-05-12   
3                    Marie CurieMarie Curie  1867-11-071867-11-07   
4                Rachel CarsonRachel Carson  1907-05-271907-05-27   
5                        John SnowJohn Snow  1813-03-151813-03-15   
6                    Alan TuringAlan Turing  1912-06-231912-06-23   
7                  Johann GaussJohann Gauss  1777-04-301777-04-30   

                   Died  Age                            Occupation  
0  1958-04-161958-04-16   74                        ChemistChemist  
1  1937-10-161937-10-16  122              StatisticianStatistician  
2  1910-08-131910-08-13  180                            NurseNurse  
3  1934-07-041934-07-04  132                        ChemistChemist  
4  1964-04-141964-04-14  112     

In [86]:
'abc'*2

'abcabc'

<font color="red">[Quz]</font>다음과 같이 scientists를 수평/수직 결합하라.

In [73]:
pd.concat([scientists]*2, axis=1)

Unnamed: 0,Name,Born,Died,Age,Occupation,Name.1,Born.1,Died.1,Age.1,Occupation.1
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [75]:
pd.concat([scientists]*2, axis=0, ignore_index=True)

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician
8,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
9,William Gosset,1876-06-13,1937-10-16,61,Statistician


In [79]:
pd.merge(scientists, scientists, left_index=True, right_index=True)

Unnamed: 0,Name_x,Born_x,Died_x,Age_x,Occupation_x,Name_y,Born_y,Died_y,Age_y,Occupation_y
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [80]:
pd.merge(scientists.T, scientists.T, left_index=True, right_index=True).T

Unnamed: 0,Name,Born,Died,Age,Occupation
0_x,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1_x,William Gosset,1876-06-13,1937-10-16,61,Statistician
2_x,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3_x,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4_x,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5_x,John Snow,1813-03-15,1858-06-16,45,Physician
6_x,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7_x,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician
0_y,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1_y,William Gosset,1876-06-13,1937-10-16,61,Statistician


### 3.7 시리즈와 데이터프레임의 데이터 처리하기(67쪽)

In [81]:
print(scientists['Born'].dtype)
print(scientists['Died'].dtype)

object
object


In [82]:
scientists['Born'][0]

'1920-07-25'

In [83]:
born_datetime = pd.to_datetime(scientists['Born'], format='%Y-%m-%d') 
print(born_datetime)

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]


In [84]:
died_datetime = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')
print(died_datetime)

0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]


In [85]:
scientists['born_dt'], scientists['died_dt'] = (born_datetime, died_datetime)
print(scientists.head())

                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist 1920-07-25   
1        William Gosset  1876-06-13  1937-10-16   61  Statistician 1876-06-13   
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse 1820-05-12   
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist 1867-11-07   
4         Rachel Carson  1907-05-27  1964-04-14   56     Biologist 1907-05-27   

     died_dt  
0 1958-04-16  
1 1937-10-16  
2 1910-08-13  
3 1934-07-04  
4 1964-04-14  


In [86]:
print(scientists.shape)

(8, 7)


In [87]:
from datetime import datetime
datetime(1958, 4, 16) - datetime(1920, 7, 25)

datetime.timedelta(days=13779)

In [88]:
scientists['age_days_dt'] = (scientists['died_dt'] - \
                             scientists['born_dt'])
print(scientists)

                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist   
1        William Gosset  1876-06-13  1937-10-16   61        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist   
5             John Snow  1813-03-15  1858-06-16   45           Physician   
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician   

     born_dt    died_dt age_days_dt  
0 1920-07-25 1958-04-16  13779 days  
1 1876-06-13 1937-10-16  22404 days  
2 1820-05-12 1910-08-13  32964 days  
3 1867-11-07 1934-07-04  24345 days  
4 1907-05-27 1964-04-14  20777 days  
5 1813-03-15 1858-06-16  16529 days  
6 1912-06-23 1954-06-07  15324 days  
7 1777-04-3

<font color="red">[Quz]</font> age_days_dt로 부터 연도 단위의 age_years_dt를 생성해보자.

In [117]:
print(scientists['age_days_dt'].dtype)

timedelta64[ns]


In [125]:
(scientists['age_days_dt'] / 365.25).dt.days

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: age_days_dt, dtype: int64

In [126]:
np.timedelta64(1, 'Y')

numpy.timedelta64(1,'Y')

In [136]:
scientists['age_days_dt'].astype(np.timedelta64(1,'Y'))

0    37.0
1    61.0
2    90.0
3    66.0
4    56.0
5    45.0
6    41.0
7    77.0
Name: age_days_dt, dtype: float64

In [133]:
scientists['age_days_dt'].astype('timedelta64[Y]')

0    37.0
1    61.0
2    90.0
3    66.0
4    56.0
5    45.0
6    41.0
7    77.0
Name: age_days_dt, dtype: float64

### 3.5. 시리즈, 데이터프레임의 데이터 섞기

In [137]:
print(scientists['Age'])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


#### View에 대한 seting은 값을 변경시킨다.
DataFrame의 하나의 열은 Series로서 메모리에 연속된 값을 저장하는 numpy array로 구성되어 있다. 이때, 하나의 열을 색인하면 이는 View 이며, 이것의 값을 변경시키면, 원래의 DataFrame의 값이 변경된다.

shuffle의 경우, 일단 copy가 발생하고, copy에 대한 변경(섞기)이 이뤄진 후, 다시 DataFrame에 복사가 이뤄진다. 이는 shuffle 함수가 inplace 옵션을 내재하기 때문이다.

In [138]:
import random

random.seed(42)
random.shuffle(scientists['Age'])
print(scientists['Age'])

0    66
1    56
2    41
3    77
4    90
5    45
6    37
7    61
Name: Age, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i], x[j] = x[j], x[i]


In [139]:
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,age_days_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,66,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,56,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,41,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,77,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,90,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,37,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,61,Mathematician,1777-04-30,1855-02-23,28422 days


In [140]:
import random

random.seed(42)
random.shuffle(scientists['Age'][:5])
print(scientists['Age'])

0    77
1    56
2    41
3    90
4    66
5    45
6    37
7    61
Name: Age, dtype: int64


scientists 행을 섞기위해서는:

In [147]:
idx = np.arange(len(scientists))
np.random.shuffle(idx)

In [150]:
scientists.iloc[idx]

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,age_days_dt
2,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,16529 days
1,Rachel Carson,1907-05-27,1964-04-14,66,Biologist,1907-05-27,1964-04-14,20777 days
7,Rosaline Franklin,1920-07-25,1958-04-16,77,Chemist,1920-07-25,1958-04-16,13779 days
0,William Gosset,1876-06-13,1937-10-16,56,Statistician,1876-06-13,1937-10-16,22404 days
5,Alan Turing,1912-06-23,1954-06-07,37,Computer Scientist,1912-06-23,1954-06-07,15324 days
6,Florence Nightingale,1820-05-12,1910-08-13,41,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,90,Chemist,1867-11-07,1934-07-04,24345 days
4,Johann Gauss,1777-04-30,1855-02-23,61,Mathematician,1777-04-30,1855-02-23,28422 days


### 7. 데이터프레임의 열 삭제하기

In [151]:
print(scientists.columns)

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt',
       'age_days_dt'],
      dtype='object')


In [154]:
scientists_dropped = scientists.drop(['Age'], axis=1)

print(scientists_dropped.columns)

Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt',
       'age_days_dt'],
      dtype='object')


In [155]:
scientists_dropped.drop(['Born'], axis=1, inplace=True)
scientists_dropped.head()

Unnamed: 0,Name,Died,Occupation,born_dt,died_dt,age_days_dt
0,Rosaline Franklin,1958-04-16,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1937-10-16,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1910-08-13,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1934-07-04,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1964-04-14,Biologist,1907-05-27,1964-04-14,20777 days


### 데이터를 피클, CSV, TSV 파일로 저장하고 불러오기(72쪽)

#### 1. 피클로 저장하기

In [156]:
names = scientists['Name']

In [157]:
names.to_pickle('../output/scientists_names_series.pickle')

In [158]:
scientists.to_pickle('../output/scientists_df.pickle')

In [159]:
scientist_names_from_pickle = pd.read_pickle('../output/scientists_names_series.pickle')
print(scientist_names_from_pickle)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object


In [160]:
scientists_from_pickle = pd.read_pickle('../output/scientists_df.pickle')
print(scientists_from_pickle)

                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   77             Chemist   
1        William Gosset  1876-06-13  1937-10-16   56        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   41               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   90             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   66           Biologist   
5             John Snow  1813-03-15  1858-06-16   45           Physician   
6           Alan Turing  1912-06-23  1954-06-07   37  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   61       Mathematician   

     born_dt    died_dt age_days_dt  
0 1920-07-25 1958-04-16  13779 days  
1 1876-06-13 1937-10-16  22404 days  
2 1820-05-12 1910-08-13  32964 days  
3 1867-11-07 1934-07-04  24345 days  
4 1907-05-27 1964-04-14  20777 days  
5 1813-03-15 1858-06-16  16529 days  
6 1912-06-23 1954-06-07  15324 days  
7 1777-04-3

### 4. CSV 파일과 TSV 파일로 저장하기

In [161]:
names.to_csv('../output/scientist_names_series.csv')

  """Entry point for launching an IPython kernel.


In [162]:
scientists.to_csv('../output/scientists_df.tsv', sep='\t')

In [163]:
scientists.to_csv('../output/scientists_df_no_index.csv', index=False)

In [164]:
names_df = names.to_frame()
names_df.to_excel('../output/scientists_names_series_df.xls')

In [78]:
!pip install xlwt

Collecting xlwt
  Downloading https://files.pythonhosted.org/packages/44/48/def306413b25c3d01753603b1a222a011b8621aed27cd7f89cbc27e6b0f4/xlwt-1.3.0-py2.py3-none-any.whl (99kB)
Installing collected packages: xlwt
Successfully installed xlwt-1.3.0


In [None]:
## openpyxl 3.0.2에서는 에러 발생
!pip install openpyxl==3.0.1

In [None]:
names_df = names.to_frame()

import xlwt 
names_df.to_excel('../output/scientists_names_series_df.xls')

import openpyxl 
names_df.to_excel('../output/scientists_names_series_df.xlsx')

In [129]:
names_df_fromXL = pd.read_excel('../output/scientists_names_series_df.xls')
names_df_fromXL = names_df_fromXL.iloc[:, -1:]
names_df_fromXL

Unnamed: 0,Name
0,Rosaline Franklin
1,William Gosset
2,Florence Nightingale
3,Marie Curie
4,Rachel Carson
5,John Snow
6,Alan Turing
7,Johann Gauss
