In [2]:
import pandas as pd

In [4]:
# A Series can hold mixed Python objects; pandas then stores it with dtype `object`.
s = pd.Series(data=['banana', 42])
s

0    banana
1        42
dtype: object

In [5]:
# Without an explicit index the entries are labelled 0, 1, ...
s = pd.Series(data=['Wes Mckinney', 'Creator of Pandas'])
s

0         Wes Mckinney
1    Creator of Pandas
dtype: object

In [7]:
# `index=` replaces the default 0..n-1 labels with custom string labels.
s = pd.Series(
    data=['Wes Mckinney', 'Creator of Pandas'],
    index=['Person', 'who'],
)
s

Person         Wes Mckinney
who       Creator of Pandas
dtype: object

In [8]:
# Column-oriented construction: every dict key becomes a column and its list
# supplies that column's values, one per row.
scientists = pd.DataFrame(
    data={
        'Name': ['Roasline Frankin', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    }
)

In [10]:
scientists  # render the frame built in the previous cell

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Roasline Frankin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [11]:
from collections import OrderedDict

# OrderedDict states the intended column order explicitly. Plain dicts only
# guarantee insertion order from Python 3.7 onward, so OrderedDict is the safe
# choice when the column order matters on older interpreters.
scientists = pd.DataFrame(OrderedDict([
    ('Name', ['Roasline Frankin', 'William Gosset']),
    ('Occupation', ['Chemist', 'Statistician']),
    ('Born', ['1920-07-25', '1876-06-13']),
    ('Died', ['1958-04-16', '1937-10-16']),
    ('Age', [37, 61])
]))

scientists

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Roasline Frankin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [15]:
# `index` supplies the row labels (the scientists' names) and `columns` pins
# the column order explicitly.
scientists = pd.DataFrame(
    data={
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    },
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Died', 'Age'],
)

In [16]:
scientists  # rows are now labelled by name instead of 0..n-1

Unnamed: 0,Occupation,Born,Died,Age
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [19]:
# `.loc` selects a row by its index label; a single label returns the row as a Series.
# NOTE(review): 'William Gosset' is the second row of this frame, so the name
# `first_row` is misleading; it is kept because the following cells reference it.
first_row = scientists.loc['William Gosset']
type(first_row)

pandas.core.series.Series

In [20]:
first_row  # the row as a Series: the frame's column labels become its index

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object

In [21]:
first_row.index  # labels of the Series (here, the frame's column names)

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

In [22]:
first_row.values  # the underlying data as a NumPy array

array(['Statistician', '1876-06-13', '1937-10-16', 61], dtype=object)

In [23]:
first_row.keys()  # same result as .index

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

In [24]:
first_row.index[0]  # an Index supports positional access; this is its first label

'Occupation'

In [26]:
# Selecting a single column with [] returns it as a Series named after the column.
ages = scientists['Age']
ages

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64

In [27]:
ages.mean()  # arithmetic mean

49.0

In [28]:
ages.min()  # smallest value

37

In [29]:
ages.max()  # largest value

61

In [30]:
ages.std()  # sample standard deviation (pandas defaults to ddof=1)

16.97056274847714

In [33]:
# Load the scientists dataset from CSV; this rebinds `scientists`, replacing the
# two-row frame built by hand above. The path is relative to the working directory.
scientists = pd.read_csv('../../doit_pandas/data/scientists.csv')

In [35]:
scientists  # the CSV-loaded frame

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [36]:
ages = scientists['Age']  # rebind to the Age column of the CSV-loaded frame

In [37]:
ages.max()  # oldest age in the loaded data

90

In [38]:
ages.mean()  # mean age in the loaded data

59.125

### 불린추출

In [39]:
ages[ages > ages.mean()]  # keep only the ages above the mean

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [40]:
ages > ages.mean()  # the comparison itself yields a boolean Series, one value per row

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [41]:
# Boolean extraction: indexing a Series with a list-like of True/False values
# returns only the entries whose position holds True.
type(ages > ages.mean())

pandas.core.series.Series

In [43]:
# A hand-written mask works the same way: one True/False per entry of `ages`.
manual_bool_values = [True, True, False, False, True, True, False, True]
ages[manual_bool_values]

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64

### 브로드캐스팅 
시리즈나 데이터프레임에 있는 모든 데이터에 대해 한번에 연산하는 것  
벡터 : 여러개의 값을 가진 데이터 (시리즈)  
스칼라 : 단순 크기를 나타내는 데이터

In [45]:
ages + ages  # element-wise addition of two Series with the same index

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [46]:
ages * ages  # element-wise multiplication

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64

In [47]:
ages + 100  # a scalar applied to a vector: 100 is added to every element

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

In [49]:
pd.Series([1, 100])  # a two-element Series, shorter than `ages`

0      1
1    100
dtype: int64

In [50]:
ages + pd.Series([1, 100])  # only labels present in both Series are added; the rest become NaN

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

In [51]:
rev_ages = ages.sort_index(ascending=False)  # sorted by index in descending order, not by value
rev_ages

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64

In [54]:
ages * 2  # scalar multiplication, applied to every element

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [57]:
ages + rev_ages  # operands are aligned on index labels, so the result equals ages * 2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [58]:
ages  # the arithmetic above returned new Series; `ages` itself is unchanged

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [59]:
# Boolean extraction on a DataFrame: keep the rows whose Age is above the mean.
scientists[scientists['Age'] > scientists['Age'].mean()]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [60]:
# Boolean-vector row selection: rows whose mask entry is True are kept.
# The mask needs one entry per row of the frame (8 here); current pandas raises
# IndexError for a shorter boolean list, so the trailing rows are excluded
# explicitly with False. The selected rows (0, 1 and 3) are unchanged.
scientists.loc[[True, True, False, True, False, False, False, False]]  # bool vector

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist


In [61]:
scientists * 2  # broadcast over the whole frame: numbers are doubled, strings are repeated

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


In [63]:
print(scientists['Born'].dtype)  # the dates were read from the CSV as strings (object dtype)

object


In [64]:
print(scientists['Died'].dtype)  # likewise stored as strings (object dtype)

object


In [66]:
# Parse the 'Born' strings into datetime64 values using an explicit year-month-day format.
born_datetime = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
born_datetime

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]

In [68]:
died_datetime = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')  # object dtype -> datetime dtype
died_datetime

0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]

In [69]:
scientists.shape  # (rows, i.e. number of records; columns)

(8, 5)

In [71]:
# Attach the parsed datetime Series to the frame as two new columns.
scientists['born_dt'] = born_datetime
scientists['died_dt'] = died_datetime
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23


In [72]:
scientists.head()  # first five rows (the default for head())

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14


In [76]:
# Subtracting two datetime columns gives a timedelta column: how many days each person lived.
# NOTE(review): 'age_dats_dt' looks like a typo for 'age_days_dt' — confirm before renaming,
# since later cells display this column name.
scientists['age_dats_dt'] = (scientists['died_dt'] - scientists['born_dt']) 
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,age_dats_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,28422 days


In [78]:
scientists['Age']  # ages in their original order, before the shuffle in the next cell

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [80]:
import random

random.seed(42)  # fix the RNG seed so the shuffle is reproducible

# Shuffle a plain list and assign it back to the column. Calling
# random.shuffle(scientists['Age']) directly mutates the Series returned by
# column access: that can emit SettingWithCopyWarning and leaves the frame
# untouched when pandas copy-on-write is enabled. With the same seed the list
# shuffle performs the same swaps, so the resulting order is identical.
shuffled_ages = scientists['Age'].tolist()
random.shuffle(shuffled_ages)  # shuffles the list in place
scientists['Age'] = shuffled_ages
scientists['Age']

0    77
1    90
2    37
3    61
4    41
5    45
6    66
7    56
Name: Age, dtype: int64

In [81]:
scientists.columns  # column labels, including the three columns added above

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt',
       'age_dats_dt'],
      dtype='object')

In [83]:
# Remove the Age column. drop() returns a new frame; `scientists` keeps its Age column.
scientists_dropped = scientists.drop(columns=['Age'])
scientists_dropped.columns


Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt',
       'age_dats_dt'],
      dtype='object')

### 저장, 불러오기

In [89]:
names = scientists['Name']  # the Name column as a Series

In [93]:
# Serialize the Series to a pickle file in the parent directory.
names.to_pickle('../scientists_names_series.pickle')

In [94]:
# Write the same Series as CSV in the parent directory.
# NOTE(review): the file name is spelled 'sientists', unlike the pickle file written
# from the same Series — confirm nothing reads this path before renaming it.
names.to_csv('../sientists_names_series.csv')