## pd.Series

A one-dimensional container, similar to a Python list. All elements share a single `dtype`; if you mix numbers and strings, the `dtype` becomes `object`.

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline


In [2]:
# Mixing an int and a str forces the Series to the generic `object` dtype.
pd.Series([42, "the answer"])

0            42
1    the answer
dtype: object

In [4]:
# `index` assigns an explicit label to each element instead of the default 0, 1, ...
sr = pd.Series([42, "the answer"], index=["one", "two"])

In [5]:
# .loc selects by index label.
sr.loc["one"]

42

In [6]:
# .iloc selects by integer position.
sr.iloc[0]

42

In [11]:
# The row labels are stored in an Index object.
sr.index

Index(['one', 'two'], dtype='object')

In [13]:
# .values on the Index exposes the labels as a NumPy array.
sr.index.values

array(['one', 'two'], dtype=object)

In [14]:
# .values on the Series exposes the data as a NumPy array.
sr.values

array([42, 'the answer'], dtype=object)

### pd.DataFrame

- index = row label name
- columns = column name

In [19]:
# Build a two-row frame: the dict keys become columns, `index` supplies the
# row labels and `columns` fixes the column order.
df = pd.DataFrame(
    {
        "Name": ["Taro", "Bob"],
        "Age": [20, 30],
        "Nationality": ["Japan", "USA"],
    },
    index=["Taro", "Bob"],
    columns=["Name", "Nationality", "Age"],
)
df

Unnamed: 0,Name,Nationality,Age
Taro,Taro,Japan,20
Bob,Bob,USA,30


In [21]:
# df.loc with a single row label returns that row as a Series
type(df.loc["Taro"])

pandas.core.series.Series

In [22]:
# The row Series is indexed by the column names and named after the row label.
df.loc["Taro"]

Name            Taro
Nationality    Japan
Age               20
Name: Taro, dtype: object

In [28]:
# keys() returns the column labels, so the element-wise comparison is all True.
df.keys() == df.columns

array([ True,  True,  True], dtype=bool)

### Series methods and attributes

- attributes: loc, iloc, dtypes, T, shape, size, values
- methods: append(), corr(), cov(), describe(), get_values(), hist(), min(), max(), median(), mean(), mode(), quantile(), replace(), sample(), sort_values(), transpose(), unique()


In [29]:
# Location of the book's data files. The author's local checkout stays the
# default, but it can be overridden through the PANDAS_FOR_EVERYONE_DATA
# environment variable so the notebook also runs on other machines.
DATADIR = os.environ.get(
    "PANDAS_FOR_EVERYONE_DATA",
    "/Users/tomoya/src/github.com/chendaniely/pandas_for_everyone/data/",
)

# os.path.join copes with DATADIR values with or without a trailing slash.
scientists = pd.read_csv(os.path.join(DATADIR, "scientists.csv"))

In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) for the Age column.
scientists["Age"].describe()

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64

In [36]:
# Subset rows with a Boolean vector: keep only the rows where Age > 50
scientists[scientists["Age"] > 50]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [38]:
# Vectors with common index labels are automatically aligned

ages = scientists["Age"]
# Reverse the row order (index labels 7 .. 0) so that a purely positional
# addition would pair the first element with the last one.
ages_rev = ages.sort_index(ascending=False)
ages * 2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [40]:
# Note that we get the same result as `ages * 2` above: the addition is aligned
# on index labels rather than on position, so the first element is not added to the last.
ages + ages_rev

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [44]:
# Add a new column: assigning to a column name that does not exist yet creates it.
# Here the `Born` strings are parsed into datetime values.

scientists["born_at"] = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_at
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30


In [48]:
# Shuffle the ages reproducibly before writing them back into the column:
# sampling the whole column (frac=1) with a fixed seed returns every value
# once, in random order, and each value keeps its original index label.

rand = scientists["Age"].sample(frac=1, random_state=42)
rand

1    56
5    90
0    45
7    66
2    61
4    37
3    41
6    77
Name: Age, dtype: int64

In [55]:
# Column assignment aligns on index labels, so the shuffled Series must first get a
# fresh 0..n-1 index via reset_index(); `drop=True` discards the old index instead of
# keeping it as an extra column, leaving only the values.
scientists["Age"] = rand.reset_index(drop=True)
scientists

Unnamed: 0,Name,Born,Died,Occupation,born_at,Age
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,56
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,90
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,45
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,66
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,61
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15,37
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23,41
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30,77


In [56]:
# Drop the column by building a new frame and rebinding the name rather than
# mutating in place (`inplace=True` is discouraged and prevents method chaining).
scientists = scientists.drop(["Age"], axis=1)
scientists

Unnamed: 0,Name,Born,Died,Occupation,born_at
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30


In [57]:
# The Feather format can be used to exchange data with R
# -- but do not use it for long-term persistence.