In [2]:
import pandas as pd
import numpy as np

In [3]:
# Display the help document for the pandas module.
# NOTE: the trailing `?` is IPython/Jupyter introspection syntax, not plain
# Python, so this cell only runs inside an IPython kernel.
pd?

Type:        module
String form: <module 'pandas' from 'C:\\Users\\user\\anaconda3\\lib\\site-packages\\pandas\\__init__.py'>
File:        c:\users\user\anaconda3\lib\site-packages\pandas\__init__.py
Docstring:
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point da

In [4]:
# Show the installed pandas version. As the last expression of the cell,
# the version string is echoed as the cell's output.
pd.__version__

'1.0.5'

In [8]:
"""
Pandas Object: Series
"""
# Create a series from an array

ser = pd.Series([0.25, 0.5, 0.75, 1.0]) # contructor method
print(ser)
print(type(ser), '\n')

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'> 



In [11]:
# A Series exposes two main attributes: 'values' (the stored data)
# and 'index' (the labels attached to that data).

arr, ind = ser.values, ser.index
print(arr)
print(ind)

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [13]:
# Label-based indexing: attach explicit string labels to the values
# instead of relying on the default integer index.

labels = ['a', 'b', 'c', 'd']
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=labels)
print(ser)

ind = ser.index
print(ind)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
Index(['a', 'b', 'c', 'd'], dtype='object')


In [15]:
"""
Dictionary and Series
"""

dict = {'a': 1, 2: 'two', 'third': True}
print(dict)

{'a': 1, 2: 'two', 'third': True}


In [23]:
"""
Create a series from a dictionary
"""

population_dict = {'California' : 38332521,
                  'Texas' : 26448193,
                  'New York' : 19651127,
                  'Florida' : 19552860,
                  'Illinois' : 12882135}

population = pd.Series(population_dict)
print(population)

print(population['Texas' : 'Illinois'])

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
Texas       26448193
New York    19651127
Florida     19552860
Illinois    12882135
dtype: int64


In [20]:
"""
Pandas object: DataFrame
"""

area_dict = {'California' : 423967, 'Texas' : 695662, 'New York' : 141297,
            'Florida' : 170312, 'Illinois' : 149995}
area = pd.Series(area_dict)
print(area)

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


In [35]:
# Combine the 'population' and 'area' Series into a single DataFrame;
# each dictionary key becomes a column name.

states = pd.DataFrame({'population': population, 'area': area})

# Print, in order: the whole table, its row labels, its column labels,
# and the single 'area' column.
for view in (states, states.index, states.columns, states['area']):
    print(view)

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [39]:
# Construct a DataFrame from a 2D NumPy array.
# A locally seeded generator is used so that the "random" values are the same
# on every run (Restart & Run All reproduces the table) without touching
# NumPy's global random state.
rng = np.random.default_rng(seed=0)
arr = rng.random((3, 2))
print(arr, '\n')

# Give the columns and rows explicit labels instead of the default integers.
pd.DataFrame(arr, columns=['foo', 'bar'], index=['a', 'b', 'c'])

[[0.5755671  0.84965459]
 [0.3839598  0.59128607]
 [0.27932374 0.43461447]] 



Unnamed: 0,foo,bar
a,0.575567,0.849655
b,0.38396,0.591286
c,0.279324,0.434614


In [46]:
"""
Series object manipulation: dictionary-style
"""

ser = pd.Series([0.25, 0.5, 0.75, 1.0],
               index = ['a', 'b', 'c', 'd'])
print(ser)
print(ser['b'])

print('a' in ser)
print(0.25 in ser)
print(ser.keys())
print(ser.index)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['a', 'b', 'c', 'd'], dtype='object')


In [49]:
"""
Series object manipulation: array-style
"""

ser['e'] = 1.25
ser['a'] = 0.125
print(ser, '\n')

print(ser['a' : 'c'], '\n') # slicing 명시적 인덱싱은 stop도 포함

a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64 

a    0.125
b    0.500
c    0.750
dtype: float64 



In [56]:
"""
Caution: Slicing Series object using explicit/implicit indexing
"""

states['density'] = states['population'] / states['area']
print(states)

            population    area     density
California    38332521  423967   90.413926
Texas         26448193  695662   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763


In [59]:
# Indexers: loc (label-based) and iloc (position-based).
# NOTE: the older `ix` indexer was removed in pandas 1.0 and is therefore
# not available in the pandas version this notebook runs on.

print(states)
# Label-based row slice over all columns; the stop label is included.
states.loc['New York':'Illinois', :]

            population    area     density
California    38332521  423967   90.413926
Texas         26448193  695662   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763


Unnamed: 0,population,area,density
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [60]:
"""
DataFrame as two-dimensional array
"""


'\nDataFrame as two-dimensional array\n'

In [63]:
# Masking and fancy indexing using the loc indexer: a boolean mask picks the
# rows, a list of column names picks the columns.
is_dense = states['density'] > 100
print(states.loc[is_dense, ['population', 'density']], '\n')

          population     density
New York    19651127  139.076746
Florida     19552860  114.806121 

