# Pandas Series and DataFrame Objects

## Pandas Series

A Series is similar to a NumPy array, except that we can give it a labelled index (for example names or datetimes) instead of plain integer positions.

In [2]:
import numpy as np
import pandas as pd

In [3]:
# sample inputs of several Python types; each one is turned into a Series below
labels = ['a', 'b', 'c']
my_list = [10, 20, 30]
arr = np.array(my_list)  # numpy array holding the same values as my_list
d = dict(a=10, b=20, c=100)  # dictionary

In [4]:
# build a Series from a plain Python list; pandas supplies a default integer index
pd.Series(data=my_list)

0    10
1    20
2    30
dtype: int64

In [5]:
# same list, but with explicit labels supplied as the index
pd.Series(data=my_list, index=labels)

a    10
b    20
c    30
dtype: int64

In [6]:
# a numpy array works as the data source just like a list does
pd.Series(data=arr, index=labels)

a    10
b    20
c    30
dtype: int64

In [7]:
# a dictionary provides both parts: its keys become the index, its values the data
pd.Series(data=d)

a     10
b     20
c    100
dtype: int64

In [9]:
# Series data is not limited to numbers: here the values are strings
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [11]:
# a Series can hold arbitrary Python objects, e.g. a list of built-in functions
pd.Series(data=[sum, print, len])  # extremely uncommon and rarely used

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [12]:
# a Series whose index is a list of country names rather than positions
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'CHINA', 'FRANCE', 'GERMANY'],
)

In [13]:
# display the series: index labels in the left column, values in the right
ser1

USA        1
CHINA      2
FRANCE     3
GERMANY    4
dtype: int64

In [14]:
# second series: shares the USA and CHINA labels with ser1, the other two differ
ser2 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'CHINA', 'ITALY', 'JAPAN'],
)

In [15]:
# selecting a value by its index label
ser1['USA']

1

In [16]:
# label lookup works the same way on the second series
ser2['JAPAN']

4

In [19]:
# trying to select a label that is not in the series
# ser1['CANADA'] # raises a KeyError

In [22]:
# adding two series matches them up by index label before summing:
# labels present in both series are added, labels found in only one give NaN
# (note that the result dtype is float64)
ser1 + ser2

CHINA      4.0
FRANCE     NaN
GERMANY    NaN
ITALY      NaN
JAPAN      NaN
USA        2.0
dtype: float64

## Pandas DataFrame

In [24]:
# numpy and pandas are already imported
from numpy.random import randn

In [25]:
# seed NumPy's global random number generator so the random values
# generated below are the same on every run
np.random.seed(101)

In [34]:
# 5x4 frame of random numbers; rows are labelled A-E and columns W-Z
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [36]:
df # show the whole frame; each column of a DataFrame is a pandas Series

Unnamed: 0,W,X,Y,Z
A,-1.467514,-0.494095,-0.162535,0.485809
B,0.392489,0.221491,-0.855196,1.54199
C,0.666319,-0.538235,-0.568581,1.407338
D,0.641806,-0.9051,-0.391157,1.028293
E,-1.972605,-0.866885,0.720788,-1.223082


In [40]:
# confirm the type of the object
type(df)

pandas.core.frame.DataFrame

In [41]:
# grabbing a single column from a data frame with bracket notation
df['W'] # returns a Series

A   -1.467514
B    0.392489
C    0.666319
D    0.641806
E   -1.972605
Name: W, dtype: float64

In [44]:
# attribute-style access also works, but is not recommended: bracket notation
# is preferred because a column name can clash with a DataFrame attribute or method
df.W

A   -1.467514
B    0.392489
C    0.666319
D    0.641806
E   -1.972605
Name: W, dtype: float64

In [45]:
# a single column is a Series object
type(df['W'])

pandas.core.series.Series

In [47]:
# grabbing several columns: pass a list of column names (note the double brackets)
df[['W','Y']] # returns a DataFrame

Unnamed: 0,W,Y
A,-1.467514,-0.162535
B,0.392489,-0.855196
C,0.666319,-0.568581
D,0.641806,-0.391157
E,-1.972605,0.720788


In [64]:
# create a new column by assigning to a column name that does not exist yet
df['new'] = df['W'] + df['Y'] # element-wise sum of columns W and Y

In [65]:
# df now has the extra 'new' column
df

Unnamed: 0,W,X,Y,Z,new
A,-1.467514,-0.494095,-0.162535,0.485809,-1.630049
B,0.392489,0.221491,-0.855196,1.54199,-0.462707
C,0.666319,-0.538235,-0.568581,1.407338,0.097738
D,0.641806,-0.9051,-0.391157,1.028293,0.250649
E,-1.972605,-0.866885,0.720788,-1.223082,-1.251818


In [61]:
# remove a column
# drop() removes rows by default (axis=0), so axis=1 is required for a column
# the result is a new DataFrame; df itself is left unchanged
df.drop(labels='new', axis=1)

Unnamed: 0,W,X,Y,Z
A,-1.467514,-0.494095,-0.162535,0.485809
B,0.392489,0.221491,-0.855196,1.54199
C,0.666319,-0.538235,-0.568581,1.407338
D,0.641806,-0.9051,-0.391157,1.028293
E,-1.972605,-0.866885,0.720788,-1.223082


In [67]:
# to make the removal permanent, assign the result of drop() back to df
# (preferred over passing inplace=True, which mutates the existing object)
df = df.drop('new', axis=1)

In [68]:
# the 'new' column is gone from df
df

Unnamed: 0,W,X,Y,Z
A,-1.467514,-0.494095,-0.162535,0.485809
B,0.392489,0.221491,-0.855196,1.54199
C,0.666319,-0.538235,-0.568581,1.407338
D,0.641806,-0.9051,-0.391157,1.028293
E,-1.972605,-0.866885,0.720788,-1.223082


In [71]:
# df.shape is a (rows, columns) tuple: index 0 is the number of rows, index 1 the number of columns
df.shape # rows are axis 0, columns are axis 1

(5, 4)

In [80]:
# selecting ROWS in a data frame
# (1) label-based selection with the .loc indexer (note the square brackets)
# passing a single index label returns that row
df.loc['C'] # returns a Series

W    0.666319
X   -0.538235
Y   -0.568581
Z    1.407338
Name: C, dtype: float64

In [81]:
# (2) position-based selection with .iloc: grab a row by its integer position
df.iloc[2] # position 2 is the third row, i.e. the row labelled 'C'

W    0.666319
X   -0.538235
Y   -0.568581
Z    1.407338
Name: C, dtype: float64

In [83]:
# selecting a single cell: give .loc the row label and the column label together
# (one lookup, instead of chaining df.loc['B']['Y'], which builds the whole row first)
df.loc['B', 'Y'] # returns a single value

-0.8551960407780934

In [84]:
# subset of rows and columns: list of row labels first, list of column labels second
df.loc[['A','B'],['W','Y']] # returns a DataFrame

Unnamed: 0,W,Y
A,-1.467514,-0.162535
B,0.392489,-0.855196
