In [12]:
import numpy as np
import pandas as pd
# NumPy structured arrays in which the rows and columns are identified with labels rather than simple integer indices
# Pandas Series is a one-dimensional array of indexed data. It can be created from a list or array as follows:
data = pd.Series([0.25, 0.5, 0.75, 1.0])  # Series wraps both a sequence of values and a sequence of indices
print(data)
print(data.values)
print(data.index)# like NumPy array, but with an explicitly defined index associated with the values
print(data[1])
print(data[1:3])# slicing by explicit index
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])# Series as generalized NumPy array using strings as index
print(data)
print(data['b'])# access using index
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])# Series as specialized dictionary
print(data)
print(data[5])# access using index





0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)
0.5
1    0.50
2    0.75
dtype: float64
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5
2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64
0.5


In [20]:
#series as dictionary
population_dict = {'California': 38332521, 'Texas': 26448193, 'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135}
population = pd.Series(population_dict)
print(population)
print(population['California'])
print(population['California':'Illinois'])# slicing by explicit index
print(population[0:3])# slicing by implicit integer index
pd.Series(5, index=[100, 200, 300])# scalar value will be repeated to match the length of index
print(pd.Series)
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])# explicit index will override the implicit index
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2, 4])# explicit index will override the implicit index
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2, 4]).index#  repeated to match the length of index
pd.Series({2:'a', 1:'b', 3:'c'}).index# explicit index will override the implicit index
print(pd.Series)


California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
38332521
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
California    38332521
Texas         26448193
New York      19651127
dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [24]:
#pandas DataFrame object
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
print(area)
states = pd.DataFrame({'population': population, 'area': area})# DataFrame as a generalized NumPy array
print(states)
print(states.index)# accessing index
print(states.columns)# accessing columns



California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64
            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


In [33]:
# DataFrame as specialized dictionary
area = pd.Series({'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995})

print(states['area'])# accessing columns
# Constructing DataFrame opulation'])# from a single Series object
print(pd.DataFrame(population, columns=['population']))# from a single Series object
# from a list of dicts
data = [{'a': i, 'b': 2 * i} for i in range(3)]
print(pd.DataFrame(data))
print(pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}]))# missing values are filled with NaN (i.e., “not a number”)
# from a dictionary of Series objects
print(pd.DataFrame({'population': population, 'area': area}))# from a dictionary of Series objects
# from a two-dimensional NumPy array
print(pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c']))# from a two-dimensional NumPy array  # from a two-dimensional NumPy array
# from a NumPy structured array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
print(pd.DataFrame(A))# from a NumPy structured array

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
            population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
   a  b
0  0  0
1  1  2
2  2  4
     a  b    c
0  1.0  2  NaN
1  NaN  3  4.0
            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995
        foo       bar
a  0.454855  0.780328
b  0.529411  0.456677
c  0.759012  0.943644
   A    B
0  0  0.0
1  0  0.0
2  0  0.0


In [8]:
#The Pandas Index Object
import pandas as pd
ind = pd.Index([2, 3, 5, 7, 11])
print(ind)
# Index as immutable array
print(ind[1])
print(ind[::2])
print(ind.size, ind.shape, ind.ndim, ind.dtype)
# Index as ordered set
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
print(indA & indB) # intersection

print(indA | indB) # union
print(indA ^ indB) # symmetric difference



Index([2, 3, 5, 7, 11], dtype='int64')
3
Index([2, 5, 11], dtype='int64')
5 (5,) 1 int64
Index([0, 3, 5, 7, 9], dtype='int64')
Index([3, 3, 5, 7, 11], dtype='int64')
Index([3, 0, 0, 0, 2], dtype='int64')


In [8]:
#data selection in series
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c','d'])
print(data)
data['b']
'a' in data
data.keys()
list(data.items())
data['e'] = 1.25
print(data)







a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64


In [None]:
#series as one-dimensional array
import pandas as pd
# slicing by explicit index
data['a' : 'c']
# slicing by implicit integer index
data[0:2]
# masking
data[(data > 0.3) & (data < 0.8)]
# fancy indexing
data[['a', 'e']]










In [24]:
# indexing: loc, iloc, ix
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data)
# explicit index when indexing
data[1]
# implicit index when slicing
data[1:3]
# loc attribute allows indexing and slicing that always references the explicit index
print(data.loc[1])
print(data.loc[1:3])
# iloc attribute allows indexing and slicing that always references the implicit Python-style index
print(data.iloc[1])
print(data.iloc[1:3])
# ix is a hybrid of the two





1    a
3    b
5    c
dtype: object
a
1    a
3    b
dtype: object
b
3    b
5    c
dtype: object


In [26]:
#dataframe as a dictionary
import pandas as pd
area = pd.Series({'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995})  # dictionary-style indexing of the column
pop = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135})  # dictionary-style indexing of the column
data = pd.DataFrame({'area':area, 'pop':pop})  # dictionary-style indexing of the column
print(data)
print(data['area'])
print(data.area)
data['density'] = data['pop'] / data['area']  # adding a new column
print(data)
print(data.values)



              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
              area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763
[[4.23967000e+05 3.83325210e+07 9.04139261e+01]
 [6.95662000e+05 2.64481930e+07 3.80187404e+01]
 [1.41297000e+05 1.96511270e+07 1.39076746e+02]
 [1.70312000e+05 1.95528600e+07 1.14806121e+02]
 [1.49995000e+05 1.28821350e+07 8.58837628e+01]]


In [43]:
# dataframe as two-dimensional array
data.values
data.T# transpose
data.values[0]# indexing the rows
data['area']# indexing the columns
data.iloc[:3, :2]# slicing
data.loc[:'Illinois', :'pop']# slicing
data.loc[data.density > 100, ['pop', 'density']]# masking and fancy indexing
data.iloc[0, 2] = 90# modifying values
print(data)
data['Florida':'Illinois']# indexing explicit index
data[1:3]# indexing implicit Python-style index
data[data.density > 100]# masking







              area       pop     density
California  423967  38332521   90.000000
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
