# 1. Import Pandas and Numpy

In [113]:
import pandas as pd
import numpy as np

## check version

In [114]:
print(pd.__version__)
print(np.__version__)

2.2.3
2.2.6


# 2. Series

In [115]:
# Series is a one-dimensional labeled array capable of holding any data type 
#         (integers, strings, floating point numbers, Python objects, etc.)

# syntax:- s = pd.Series(data, index=index)

# data can be many different things:
#    a Python dict
#    an ndarray
#    a scalar value (like 5)

## create series

In [116]:
s = pd.Series(np.random.randn(5))
s

0   -0.234838
1    0.141694
2    0.759969
3   -0.113843
4   -0.092653
dtype: float64

## Random flot value with custom index

In [21]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    0.907704
b   -0.162780
c   -1.009450
d    1.187749
e   -0.137872
dtype: float64

## Random int value with custom index

In [104]:
s = pd.Series(np.random.randint(0, 15, size = 5), index=["a", "b", "c", "d", "e"])
s

a     0
b    10
c    12
d     3
e    14
dtype: int32

## Series can be instantiated from list

In [107]:
lists = [2, 4, 5, 8, 3]
s = pd.Series(lists, index=["a", "b", "c", "d", "e"])
s

a    2
b    4
c    5
d    8
e    3
dtype: int64

## Series can be instantiated from dicts:

In [26]:
d = {"b": 1, 
     "a": 0, 
     "c": 2}

pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [None]:
# If an index is passed, the values in data corresponding to the labels in the index will be pulled out

In [28]:
d = {"a": 0.0, 
     "b": 1.0, 
     "c": 2.0}

pd.Series(d, index=['b', 'c', 'd', 'a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [None]:
# NaN (not a number) is the standard missing data marker used in pandas.

In [None]:
# If data is a scalar value, an index must be provided. The value will be repeated to match the length of index.

In [29]:
pd.Series(5.0, index=["a", "b", "c", "d", "e"])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

## Series is like ndarray

In [30]:
# Series acts very similarly to a ndarray and is a valid argument to most NumPy functions

In [34]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    0.153320
b    0.246091
c   -0.019314
d    0.990134
e   -1.682380
dtype: float64

In [36]:
s.iloc[0]
s.iloc[3]

np.float64(0.9901335800271764)

In [37]:
s.iloc[:3]

a    0.153320
b    0.246091
c   -0.019314
dtype: float64

In [38]:
s[s > s.median()]

b    0.246091
d    0.990134
dtype: float64

In [40]:
s.iloc[[4, 3, 1]]

e   -1.682380
d    0.990134
b    0.246091
dtype: float64

In [None]:
# Like a NumPy array, a pandas Series has a single dtype.

In [41]:
s.dtype

dtype('float64')

In [None]:
# If you need the actual array backing a Series, use Series.array.

In [42]:
s.array

<NumpyExtensionArray>
[   np.float64(0.1533196750777314),   np.float64(0.24609147390192315),
 np.float64(-0.019313705957263727),    np.float64(0.9901335800271764),
   np.float64(-1.6823799401607755)]
Length: 5, dtype: float64

In [None]:
# if you need an actual ndarray, then use Series.to_numpy().

In [43]:
s.to_numpy()

array([ 0.15331968,  0.24609147, -0.01931371,  0.99013358, -1.68237994])

In [44]:
# A Series is also like a fixed-size dict in that you can get and set values by index label
s

a    0.153320
b    0.246091
c   -0.019314
d    0.990134
e   -1.682380
dtype: float64

In [45]:
s["a"]

np.float64(0.1533196750777314)

In [47]:
s["e"] = 12.0
s

a     0.153320
b     0.246091
c    -0.019314
d     0.990134
e    12.000000
dtype: float64

In [50]:
"e" in s

True

In [51]:
"j" in s

False

In [None]:
# Using the Series.get() method, a missing label will return None or specified default

In [64]:
s.get("e")

np.float64(12.0)

In [65]:
s.get("f")

In [66]:
s.get("f", np.nan)

nan

## Vectorized operations and label alignment with Series

In [80]:
s = pd.Series(np.random.randint(5, size = 5), index = ['a', 'b', 'c', 'd', 'e'])
s

a    0
b    4
c    3
d    4
e    4
dtype: int32

In [81]:
s + s 

a    0
b    8
c    6
d    8
e    8
dtype: int32

In [82]:
s * s

a     0
b    16
c     9
d    16
e    16
dtype: int32

In [73]:
# "s.iloc[1:]" This gives you everything except the first value
# "s.iloc[:-1]" This gives you everything except the last value:

In [84]:
s.iloc[1:] + s.iloc[:-1]

b    8
c    6
d    8
e    8
dtype: int32

## Name attribute

In [91]:
s = pd.Series(np.random.randint(5, size=5), name="something")
print(s)
print(s.name)

0    2
1    1
2    0
3    1
4    2
Name: something, dtype: int32
something


In [None]:
# We can rename a Series with the pandas.Series.rename() method.

In [93]:
s2 = s
s2.name

'something'

In [95]:
s2 = s.rename("different")
s2

0    2
1    1
2    0
3    1
4    2
Name: different, dtype: int32

In [96]:
s.name

'something'

In [97]:
s.rename('laksh')

0    2
1    1
2    0
3    1
4    2
Name: laksh, dtype: int32

In [98]:
# Note that s and s2 refer to different objects.

## get infomation of Series

In [101]:
print(s.dtype)
print('.')
print(s.ndim)
print('.')
print(s.size)
print('.')
print(s.name)
print('.')
print(s.hasnans)
print('.')
print(s.index)
print('.')
print(s.head(2))
print('.')
print(s.tail(2))
print('.')
print(s.info)

int32
.
1
.
5
.
something
.
False
.
RangeIndex(start=0, stop=5, step=1)
.
0    2
1    1
Name: something, dtype: int32
.
3    1
4    2
Name: something, dtype: int32
.
<bound method Series.info of 0    2
1    1
2    0
3    1
4    2
Name: something, dtype: int32>


# 3. DataFrame

In [None]:
# DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 
# You can think of it like a spreadsheet or SQL table, or a dict of Series objects.

In [None]:
# DataFrame accepts many different kinds of input:
#     A Series
#     Dict of 1D ndarrays, lists, dicts, or Series
#     2-D numpy.ndarray
#     Structured or record ndarray
#     Another DataFrame

In [108]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [109]:
pd.DataFrame(d, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [111]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "one"])

Unnamed: 0,two,one
d,4.0,
b,2.0,2.0
a,1.0,1.0


In [112]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,
