# 1. Import Pandas and Numpy

In [230]:
import pandas as pd
import numpy as np

## check version

In [231]:
print(pd.__version__)
print(np.__version__)

2.2.3
2.2.6


# 2. Series

In [232]:
# Series is a one-dimensional labeled array capable of holding any data type 
#         (integers, strings, floating point numbers, Python objects, etc.)

# syntax:- s = pd.Series(data, index=index)

# data can be many different things:
#    a Python dict
#    an ndarray
#    a scalar value (like 5)

## create series

In [233]:
s = pd.Series(np.random.randn(5))
s

0   -0.421987
1   -1.295890
2    0.455593
3    1.381559
4    0.279279
dtype: float64

## Random flot value with custom index

In [234]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a   -0.157516
b   -1.486527
c   -1.007808
d   -0.296790
e   -0.422338
dtype: float64

## Random int value with custom index

In [235]:
s = pd.Series(np.random.randint(0, 15, size = 5), index=["a", "b", "c", "d", "e"])
s

a    6
b    8
c    0
d    3
e    6
dtype: int32

## Series can be instantiated from list

In [236]:
lists = [2, 4, 5, 8, 3]
s = pd.Series(lists, index=["a", "b", "c", "d", "e"])
s

a    2
b    4
c    5
d    8
e    3
dtype: int64

## Series can be instantiated from dicts:

In [237]:
d = {"b": 1, 
     "a": 0, 
     "c": 2}

pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [238]:
# If an index is passed, the values in data corresponding to the labels in the index will be pulled out

In [239]:
d = {"a": 0.0, 
     "b": 1.0, 
     "c": 2.0}

pd.Series(d, index=['b', 'c', 'd', 'a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [240]:
# NaN (not a number) is the standard missing data marker used in pandas.

In [241]:
# If data is a scalar value, an index must be provided. The value will be repeated to match the length of index.

In [242]:
pd.Series(5.0, index=["a", "b", "c", "d", "e"])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

## Series is like ndarray

In [243]:
# Series acts very similarly to a ndarray and is a valid argument to most NumPy functions

In [244]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    1.929855
b    0.655437
c   -0.298145
d   -1.548523
e    0.154093
dtype: float64

In [245]:
s.iloc[0]
s.iloc[3]

np.float64(-1.548523373268338)

In [246]:
s.iloc[:3]

a    1.929855
b    0.655437
c   -0.298145
dtype: float64

In [247]:
s[s > s.median()]

a    1.929855
b    0.655437
dtype: float64

In [248]:
s.iloc[[4, 3, 1]]

e    0.154093
d   -1.548523
b    0.655437
dtype: float64

In [249]:
# Like a NumPy array, a pandas Series has a single dtype.

In [250]:
s.dtype

dtype('float64')

In [251]:
# If you need the actual array backing a Series, use Series.array.

In [252]:
s.array

<NumpyExtensionArray>
[ np.float64(1.9298546273221275),   np.float64(0.655437324685445),
 np.float64(-0.2981449795941059),  np.float64(-1.548523373268338),
 np.float64(0.15409304454240116)]
Length: 5, dtype: float64

In [253]:
# if you need an actual ndarray, then use Series.to_numpy().

In [254]:
s.to_numpy()

array([ 1.92985463,  0.65543732, -0.29814498, -1.54852337,  0.15409304])

In [255]:
# A Series is also like a fixed-size dict in that you can get and set values by index label
s

a    1.929855
b    0.655437
c   -0.298145
d   -1.548523
e    0.154093
dtype: float64

In [256]:
s["a"]

np.float64(1.9298546273221275)

In [257]:
s["e"] = 12.0
s

a     1.929855
b     0.655437
c    -0.298145
d    -1.548523
e    12.000000
dtype: float64

In [258]:
"e" in s

True

In [259]:
"j" in s

False

In [260]:
# Using the Series.get() method, a missing label will return None or specified default

In [261]:
s.get("e")

np.float64(12.0)

In [262]:
s.get("f")

In [263]:
s.get("f", np.nan)

nan

## Vectorized operations and label alignment with Series

In [264]:
s = pd.Series(np.random.randint(5, size = 5), index = ['a', 'b', 'c', 'd', 'e'])
s

a    4
b    1
c    2
d    2
e    2
dtype: int32

In [265]:
s + s 

a    8
b    2
c    4
d    4
e    4
dtype: int32

In [266]:
s * s

a    16
b     1
c     4
d     4
e     4
dtype: int32

In [267]:
# "s.iloc[1:]" This gives you everything except the first value
# "s.iloc[:-1]" This gives you everything except the last value:

In [268]:
s.iloc[1:] + s.iloc[:-1]

a    NaN
b    2.0
c    4.0
d    4.0
e    NaN
dtype: float64

## Name attribute

In [269]:
s = pd.Series(np.random.randint(5, size=5), name="something")
print(s)
print(s.name)

0    3
1    1
2    2
3    3
4    0
Name: something, dtype: int32
something


In [270]:
# We can rename a Series with the pandas.Series.rename() method.

In [271]:
s2 = s
s2.name

'something'

In [272]:
s2 = s.rename("different")
s2

0    3
1    1
2    2
3    3
4    0
Name: different, dtype: int32

In [273]:
s.name

'something'

In [274]:
s.rename('laksh')

0    3
1    1
2    2
3    3
4    0
Name: laksh, dtype: int32

In [275]:
# Note that s and s2 refer to different objects.

## get infomation of Series

In [276]:
print(s.dtype)
print('.')
print(s.ndim)
print('.')
print(s.size)
print('.')
print(s.name)
print('.')
print(s.hasnans)
print('.')
print(s.index)
print('.')
print(s.head(2))
print('.')
print(s.tail(2))
print('.')
print(s.info)

int32
.
1
.
5
.
something
.
False
.
RangeIndex(start=0, stop=5, step=1)
.
0    3
1    1
Name: something, dtype: int32
.
3    3
4    0
Name: something, dtype: int32
.
<bound method Series.info of 0    3
1    1
2    2
3    3
4    0
Name: something, dtype: int32>


# 3. DataFrame

In [277]:
# DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 
# You can think of it like a spreadsheet or SQL table, or a dict of Series objects.

## DataFrame accepts many different kinds of input:
###   i. Dict of Series, 1D ndarrays, lists, dicts, or tuples
###   ii. A Series
###   iiI. 2-D numpy.ndarray
###   iv. Another DataFrame

## From dict from Series

In [278]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [279]:
pd.DataFrame(d, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [280]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "one"])

Unnamed: 0,two,one
d,4.0,
b,2.0,2.0
a,1.0,1.0


In [281]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [282]:
# When a particular set of columns is passed along with a dict of data, 
#                     the passed columns override the keys in the dict.

In [283]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [284]:
df.columns

Index(['one', 'two'], dtype='object')

## From dict of ndarrays / lists

In [285]:
# All ndarrays must share the same length. If an index is passed, it must also be the same length as the arrays.
# If no index is passed, the result will be range(n), where n is the array length.

In [286]:
d = {"one": [1.0, 2.0, 3.0, 4.0], 
     "two": [4.0, 3.0, 2.0, 1.0]}
d

{'one': [1.0, 2.0, 3.0, 4.0], 'two': [4.0, 3.0, 2.0, 1.0]}

In [287]:
pd.DataFrame(d, index=["a", "b", "c", "d"])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [288]:
# DataFrame is not intended to work exactly like a 2-dimensional NumPy ndarray.

## From a list of dicts

In [289]:
data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [290]:
pd.DataFrame(data2, index=["first", "second"])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [291]:
pd.DataFrame(data2, columns=["b", "a"])

Unnamed: 0,b,a
0,2,1
1,10,5


## From a dict of tuples

In [292]:
pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


## From a Series

In [293]:
ser = pd.Series(range(3), index=list("abc"), name="ser")
ser

a    0
b    1
c    2
Name: ser, dtype: int64

In [294]:
pd.DataFrame(ser)

Unnamed: 0,ser
a,0
b,1
c,2


# 4. Column selection, addition, deletion in Dataframe

## Create

In [295]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([4.0, 5.0, 6.0, 7.0], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,5.0
c,3.0,6.0
d,,7.0


# Select

In [296]:
df['two']

a    4.0
b    5.0
c    6.0
d    7.0
Name: two, dtype: float64

In [297]:
df.loc['b']

one    2.0
two    5.0
Name: b, dtype: float64

In [298]:
df.iloc[2]

one    3.0
two    6.0
Name: c, dtype: float64

## Adding

In [299]:
df["three"] = df["one"] * df["two"]
df

Unnamed: 0,one,two,three
a,1.0,4.0,4.0
b,2.0,5.0,10.0
c,3.0,6.0,18.0
d,,7.0,


In [300]:
df["flag"] = df["one"] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,4.0,4.0,False
b,2.0,5.0,10.0,False
c,3.0,6.0,18.0,True
d,,7.0,,False


## Delete

In [301]:
del df["two"]
df

Unnamed: 0,one,three,flag
a,1.0,4.0,False
b,2.0,10.0,False
c,3.0,18.0,True
d,,,False


In [302]:
df["foo"] = "bar"
df

Unnamed: 0,one,three,flag,foo
a,1.0,4.0,False,bar
b,2.0,10.0,False,bar
c,3.0,18.0,True,bar
d,,,False,bar


In [303]:
df["one_trunc"] = df["one"][:2]
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,4.0,False,bar,1.0
b,2.0,10.0,False,bar,2.0
c,3.0,18.0,True,bar,
d,,,False,bar,


## Insert Row

In [304]:
# You can insert raw ndarrays but their length must match the length of the DataFrame’s index.

In [305]:
# By default, columns get inserted at the end.

In [306]:
df.loc['e'] = {
    'one': 4.0,
    'three': 20.0,
    'flag': True,
    'foo': 'bar',
    'one_trunc': 4.0
}
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,4.0,False,bar,1.0
b,2.0,10.0,False,bar,2.0
c,3.0,18.0,True,bar,
d,,,False,bar,
e,4.0,20.0,True,bar,4.0


## Data alignment and arithmetic