# 1. Import Pandas and Numpy

In [1]:
import pandas as pd
import numpy as np

## check version

In [2]:
print(pd.__version__)
print(np.__version__)

2.3.1
2.3.2


# 2. Series

In [3]:
# Series is a one-dimensional labeled array capable of holding any data type 
#         (integers, strings, floating point numbers, Python objects, etc.)

# syntax:- s = pd.Series(data, index=index)

# data can be many different things:
#    a Python dict
#    an ndarray
#    a scalar value (like 5)

## create series

In [4]:
s = pd.Series(np.random.randn(5))
s

0    0.590602
1    0.759911
2    1.490200
3    0.260162
4    0.106801
dtype: float64

## Random flot value with custom index

In [5]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    1.390346
b    1.500891
c   -0.971622
d   -0.484385
e   -0.505629
dtype: float64

## Random int value with custom index

In [6]:
s = pd.Series(np.random.randint(0, 15, size = 5), index=["a", "b", "c", "d", "e"])
s

a     9
b     9
c     7
d     9
e    12
dtype: int32

## Series can be instantiated from list

In [7]:
lists = [2, 4, 5, 8, 3]
s = pd.Series(lists, index=["a", "b", "c", "d", "e"])
s

a    2
b    4
c    5
d    8
e    3
dtype: int64

## Series can be instantiated from dicts:

In [8]:
d = {"b": 1, 
     "a": 0, 
     "c": 2}

pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [9]:
# If an index is passed, the values in data corresponding to the labels in the index will be pulled out

In [10]:
d = {"a": 0.0, 
     "b": 1.0, 
     "c": 2.0}

pd.Series(d, index=['b', 'c', 'd', 'a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [11]:
# NaN (not a number) is the standard missing data marker used in pandas.

In [12]:
# If data is a scalar value, an index must be provided. The value will be repeated to match the length of index.

In [13]:
pd.Series(5.0, index=["a", "b", "c", "d", "e"])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

## Series is like ndarray

In [14]:
# Series acts very similarly to a ndarray and is a valid argument to most NumPy functions

In [15]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a   -0.268935
b    0.735373
c    0.653064
d    1.052138
e    0.087796
dtype: float64

In [16]:
s.iloc[0]
s.iloc[3]

np.float64(1.0521380905599567)

In [17]:
s.iloc[:3]

a   -0.268935
b    0.735373
c    0.653064
dtype: float64

In [18]:
s[s > s.median()]

b    0.735373
d    1.052138
dtype: float64

In [19]:
s.iloc[[4, 3, 1]]

e    0.087796
d    1.052138
b    0.735373
dtype: float64

In [20]:
# Like a NumPy array, a pandas Series has a single dtype.

In [21]:
s.dtype

dtype('float64')

In [22]:
# If you need the actual array backing a Series, use Series.array.

In [23]:
s.array

<NumpyExtensionArray>
[-0.2689348292984969,  0.7353728047407346,  0.6530635909276967,
  1.0521380905599567, 0.08779597168611915]
Length: 5, dtype: float64

In [24]:
# if you need an actual ndarray, then use Series.to_numpy().

In [25]:
s.to_numpy()

array([-0.26893483,  0.7353728 ,  0.65306359,  1.05213809,  0.08779597])

In [26]:
# A Series is also like a fixed-size dict in that you can get and set values by index label
s

a   -0.268935
b    0.735373
c    0.653064
d    1.052138
e    0.087796
dtype: float64

In [27]:
s["a"]

np.float64(-0.2689348292984969)

In [28]:
s["e"] = 12.0
s

a    -0.268935
b     0.735373
c     0.653064
d     1.052138
e    12.000000
dtype: float64

In [29]:
"e" in s

True

In [30]:
"j" in s

False

In [31]:
# Using the Series.get() method, a missing label will return None or specified default

In [32]:
s.get("e")

np.float64(12.0)

In [33]:
s.get("f")

In [34]:
s.get("f", np.nan)

nan

## Vectorized operations and label alignment with Series

In [35]:
s = pd.Series(np.random.randint(5, size = 5), index = ['a', 'b', 'c', 'd', 'e'])
s

a    3
b    3
c    1
d    3
e    0
dtype: int32

In [36]:
s + s 

a    6
b    6
c    2
d    6
e    0
dtype: int32

In [37]:
s * s

a    9
b    9
c    1
d    9
e    0
dtype: int32

In [38]:
# "s.iloc[1:]" This gives you everything except the first value
# "s.iloc[:-1]" This gives you everything except the last value:

In [39]:
s.iloc[1:] + s.iloc[:-1]

a    NaN
b    6.0
c    2.0
d    6.0
e    NaN
dtype: float64

## Name attribute

In [40]:
s = pd.Series(np.random.randint(5, size=5), name="something")
print(s)
print(s.name)

0    4
1    3
2    0
3    3
4    3
Name: something, dtype: int32
something


In [41]:
# We can rename a Series with the pandas.Series.rename() method.

In [42]:
s2 = s
s2.name

'something'

In [43]:
s2 = s.rename("different")
s2

0    4
1    3
2    0
3    3
4    3
Name: different, dtype: int32

In [44]:
s.name

'something'

In [45]:
s.rename('laksh')

0    4
1    3
2    0
3    3
4    3
Name: laksh, dtype: int32

In [46]:
# Note that s and s2 refer to different objects.

## get infomation of Series

In [47]:
print(s.dtype)
print('.')
print(s.ndim)
print('.')
print(s.size)
print('.')
print(s.name)
print('.')
print(s.hasnans)
print('.')
print(s.index)
print('.')
print(s.head(2))
print('.')
print(s.tail(2))
print('.')
print(s.info)

int32
.
1
.
5
.
something
.
False
.
RangeIndex(start=0, stop=5, step=1)
.
0    4
1    3
Name: something, dtype: int32
.
3    3
4    3
Name: something, dtype: int32
.
<bound method Series.info of 0    4
1    3
2    0
3    3
4    3
Name: something, dtype: int32>


## Merge two Series

In [48]:
import pandas as pd

s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([4, 5, 6], index=['a', 'b', 'c'])

df = pd.concat([s1, s2], axis=1)
df.columns = ['Series1', 'Series2']
print(df)


   Series1  Series2
a        1        4
b        2        5
c        3        6


## unique categories

In [49]:
s = pd.Series(['p', 'q', 'r', 's', 'q'], dtype="category")
s = s.cat.add_categories(['t','l'])
s

0    p
1    q
2    r
3    s
4    q
dtype: category
Categories (6, object): ['p', 'q', 'r', 's', 't', 'l']

## Series Releted all function

In [50]:
import pandas as pd
import numpy as np

# Sample Series with one missing value (NaN)
# We use this same series for all functions so you can compare the results easily.
s = pd.Series([10, 20, 30, None, 40, 50])
print("Original Series:\n", s, "\n")

# 1. count() → Counts only non-missing (non-NaN) values
print("count       →", s.count())

# 2. sum() → Adds up all non-missing values in the series
print("sum         →", s.sum())

# 3. mean() → Calculates the average (sum of values ÷ count)
print("mean        →", s.mean())

# 4. median() → Finds the middle value when data is sorted
print("median      →", s.median())

# 5. min() → Finds the smallest value in the series
print("min         →", s.min())

# 6. max() → Finds the largest value in the series
print("max         →", s.max())

# 7. mode() → Returns the most frequently occurring value(s)
print("mode        →", s.mode().tolist())

# 8. abs() → Returns the absolute value of each element
# (negative numbers become positive, positive numbers stay same)
print("abs         →\n", s.abs())

# 9. prod() → Multiplies all non-missing values together
print("prod        →", s.prod(skipna=True))

# 10. std() → Standard deviation (how spread out the values are from the mean)
print("std         →", s.std())

# 11. var() → Variance (square of standard deviation, measure of spread)
print("var         →", s.var())

# 12. sem() → Standard error of the mean (std ÷ sqrt(count)), used in statistics
print("sem         →", s.sem())

# 13. skew() → Measures asymmetry of the data distribution
# (0 means perfectly symmetrical, positive means right-skewed, negative means left-skewed)
print("skew        →", s.skew())

# 14. kurt() → Measures the 'tailedness' of the distribution
# (positive = heavy tails, negative = light tails compared to normal distribution)
print("kurt        →", s.kurt())

# 15. quantile(p) → Returns the value at the given percentile 'p'
# e.g., 0.5 means 50% (median), 0.25 means 25% (1st quartile)
print("quantile(0.5) →", s.quantile(0.5))

# 16. cumsum() → Cumulative sum (adds each value to all previous ones)
print("cumsum      →\n", s.cumsum())

# 17. cumprod() → Cumulative product (multiplies each value with all previous ones)
print("cumprod     →\n", s.cumprod())

# 18. cummax() → Cumulative maximum (largest value so far at each position)
print("cummax      →\n", s.cummax())

# 19. cummin() → Cumulative minimum (smallest value so far at each position)
print("cummin      →\n", s.cummin())

# 20. value_counts() → Counts how many times each unique value appears
print("value_counts→\n", s.value_counts())

# 21. unique() → Returns all unique values (NaN is included if present)
print("unique      →", s.unique())

# 22. nlargest(n) → Returns the top 'n' largest values
print("nlargest(3) →\n", s.nlargest(3))

# 23. nsmallest(n) → Returns the top 'n' smallest values
print("nsmallest(3)→\n", s.nsmallest(3))


Original Series:
 0    10.0
1    20.0
2    30.0
3     NaN
4    40.0
5    50.0
dtype: float64 

count       → 5
sum         → 150.0
mean        → 30.0
median      → 30.0
min         → 10.0
max         → 50.0
mode        → [10.0, 20.0, 30.0, 40.0, 50.0]
abs         →
 0    10.0
1    20.0
2    30.0
3     NaN
4    40.0
5    50.0
dtype: float64
prod        → 12000000.0
std         → 15.811388300841896
var         → 250.0
sem         → 7.071067811865475
skew        → 0.0
kurt        → -1.2000000000000002
quantile(0.5) → 30.0
cumsum      →
 0     10.0
1     30.0
2     60.0
3      NaN
4    100.0
5    150.0
dtype: float64
cumprod     →
 0          10.0
1         200.0
2        6000.0
3           NaN
4      240000.0
5    12000000.0
dtype: float64
cummax      →
 0    10.0
1    20.0
2    30.0
3     NaN
4    40.0
5    50.0
dtype: float64
cummin      →
 0    10.0
1    10.0
2    10.0
3     NaN
4    10.0
5    10.0
dtype: float64
value_counts→
 10.0    1
20.0    1
30.0    1
40.0    1
50.0    1
Name: co

# 3. DataFrame

In [51]:
# DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 
# You can think of it like a spreadsheet or SQL table, or a dict of Series objects.

## DataFrame accepts many different kinds of input:
###   i. Dict of Series, 1D ndarrays, lists, dicts, or tuples
###   ii. A Series
###   iiI. 2-D numpy.ndarray
###   iv. Another DataFrame

## From dict from Series

In [52]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [53]:
pd.DataFrame(d, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [54]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "one"])

Unnamed: 0,two,one
d,4.0,
b,2.0,2.0
a,1.0,1.0


In [55]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [56]:
# When a particular set of columns is passed along with a dict of data, 
#                     the passed columns override the keys in the dict.

In [57]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [58]:
df.columns

Index(['one', 'two'], dtype='object')

## From dict of ndarrays / lists

In [59]:
# All ndarrays must share the same length. If an index is passed, it must also be the same length as the arrays.
# If no index is passed, the result will be range(n), where n is the array length.

In [60]:
d = {"one": [1.0, 2.0, 3.0, 4.0], 
     "two": [4.0, 3.0, 2.0, 1.0]}
d

{'one': [1.0, 2.0, 3.0, 4.0], 'two': [4.0, 3.0, 2.0, 1.0]}

In [61]:
pd.DataFrame(d, index=["a", "b", "c", "d"])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [62]:
# DataFrame is not intended to work exactly like a 2-dimensional NumPy ndarray.

## From a list of dicts

In [63]:
data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [64]:
pd.DataFrame(data2, index=["first", "second"])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [65]:
pd.DataFrame(data2, columns=["b", "a"])

Unnamed: 0,b,a
0,2,1
1,10,5


## From a dict of tuples

In [66]:
pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


## From a Series

In [67]:
ser = pd.Series(range(3), index=list("abc"), name="ser")
ser

a    0
b    1
c    2
Name: ser, dtype: int64

In [68]:
pd.DataFrame(ser)

Unnamed: 0,ser
a,0
b,1
c,2


# 4. Column selection, addition, deletion in Dataframe

## Create

In [69]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([4.0, 5.0, 6.0, 7.0], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,5.0
c,3.0,6.0
d,,7.0


# Select

In [70]:
df['two']

a    4.0
b    5.0
c    6.0
d    7.0
Name: two, dtype: float64

In [71]:
df.loc['b']

one    2.0
two    5.0
Name: b, dtype: float64

In [72]:
df.iloc[2]

one    3.0
two    6.0
Name: c, dtype: float64

In [73]:
value = df.loc['c', 'two']
value

np.float64(6.0)

In [74]:
value = df.iloc[1, 1]
value

np.float64(5.0)

In [75]:
# Interate Dataframe to display the columns
import pandas as pd

data = {
    'Student': ["Laksh", "Amit", "John", "Jakob", "Devid", "Steve"],
    'Rank': [1, 2, 3, 4, 5, 6],
    'Marks': [99, 98, 97, 96, 95, 94],
}

# Use the index argument to set your Index. 
df = pd.DataFrame(data, index=["Student_1", "Student_2", "Student_3", "Student_4", "Student_5", "Student_6"])
print(df)

for col in df:
    print(col)
    print(df[col].values)

          Student  Rank  Marks
Student_1   Laksh     1     99
Student_2    Amit     2     98
Student_3    John     3     97
Student_4   Jakob     4     96
Student_5   Devid     5     95
Student_6   Steve     6     94
Student
['Laksh' 'Amit' 'John' 'Jakob' 'Devid' 'Steve']
Rank
[1 2 3 4 5 6]
Marks
[99 98 97 96 95 94]


## Adding

In [76]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([4.0, 5.0, 6.0, 7.0], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,5.0
c,3.0,6.0
d,,7.0


In [77]:
df["three"] = df["one"] * df["two"]
df

Unnamed: 0,one,two,three
a,1.0,4.0,4.0
b,2.0,5.0,10.0
c,3.0,6.0,18.0
d,,7.0,


In [78]:
df["flag"] = df["one"] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,4.0,4.0,False
b,2.0,5.0,10.0,False
c,3.0,6.0,18.0,True
d,,7.0,,False


## Delete

In [79]:
del df["two"]
df

Unnamed: 0,one,three,flag
a,1.0,4.0,False
b,2.0,10.0,False
c,3.0,18.0,True
d,,,False


In [80]:
df["foo"] = "bar"
df

Unnamed: 0,one,three,flag,foo
a,1.0,4.0,False,bar
b,2.0,10.0,False,bar
c,3.0,18.0,True,bar
d,,,False,bar


In [81]:
df["one_trunc"] = df["one"][:2]
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,4.0,False,bar,1.0
b,2.0,10.0,False,bar,2.0
c,3.0,18.0,True,bar,
d,,,False,bar,


## Insert Row

In [82]:
# You can insert raw ndarrays but their length must match the length of the DataFrame’s index.

In [83]:
# By default, columns get inserted at the end.

In [84]:
df.loc['e'] = {
    'one': 4.0,
    'three': 20.0,
    'flag': True,
    'foo': 'bar',
    'one_trunc': 4.0
}
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,4.0,False,bar,1.0
b,2.0,10.0,False,bar,2.0
c,3.0,18.0,True,bar,
d,,,False,bar,
e,4.0,20.0,True,bar,4.0


## DataFrame Attribute

In [85]:
data = {
    'Student': ["Laksh", "Amit", "John", "Jakob", "Devid", "Steve"],
    'Rank': [1, 2, 3, 4, 5, 6],
    'Marks': [99, 98, 97, 96, 95, 94],
}

# Use the index argument to set your Index. 
df = pd.DataFrame(data, index=["Student_1", "Student_2", "Student_3", "Student_4", "Student_5", "Student_6"])
print(df)

          Student  Rank  Marks
Student_1   Laksh     1     99
Student_2    Amit     2     98
Student_3    John     3     97
Student_4   Jakob     4     96
Student_5   Devid     5     95
Student_6   Steve     6     94


In [86]:
print(df.dtypes)
print('.')
print(df.ndim)
print('.')
print(df.size)
print('.')
print(df.shape)
print('.')
print(df.index)
print('.')
print(df.T)
print('.')
print(df.head(2))
print('.')
print(df.tail(2))

Student    object
Rank        int64
Marks       int64
dtype: object
.
2
.
18
.
(6, 3)
.
Index(['Student_1', 'Student_2', 'Student_3', 'Student_4', 'Student_5',
       'Student_6'],
      dtype='object')
.
        Student_1 Student_2 Student_3 Student_4 Student_5 Student_6
Student     Laksh      Amit      John     Jakob     Devid     Steve
Rank            1         2         3         4         5         6
Marks          99        98        97        96        95        94
.
          Student  Rank  Marks
Student_1   Laksh     1     99
Student_2    Amit     2     98
.
          Student  Rank  Marks
Student_5   Devid     5     95
Student_6   Steve     6     94


## Data alignment and arithmetic

In [87]:
df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])

df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
print(df)
print('.')
print(df2)

          A         B         C         D
0 -1.191509 -0.536494 -0.002057  1.725754
1 -1.540053 -1.617953  0.112477  2.009726
2  1.040274 -0.049491  1.395413  0.085730
3  0.853755 -0.258273 -0.134567 -1.019501
4  0.175346  0.910376  2.567082  0.728113
5  1.142701  1.321979 -1.096610  0.185904
6  0.850437  1.272456  0.188868  1.625314
7  0.986921  0.524620 -0.766362 -0.329652
8 -1.265484  0.608503  0.754789 -0.038753
9 -0.045956  0.960628 -0.148223  0.348221
.
          A         B         C
0  1.045783  1.462392  0.968811
1  1.476169  1.252599  0.180473
2 -0.354172  1.381941  1.258117
3 -0.147497  0.073381 -1.299730
4  0.940903 -1.226440 -0.039235
5 -0.039446  0.706573 -0.095260
6  0.795487 -0.082181 -0.807994


In [88]:
df + df2

Unnamed: 0,A,B,C,D
0,-0.145726,0.925898,0.966754,
1,-0.063885,-0.365354,0.29295,
2,0.686102,1.33245,2.653531,
3,0.706258,-0.184891,-1.434297,
4,1.116248,-0.316064,2.527846,
5,1.103255,2.028551,-1.19187,
6,1.645925,1.190275,-0.619125,
7,,,,
8,,,,
9,,,,


In [89]:
df * 5 + 2

Unnamed: 0,A,B,C,D
0,-3.957545,-0.682469,1.989715,10.628768
1,-5.700266,-6.089763,2.562387,12.048631
2,7.201371,1.752546,8.977067,2.428652
3,6.268774,0.708636,1.327165,-3.097504
4,2.876729,6.551882,14.835408,5.640566
5,7.713505,8.609894,-3.483048,2.929519
6,6.252187,8.36228,2.944342,10.126568
7,6.934604,4.6231,-1.831809,0.35174
8,-4.327422,5.042514,5.773947,1.806237
9,1.770219,6.80314,1.258885,3.741105


In [90]:
1 / df

Unnamed: 0,A,B,C,D
0,-0.839272,-1.863954,-486.167547,0.579457
1,-0.649328,-0.618065,8.890669,0.49758
2,0.961285,-20.205741,0.716633,11.664486
3,1.171296,-3.871876,-7.431242,-0.980872
4,5.703016,1.098447,0.389547,1.373413
5,0.87512,0.756442,-0.911902,5.379126
6,1.175866,0.785882,5.294692,0.615266
7,1.013253,1.906141,-1.304867,-3.033502
8,-0.790211,1.643378,1.324873,-25.804693
9,-21.75982,1.040986,-6.746594,2.87174


### boolean Operation

In [91]:
df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)

df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)

print(df1)
print('.')
print(df2)

       a      b
0   True  False
1  False   True
2   True   True
.
       a      b
0  False   True
1   True   True
2   True  False


In [92]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [93]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [94]:
df[:5].T

Unnamed: 0,0,1,2,3,4
A,-1.191509,-1.540053,1.040274,0.853755,0.175346
B,-0.536494,-1.617953,-0.049491,-0.258273,0.910376
C,-0.002057,0.112477,1.395413,-0.134567,2.567082
D,1.725754,2.009726,0.08573,-1.019501,0.728113


## Merge DataFrame

In [95]:
import pandas as pd

data1 = {
    'id': ["S01", "S02", "S03", "S04", "S05"],
    'Student': ["Laksh", "Taksh", "Daksh", "Taksh", "Darsh"],
    'Roll': [101, 102, 103, 104, 105],
}

data2 = {
    'Rank': [3, 2, 4, 5, 1],
    'Marks': [12, 35, 53, 45, 23]
}

dataFrame1 = pd.DataFrame(data1)
dataFrame2 = pd.DataFrame(data2)


resDf = dataFrame1.join(dataFrame2)
print(resDf)

print(resDf.T)

    id Student  Roll  Rank  Marks
0  S01   Laksh   101     3     12
1  S02   Taksh   102     2     35
2  S03   Daksh   103     4     53
3  S04   Taksh   104     5     45
4  S05   Darsh   105     1     23
             0      1      2      3      4
id         S01    S02    S03    S04    S05
Student  Laksh  Taksh  Daksh  Taksh  Darsh
Roll       101    102    103    104    105
Rank         3      2      4      5      1
Marks       12     35     53     45     23


In [96]:
import pandas as pd

s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([4, 5, 6], index=['a', 'b', 'c'])

df = pd.concat([s1, s2], axis=1)
df.columns = ['Series1', 'Series2']

print(df)


   Series1  Series2
a        1        4
b        2        5
c        3        6


In [97]:
import pandas as pd

data1 = {
    'id': ["S01", "S02", "S03", "S04", "S05"],
    'Student': ["Laksh", "Taksh", "Daksh", "Taksh", "Darsh"],
    'Roll': [101, 102, 103, 104, 105],
}

data2 = {
    'id': ["S06", "S07", "S08", "S09", "S10"],
    'Student': ["Paresh", "Jayesh", "Suresh", "Rajesh", "Naresh"],
    'Roll': [106, 107, 108, 109, 110],
}

dataFrame1 = pd.DataFrame(data1, index=["Student_1", "Student_2", "Student_3", "Student_4", "Student_5",])
dataFrame2 = pd.DataFrame(data2, index=[ "Student_6", "Student_7", "Student_8", "Student_9", "Student_10",])

resDf = pd.concat([dataFrame1, dataFrame2])
print(resDf)

             id Student  Roll
Student_1   S01   Laksh   101
Student_2   S02   Taksh   102
Student_3   S03   Daksh   103
Student_4   S04   Taksh   104
Student_5   S05   Darsh   105
Student_6   S06  Paresh   106
Student_7   S07  Jayesh   107
Student_8   S08  Suresh   108
Student_9   S09  Rajesh   109
Student_10  S10  Naresh   110


# 5. Essential basic functionality

In [98]:
index = pd.date_range("1/1/2000", periods=8)

s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])
print(df)
val = df.to_numpy()
print(val)

                   A         B         C
2000-01-01 -1.317295 -0.468126  0.728729
2000-01-02 -0.270636  0.815908  1.241371
2000-01-03 -1.827863 -0.557270 -1.395688
2000-01-04  0.294813 -1.733121 -1.648971
2000-01-05  1.012599  1.263191  0.426313
2000-01-06  0.628102 -0.338300  1.257148
2000-01-07 -0.383281 -0.904483  0.731344
2000-01-08  0.998039  1.560781  0.004450
[[-1.31729518 -0.46812617  0.72872932]
 [-0.27063567  0.81590775  1.24137124]
 [-1.82786325 -0.55726995 -1.39568791]
 [ 0.29481257 -1.73312072 -1.64897056]
 [ 1.01259927  1.26319057  0.42631282]
 [ 0.6281019  -0.33829964  1.25714754]
 [-0.38328141 -0.90448267  0.73134431]
 [ 0.99803941  1.56078059  0.00445023]]


## DataFrame Functions

In [99]:
import pandas as pd
import numpy as np

# Sample dataset for demonstration
data = {
    "Name": ["Aarav", "Diya", "Kunal", "Meera", "Rohan", "Isha", "Aryan", "Tanvi"],
    "Department": ["HR", "IT", "Finance", "IT", "Finance", "HR", "Finance", "IT"],
    "Salary": [50000, 60000, 75000, 65000, 70000, 52000, 72000, 58000],
    "Joining_Year": [2018, 2019, 2017, 2020, 2018, 2019, 2017, 2020]
}

df = pd.DataFrame(data)

# ==============================
# 1. head()
# ==============================
# PURPOSE: Returns the first 'n' rows of the DataFrame.
# WHY USE IT? To quickly preview the start of your dataset without printing all rows.
print("1. head() →\n", df.head(), "\n")

# ==============================
# 2. tail()
# ==============================
# PURPOSE: Returns the last 'n' rows of the DataFrame.
# WHY USE IT? Useful to see the ending records and check dataset completeness.
print("2. tail() →\n", df.tail(), "\n")

# ==============================
# 3. info()
# ==============================
# PURPOSE: Displays DataFrame structure, column names, data types, and null count.
# WHY USE IT? Helps in understanding dataset metadata before analysis.
print("3. info() →")
print(df.info(), "\n")

# ==============================
# 4. describe()
# ==============================
# PURPOSE: Gives statistical summary for numeric columns.
# WHY USE IT? Quickly checks min, max, mean, quartiles for data insights.
print("4. describe() →\n", df.describe(), "\n")

# ==============================
# 5. shape
# ==============================
# PURPOSE: Returns a tuple (rows, columns) showing dataset dimensions.
# WHY USE IT? Essential for understanding dataset size.
print("5. shape →", df.shape, "\n")

# ==============================
# 6. columns
# ==============================
# PURPOSE: Lists all column names.
# WHY USE IT? Useful when you forget column spelling or want to rename them.
print("6. columns →", df.columns, "\n")

# ==============================
# 7. index
# ==============================
# PURPOSE: Returns the index (row labels) of the DataFrame.
# WHY USE IT? Helps in row selection and indexing operations.
print("7. index →", df.index, "\n")

# ==============================
# 8. dtypes
# ==============================
# PURPOSE: Shows the data type for each column.
# WHY USE IT? Ensures correct types for calculations and operations.
print("8. dtypes →\n", df.dtypes, "\n")

# ==============================
# 9. sort_values()
# ==============================
# PURPOSE: Sorts rows by one or more columns.
# WHY USE IT? Helps in ranking, ordering, or prioritizing data.
print("9. sort_values('Salary') →\n", df.sort_values(by="Salary"), "\n")

# ==============================
# 10. groupby()
# ==============================
# PURPOSE: Groups data based on a column and performs aggregate calculations.
# WHY USE IT? Summarizes large datasets by category.
print("10. groupby('Department').mean() →\n", df.groupby("Department").mean(numeric_only=True), "\n")

# ==============================
# 11. value_counts()
# ==============================
# PURPOSE: Counts the occurrence of each unique value in a column.
# WHY USE IT? Great for frequency distribution analysis.
print("11. value_counts('Department') →\n", df["Department"].value_counts(), "\n")

# ==============================
# 12. isnull()
# ==============================
# PURPOSE: Checks for missing values (NaN) in dataset.
# WHY USE IT? Important for data cleaning before analysis.
print("12. isnull() →\n", df.isnull(), "\n")

# ==============================
# 13. dropna()
# ==============================
# PURPOSE: Removes rows or columns with missing values.
# WHY USE IT? Keeps dataset clean by removing incomplete records.
print("13. dropna() →\n", df.dropna(), "\n")

# ==============================
# 14. fillna()
# ==============================
# PURPOSE: Replaces NaN values with a specified value.
# WHY USE IT? Retains data completeness without dropping rows.
print("14. fillna(0) →\n", df.fillna(0), "\n")

# ==============================
# 15. apply()
# ==============================
# PURPOSE: Applies a function to each column or row.
# WHY USE IT? Useful for custom calculations.
print("15. apply(lambda x: x) →\n", df.apply(lambda x: x), "\n")

# ==============================
# 16. pivot_table()
# ==============================
# PURPOSE: Creates a pivot table for summarizing data.
# WHY USE IT? Best for multidimensional data analysis.
print("16. pivot_table() →\n", df.pivot_table(values="Salary", index="Department", aggfunc="mean"), "\n")

# ==============================
# 17. duplicated()
# ==============================
# PURPOSE: Detects duplicate rows.
# WHY USE IT? Helps in data cleaning by finding repeated entries.
print("17. duplicated() →\n", df.duplicated(), "\n")

# ==============================
# 18. drop_duplicates()
# ==============================
# PURPOSE: Removes duplicate rows.
# WHY USE IT? Ensures data uniqueness.
print("18. drop_duplicates() →\n", df.drop_duplicates(), "\n")

# ==============================
# 19. rename()
# ==============================
# PURPOSE: Renames one or more columns.
# WHY USE IT? Improves column readability and correctness.
print("19. rename(columns={'Name': 'Full_Name'}) →\n", df.rename(columns={"Name": "Full_Name"}), "\n")

# ==============================
# 20. set_index()
# ==============================
# PURPOSE: Sets a column as the index of the DataFrame.
# WHY USE IT? Useful for indexed data access and better row labeling.
print("20. set_index('Name') →\n", df.set_index("Name"), "\n")


1. head() →
     Name Department  Salary  Joining_Year
0  Aarav         HR   50000          2018
1   Diya         IT   60000          2019
2  Kunal    Finance   75000          2017
3  Meera         IT   65000          2020
4  Rohan    Finance   70000          2018 

2. tail() →
     Name Department  Salary  Joining_Year
3  Meera         IT   65000          2020
4  Rohan    Finance   70000          2018
5   Isha         HR   52000          2019
6  Aryan    Finance   72000          2017
7  Tanvi         IT   58000          2020 

3. info() →
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          8 non-null      object
 1   Department    8 non-null      object
 2   Salary        8 non-null      int64 
 3   Joining_Year  8 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 388.0+ bytes
None 

4. describe() →
              Salary  Jo

In [100]:
df

Unnamed: 0,Name,Department,Salary,Joining_Year
0,Aarav,HR,50000,2018
1,Diya,IT,60000,2019
2,Kunal,Finance,75000,2017
3,Meera,IT,65000,2020
4,Rohan,Finance,70000,2018
5,Isha,HR,52000,2019
6,Aryan,Finance,72000,2017
7,Tanvi,IT,58000,2020


In [101]:
check = pd.DataFrame({
    "Name": ["Aarav", "Diya", "Kunal", "Meera", "Rohan", "Isha"]
})
check

Unnamed: 0,Name
0,Aarav
1,Diya
2,Kunal
3,Meera
4,Rohan
5,Isha


In [102]:
for i, j in check.iterrows():
    print(i)
    print(j)

0
Name    Aarav
Name: 0, dtype: object
1
Name    Diya
Name: 1, dtype: object
2
Name    Kunal
Name: 2, dtype: object
3
Name    Meera
Name: 3, dtype: object
4
Name    Rohan
Name: 4, dtype: object
5
Name    Isha
Name: 5, dtype: object


In [103]:
for i, j in df.iterrows():
    print(i)
    print(j)

0
Name            Aarav
Department         HR
Salary          50000
Joining_Year     2018
Name: 0, dtype: object
1
Name             Diya
Department         IT
Salary          60000
Joining_Year     2019
Name: 1, dtype: object
2
Name              Kunal
Department      Finance
Salary            75000
Joining_Year       2017
Name: 2, dtype: object
3
Name            Meera
Department         IT
Salary          65000
Joining_Year     2020
Name: 3, dtype: object
4
Name              Rohan
Department      Finance
Salary            70000
Joining_Year       2018
Name: 4, dtype: object
5
Name             Isha
Department         HR
Salary          52000
Joining_Year     2019
Name: 5, dtype: object
6
Name              Aryan
Department      Finance
Salary            72000
Joining_Year       2017
Name: 6, dtype: object
7
Name            Tanvi
Department         IT
Salary          58000
Joining_Year     2020
Name: 7, dtype: object


In [104]:
for i, row in df.iterrows():
    print(f"Index: {i}")
    print(f"Name: {row['Name']}")
    print(f"Department: {row['Department']}")
    print(f"Salary: {row['Salary']}")
    print(f"Joining Year: {row['Joining_Year']}")
    print("------")


Index: 0
Name: Aarav
Department: HR
Salary: 50000
Joining Year: 2018
------
Index: 1
Name: Diya
Department: IT
Salary: 60000
Joining Year: 2019
------
Index: 2
Name: Kunal
Department: Finance
Salary: 75000
Joining Year: 2017
------
Index: 3
Name: Meera
Department: IT
Salary: 65000
Joining Year: 2020
------
Index: 4
Name: Rohan
Department: Finance
Salary: 70000
Joining Year: 2018
------
Index: 5
Name: Isha
Department: HR
Salary: 52000
Joining Year: 2019
------
Index: 6
Name: Aryan
Department: Finance
Salary: 72000
Joining Year: 2017
------
Index: 7
Name: Tanvi
Department: IT
Salary: 58000
Joining Year: 2020
------


## Date and time

In [105]:
s = pd.Series(pd.date_range("20130101 09:10:12", periods=4))
print(s)
print('.')
print(s.dt.day)
print('.')
print(s.dt.hour)
print('.')
print(s.dt.second)

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]
.
0    1
1    2
2    3
3    4
dtype: int32
.
0    9
1    9
2    9
3    9
dtype: int32
.
0    12
1    12
2    12
3    12
dtype: int32


## smallest / largest values

In [106]:
s = pd.Series(np.random.permutation(10))
s

0    1
1    2
2    5
3    0
4    7
5    3
6    6
7    4
8    9
9    8
dtype: int32

In [107]:
print(s.sort_values())
print('.')
print(s.nsmallest(3))
print('.')
print(s.nlargest(3))

3    0
0    1
1    2
5    3
7    4
2    5
6    6
4    7
9    8
8    9
dtype: int32
.
3    0
0    1
1    2
dtype: int32
.
8    9
9    8
4    7
dtype: int32


In [108]:
df = pd.DataFrame(
    {
        "a": [-2, -1, 1, 10, 8, 11, -1],
        "b": list("abdceff"),
        "c": [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0],
    }
)

df

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
2,1,d,4.0
3,10,c,3.2
4,8,e,
5,11,f,3.0
6,-1,f,4.0


In [109]:
df.nlargest(3, "a")

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,


In [110]:
df.nsmallest(3, "a")

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0


In [111]:
df.nlargest(5, ["a", "c"])

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,
2,1,d,4.0
6,-1,f,4.0


In [112]:
df.nsmallest(5, ["a", "c"])

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0
2,1,d,4.0
4,8,e,


# 6. Best way to select data

In [113]:
dates = pd.date_range("2000-01-01", periods=8)
s = pd.Series(np.random.randn(8), index=dates, name="A")
print(s)


2000-01-01    0.453468
2000-01-02   -0.485845
2000-01-03    0.021867
2000-01-04   -1.404052
2000-01-05    1.296674
2000-01-06    2.024306
2000-01-07   -1.558409
2000-01-08    0.814904
Freq: D, Name: A, dtype: float64


In [114]:
s[:5]

2000-01-01    0.453468
2000-01-02   -0.485845
2000-01-03    0.021867
2000-01-04   -1.404052
2000-01-05    1.296674
Freq: D, Name: A, dtype: float64

In [115]:
s[::2]

2000-01-01    0.453468
2000-01-03    0.021867
2000-01-05    1.296674
2000-01-07   -1.558409
Freq: 2D, Name: A, dtype: float64

In [116]:
s[::-1]

2000-01-08    0.814904
2000-01-07   -1.558409
2000-01-06    2.024306
2000-01-05    1.296674
2000-01-04   -1.404052
2000-01-03    0.021867
2000-01-02   -0.485845
2000-01-01    0.453468
Freq: -1D, Name: A, dtype: float64

In [117]:
s2 = s.copy()
s2[:5] = 0
s2

2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06    2.024306
2000-01-07   -1.558409
2000-01-08    0.814904
Freq: D, Name: A, dtype: float64

In [118]:
df[:3]

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
2,1,d,4.0


In [119]:
dates = pd.date_range("2000-01-01", periods=8)

df = pd.DataFrame(
    np.random.randn(8, 4),  # 8 rows, 4 columns
    index=dates,
    columns=list("ABCD")
)

print(df)


                   A         B         C         D
2000-01-01  1.452063 -0.942676 -2.207208  0.217917
2000-01-02  0.406889  0.132702 -1.495531 -0.036559
2000-01-03 -0.162525 -1.012809 -1.635911 -0.753711
2000-01-04 -0.986722 -1.313141 -0.200765 -0.249731
2000-01-05 -1.384928  0.773844 -0.165643 -0.196949
2000-01-06 -1.509055  0.862026 -0.421215 -0.370142
2000-01-07 -0.272076 -1.236316  0.699499  0.171861
2000-01-08  0.960760  1.939923  0.608294 -0.585689


In [120]:
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,1.452063,-0.942676,-2.207208,0.217917
2000-01-02,0.406889,0.132702,-1.495531,-0.036559
2000-01-03,-0.162525,-1.012809,-1.635911,-0.753711


In [121]:
df[::-1]

Unnamed: 0,A,B,C,D
2000-01-08,0.96076,1.939923,0.608294,-0.585689
2000-01-07,-0.272076,-1.236316,0.699499,0.171861
2000-01-06,-1.509055,0.862026,-0.421215,-0.370142
2000-01-05,-1.384928,0.773844,-0.165643,-0.196949
2000-01-04,-0.986722,-1.313141,-0.200765,-0.249731
2000-01-03,-0.162525,-1.012809,-1.635911,-0.753711
2000-01-02,0.406889,0.132702,-1.495531,-0.036559
2000-01-01,1.452063,-0.942676,-2.207208,0.217917


In [122]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'),
                   columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,-0.607224,0.526057,-0.219569,-2.112782
b,-0.384366,0.37736,-0.656182,-0.221889
c,0.842467,1.764635,2.362233,0.951795
d,-0.70475,-0.868925,-1.692997,0.766077
e,-0.465105,0.059438,-0.071518,-0.311735
f,-0.057679,1.042649,-1.175842,0.059158


In [123]:
df1.loc[['a', 'b', 'd'], :]

Unnamed: 0,A,B,C,D
a,-0.607224,0.526057,-0.219569,-2.112782
b,-0.384366,0.37736,-0.656182,-0.221889
d,-0.70475,-0.868925,-1.692997,0.766077


In [124]:
df1.loc['a']

A   -0.607224
B    0.526057
C   -0.219569
D   -2.112782
Name: a, dtype: float64

In [125]:
df1.loc['a'] > 0

A    False
B     True
C    False
D    False
Name: a, dtype: bool

In [126]:
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,B
a,0.526057
b,0.37736
c,1.764635
d,-0.868925
e,0.059438
f,1.042649


In [127]:
mask = pd.array([True, False, True, False, pd.NA, True], dtype="boolean")
mask

<BooleanArray>
[True, False, True, False, <NA>, True]
Length: 6, dtype: boolean

In [128]:
df1[mask]

Unnamed: 0,A,B,C,D
a,-0.607224,0.526057,-0.219569,-2.112782
c,0.842467,1.764635,2.362233,0.951795
f,-0.057679,1.042649,-1.175842,0.059158


In [129]:
s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])
s

0    a
3    b
2    c
5    d
4    e
dtype: object

In [130]:
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [131]:
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [132]:
s.sort_index().loc[1:6]

2    c
3    b
4    e
5    d
dtype: object

In [133]:
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1

0    0.370495
2    0.863155
4   -1.491876
6   -1.316430
8    0.013384
dtype: float64

In [134]:
s1.iloc[:3]

0    0.370495
2    0.863155
4   -1.491876
dtype: float64

In [135]:
s1.iloc[3]

np.float64(-1.3164301950231285)

In [136]:
s1.iloc[:3] = 0
s1

0    0.000000
2    0.000000
4    0.000000
6   -1.316430
8    0.013384
dtype: float64

## With Dataframe

In [137]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list(range(0, 12, 2)),
                   columns=list(range(0, 8, 2)))

df1

Unnamed: 0,0,2,4,6
0,0.178436,-0.735226,0.273352,-0.468548
2,-1.68915,0.56804,1.08737,0.144711
4,-0.53239,2.570344,-0.675914,0.044541
6,-0.922165,-0.831106,0.48465,-0.084256
8,0.021782,1.788185,0.221417,-0.691754
10,0.14105,0.11026,-0.665162,0.488526


In [138]:
df1.iloc[:3]

Unnamed: 0,0,2,4,6
0,0.178436,-0.735226,0.273352,-0.468548
2,-1.68915,0.56804,1.08737,0.144711
4,-0.53239,2.570344,-0.675914,0.044541


In [139]:
# 1:5 → Select rows from index position 1 up to but not including 5 (so rows with positions 1, 2, 3, 4).

# 2:4 → Select columns from index position 2 up to but not including 4 (so columns at positions 2 and 3).

df1.iloc[1:5, 2:4]

Unnamed: 0,4,6
2,1.08737,0.144711
4,-0.675914,0.044541
6,0.48465,-0.084256
8,0.221417,-0.691754


In [140]:
df1.iloc[[1, 3, 5], [1, 3]]

Unnamed: 0,2,6
2,0.56804,0.144711
6,-0.831106,-0.084256
10,0.11026,0.488526


In [141]:
df1.iloc[:, 1:3]

Unnamed: 0,2,4
0,-0.735226,0.273352
2,0.56804,1.08737
4,2.570344,-0.675914
6,-0.831106,0.48465
8,1.788185,0.221417
10,0.11026,-0.665162


In [142]:
df1.iloc[1]

0   -1.689150
2    0.568040
4    1.087370
6    0.144711
Name: 2, dtype: float64

In [143]:
# Same thing is also work in normal python and also numpy
x = list('abcdef')
x

['a', 'b', 'c', 'd', 'e', 'f']

In [144]:
print(x[4:10])
print(x[8:10])

['e', 'f']
[]


In [145]:
s = pd.Series(x)
print(s.iloc[4:10])
print(s.iloc[8:10])

4    e
5    f
dtype: object
Series([], dtype: object)


In [146]:
dfl = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))
dfl

Unnamed: 0,A,B
0,-0.158049,-2.835179
1,0.635775,1.105613
2,2.096,-1.196981
3,2.019922,1.8733
4,-1.961959,-0.545041


In [147]:
dfl.iloc[:, 2:3]

0
1
2
3
4


In [148]:
dfl.iloc[:, 1:3]

Unnamed: 0,B
0,-2.835179
1,1.105613
2,-1.196981
3,1.8733
4,-0.545041


In [149]:
dfl.iloc[4:6]

Unnamed: 0,A,B
4,-1.961959,-0.545041


## Boolean indexing

In [150]:
s = pd.Series(range(-3, 4))
s

0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64

In [151]:
s[s > 0]

4    1
5    2
6    3
dtype: int64

In [152]:
s[(s < -1) | (s > 0.5)]

0   -3
1   -2
4    1
5    2
6    3
dtype: int64

In [153]:
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

## "where()" method

In [154]:
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [155]:
s[s > 0]

3    1
2    2
1    3
0    4
dtype: int64

In [156]:
# where show all data
s.where(s > 0)

4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64

In [157]:
dates = pd.date_range('1/1/2000', periods=8)

df = pd.DataFrame(np.random.randn(8, 4),
                  index=dates, columns=['A', 'B', 'C', 'D'])

df

Unnamed: 0,A,B,C,D
2000-01-01,-1.111184,0.111425,-0.203216,-0.010681
2000-01-02,-1.459308,0.944168,0.133725,-0.92813
2000-01-03,1.841284,0.141059,0.016433,1.520488
2000-01-04,0.051046,0.193758,-0.205155,-1.216477
2000-01-05,0.124263,-0.394562,2.398312,0.603187
2000-01-06,-0.237255,-0.307944,-1.46518,0.22996
2000-01-07,0.879876,-0.976123,-0.008626,-1.791656
2000-01-08,-0.86831,-0.810089,0.400653,-0.269449


In [158]:
df[df < 0]

Unnamed: 0,A,B,C,D
2000-01-01,-1.111184,,-0.203216,-0.010681
2000-01-02,-1.459308,,,-0.92813
2000-01-03,,,,
2000-01-04,,,-0.205155,-1.216477
2000-01-05,,-0.394562,,
2000-01-06,-0.237255,-0.307944,-1.46518,
2000-01-07,,-0.976123,-0.008626,-1.791656
2000-01-08,-0.86831,-0.810089,,-0.269449


In [159]:
df.where(df < 0, other=100)

Unnamed: 0,A,B,C,D
2000-01-01,-1.111184,100.0,-0.203216,-0.010681
2000-01-02,-1.459308,100.0,100.0,-0.92813
2000-01-03,100.0,100.0,100.0,100.0
2000-01-04,100.0,100.0,-0.205155,-1.216477
2000-01-05,100.0,-0.394562,100.0,100.0
2000-01-06,-0.237255,-0.307944,-1.46518,100.0
2000-01-07,100.0,-0.976123,-0.008626,-1.791656
2000-01-08,-0.86831,-0.810089,100.0,-0.269449


In [160]:
df.where(df < 0, -df)

Unnamed: 0,A,B,C,D
2000-01-01,-1.111184,-0.111425,-0.203216,-0.010681
2000-01-02,-1.459308,-0.944168,-0.133725,-0.92813
2000-01-03,-1.841284,-0.141059,-0.016433,-1.520488
2000-01-04,-0.051046,-0.193758,-0.205155,-1.216477
2000-01-05,-0.124263,-0.394562,-2.398312,-0.603187
2000-01-06,-0.237255,-0.307944,-1.46518,-0.22996
2000-01-07,-0.879876,-0.976123,-0.008626,-1.791656
2000-01-08,-0.86831,-0.810089,-0.400653,-0.269449


In [161]:
s2 = s.copy()
s2[s2 < 0] = 0
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [162]:
df2 = df.copy()
df2[df2 < 0] = 0
df2

Unnamed: 0,A,B,C,D
2000-01-01,0.0,0.111425,0.0,0.0
2000-01-02,0.0,0.944168,0.133725,0.0
2000-01-03,1.841284,0.141059,0.016433,1.520488
2000-01-04,0.051046,0.193758,0.0,0.0
2000-01-05,0.124263,0.0,2.398312,0.603187
2000-01-06,0.0,0.0,0.0,0.22996
2000-01-07,0.879876,0.0,0.0,0.0
2000-01-08,0.0,0.0,0.400653,0.0


In [163]:
df2 = df.copy()
df2[df2[1:4] > 0] = 3
df2

Unnamed: 0,A,B,C,D
2000-01-01,-1.111184,0.111425,-0.203216,-0.010681
2000-01-02,-1.459308,3.0,3.0,-0.92813
2000-01-03,3.0,3.0,3.0,3.0
2000-01-04,3.0,3.0,-0.205155,-1.216477
2000-01-05,0.124263,-0.394562,2.398312,0.603187
2000-01-06,-0.237255,-0.307944,-1.46518,0.22996
2000-01-07,0.879876,-0.976123,-0.008626,-1.791656
2000-01-08,-0.86831,-0.810089,0.400653,-0.269449


In [164]:
df = pd.DataFrame({
    'col1': ['A', 'B', 'B', 'C'],
    'col2': ['Z', 'Z', 'X', 'Y']
})

# Define conditions
conditions = [
    (df['col2'] == 'Z') & (df['col1'] == 'A'),
    (df['col2'] == 'Z') & (df['col1'] == 'B'),
    (df['col1'] == 'B')
]

# Define choices
choices = ['yellow', 'blue', 'purple']

# Apply conditions
df['color'] = np.select(conditions, choices, default='black')

print(df)


  col1 col2   color
0    A    Z  yellow
1    B    Z    blue
2    B    X  purple
3    C    Y   black


## Query() method

In [165]:
n = 20
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
print(df)
df[(df['a'] < df['b']) & (df['b'] < df['c'])]

           a         b         c
0   0.001535  0.599048  0.359289
1   0.223811  0.526910  0.024859
2   0.143609  0.214179  0.681243
3   0.126323  0.291977  0.029326
4   0.841549  0.805362  0.752678
5   0.402579  0.687470  0.675145
6   0.534265  0.615124  0.510543
7   0.338336  0.405713  0.038000
8   0.927626  0.177379  0.110387
9   0.145881  0.886670  0.459744
10  0.248308  0.894628  0.725790
11  0.622451  0.565187  0.505934
12  0.697030  0.907298  0.443837
13  0.484884  0.516475  0.499317
14  0.434971  0.637858  0.628640
15  0.835484  0.428290  0.044093
16  0.704006  0.719076  0.149676
17  0.214076  0.412534  0.067388
18  0.793348  0.069989  0.816133
19  0.104083  0.828937  0.077961


Unnamed: 0,a,b,c
2,0.143609,0.214179,0.681243


In [166]:
df.query('(a < b) & (b < c)')

Unnamed: 0,a,b,c
2,0.143609,0.214179,0.681243


In [167]:
df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
df.index.name = 'a'
df
print(df.query('a < b and b < c'))
df.query('index < b < c')


   b  c
a      
0  3  5
3  5  9


Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3,5
3,5,9


In [168]:
df = pd.DataFrame({'a': np.random.randint(5, size=5)})
df.index.name = 'b'
df.query('a > 2')

Unnamed: 0_level_0,a
b,Unnamed: 1_level_1
2,3
4,4


In [169]:
df.query('index > 2')

Unnamed: 0_level_0,a
b,Unnamed: 1_level_1
3,1
4,4


In [170]:
df.query('index == 2')

Unnamed: 0_level_0,a
b,Unnamed: 1_level_1
2,3


In [171]:
pt = pd.DataFrame({'a': np.random.randint(5, size=5)})
pt.index.names = [None]
pt

Unnamed: 0,a
0,3
1,0
2,1
3,0
4,3


## In and Not in Operator

In [172]:
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})
df

Unnamed: 0,a,b,c,d
0,a,a,3,8
1,a,a,1,7
2,b,a,3,3
3,b,a,2,0
4,c,b,1,6
5,c,b,3,7
6,d,b,2,2
7,d,b,4,0
8,e,c,4,0
9,e,c,4,2


In [173]:
df.query('a in b')

Unnamed: 0,a,b,c,d
0,a,a,3,8
1,a,a,1,7
2,b,a,3,3
3,b,a,2,0
4,c,b,1,6
5,c,b,3,7


In [174]:
df.query('a not in b')

Unnamed: 0,a,b,c,d
6,d,b,2,2
7,d,b,4,0
8,e,c,4,0
9,e,c,4,2
10,f,c,1,3
11,f,c,3,6


In [175]:
df.query('a in b and c < d')

Unnamed: 0,a,b,c,d
0,a,a,3,8
1,a,a,1,7
4,c,b,1,6
5,c,b,3,7


## '==' Operator

In [176]:
df.query('b == ["a", "b", "c"]')

Unnamed: 0,a,b,c,d
0,a,a,3,8
1,a,a,1,7
2,b,a,3,3
3,b,a,2,0
4,c,b,1,6
5,c,b,3,7
6,d,b,2,2
7,d,b,4,0
8,e,c,4,0
9,e,c,4,2


In [177]:
df[df['b'].isin(["a", "b", "c"])]

Unnamed: 0,a,b,c,d
0,a,a,3,8
1,a,a,1,7
2,b,a,3,3
3,b,a,2,0
4,c,b,1,6
5,c,b,3,7
6,d,b,2,2
7,d,b,4,0
8,e,c,4,0
9,e,c,4,2


In [178]:
df.query('c != [1, 2]')

Unnamed: 0,a,b,c,d
0,a,a,3,8
2,b,a,3,3
5,c,b,3,7
7,d,b,4,0
8,e,c,4,0
9,e,c,4,2
11,f,c,3,6


## Duplicate data

In [179]:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
                    'c': np.random.randn(7)})
df2

Unnamed: 0,a,b,c
0,one,x,-0.956854
1,one,y,-0.583803
2,two,x,-1.393652
3,two,y,2.597319
4,two,x,0.11402
5,three,x,0.719719
6,four,x,0.246763


In [180]:
df2.duplicated('a')

0    False
1     True
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [181]:
df2.duplicated('a', keep='last')

0     True
1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool

In [182]:
df2.duplicated('a', keep=False)

0     True
1     True
2     True
3     True
4     True
5    False
6    False
dtype: bool

In [183]:
df2.drop_duplicates('a')

Unnamed: 0,a,b,c
0,one,x,-0.956854
2,two,x,-1.393652
5,three,x,0.719719
6,four,x,0.246763


In [184]:
df2.drop_duplicates('a', keep='last')

Unnamed: 0,a,b,c
1,one,y,-0.583803
4,two,x,0.11402
5,three,x,0.719719
6,four,x,0.246763


In [185]:
df2.drop_duplicates('a', keep=False)

Unnamed: 0,a,b,c
5,three,x,0.719719
6,four,x,0.246763


In [186]:
df2.duplicated(['a', 'b'])

0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [187]:
df2.drop_duplicates(['a', 'b'])

Unnamed: 0,a,b,c
0,one,x,-0.956854
1,one,y,-0.583803
2,two,x,-1.393652
3,two,y,2.597319
5,three,x,0.719719
6,four,x,0.246763


## get() method

In [188]:
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s

a    1
b    2
c    3
dtype: int64

In [189]:
print(s.get('a'))

1


In [190]:
print(s.get('x', default=-1))

-1


## Index() Object

In [191]:
index = pd.Index(['e', 'd', 'a', 'b'])
index

Index(['e', 'd', 'a', 'b'], dtype='object')

In [192]:
'd' in index

True

In [193]:
index = pd.Index([1, 5, 12])
index

Index([1, 5, 12], dtype='int64')

In [194]:
5 in index

True

In [195]:
index = pd.Index(['e', 'd', 'a', 'b'], dtype="string")
index

Index(['e', 'd', 'a', 'b'], dtype='string')

In [196]:
index = pd.Index([1, 5, 12], dtype="int8")
index

Index([1, 5, 12], dtype='int8')

In [197]:
index = pd.Index([1, 5, 12], dtype="float32")
index

Index([1.0, 5.0, 12.0], dtype='float32')

In [198]:
index = pd.Index(['e', 'd', 'a', 'b'], name='something')
index

Index(['e', 'd', 'a', 'b'], dtype='object', name='something')

In [199]:
index = pd.Index(list(range(5)), name='rows')
index

Index([0, 1, 2, 3, 4], dtype='int64', name='rows')

In [200]:
columns = pd.Index(['A', 'B', 'C'], name='cols')
columns

Index(['A', 'B', 'C'], dtype='object', name='cols')

In [201]:
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)
df

cols,A,B,C
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-0.664389,0.511811,-0.313597
1,0.256076,0.27454,-0.295792
2,0.563026,0.39439,-0.58956
3,1.359471,-0.839611,1.377868
4,1.197912,0.056227,0.093883


In [202]:
df['A']

rows
0   -0.664389
1    0.256076
2    0.563026
3    1.359471
4    1.197912
Name: A, dtype: float64