# Pandas Introduction (Series and DataFrame)

In [9]:
# Install Python packages from a notebook cell (uncomment the line below to run).
# Prefer the %pip magic over `! pip` so the install targets the running kernel's environment.
# %pip install pandas

In [1]:
%%timeit
import numpy as np
np.arange(900).reshape(9, 100)

The slowest run took 8.78 times longer than the fastest. This could mean that an intermediate result is being cached.
7.59 μs ± 8.41 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [2]:
import pandas as pd
import numpy as np

# Series

In [3]:
# Two small inputs for the Series examples: a plain Python list (1..5)
# and a NumPy integer array (0..4).
l = list(range(1, 6))
arr = np.arange(0, 5)

In [4]:
# Wrap the list in a Series; with no index given, the rows are labelled 0..n-1.
s1 = pd.Series(data=l)
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
# One single-character label per row, taken from the string.
index = [letter for letter in 'abcde']
index

['a', 'b', 'c', 'd', 'e']

In [6]:
# Build a named Series from the array, labelling its rows with the custom index.
s2 = pd.Series(
    data=arr,
    index=index,
    name='demo',
)
s2

a    0
b    1
c    2
d    3
e    4
Name: demo, dtype: int32

In [7]:
# s1 carries the default 0..n-1 integer index, so the key 0 is a label here:
# plain [] and .loc both look it up by label and return the same element.
s1[0], s1.loc[0]

(1, 1)

In [8]:
# Four ways to read the first element of a string-labelled Series.
# Plain `s2[0]` is deliberately not used: with a non-integer index an integer
# key falls back to positional lookup, which pandas has deprecated (it emits a
# FutureWarning). Use [] / .loc for labels and .iloc / .iat for positions.
s2['a'], s2.loc['a'], s2.iloc[0], s2.iat[0]

  s2[0], s2['a'], s2.loc['a'], s2.iloc[0]


(0, 0, 0, 0)

In [11]:
# Element-wise arithmetic: the scalar is added to every value; index and name are kept.
s2.add(1000)

a    1000
b    1001
c    1002
d    1003
e    1004
Name: demo, dtype: int32

In [12]:
# Summary statistics for the Series: count, mean, std, min, quartiles and max.
s2.describe()

count    5.000000
mean     2.000000
std      1.581139
min      0.000000
25%      1.000000
50%      2.000000
75%      3.000000
max      4.000000
Name: demo, dtype: float64

In [13]:
# Print a concise summary: index range, non-null count, dtype and memory usage.
s2.info()

<class 'pandas.core.series.Series'>
Index: 5 entries, a to e
Series name: demo
Non-Null Count  Dtype
--------------  -----
5 non-null      int32
dtypes: int32(1)
memory usage: 232.0+ bytes


# DataFrame

In [14]:
# Calling the constructor with no arguments creates an empty DataFrame.
pd.DataFrame()

In [None]:
# Column-oriented construction: each dict key becomes a column label and
# the list stored under it supplies that column's values.
d = dict(
    age=[10, 12, 11, 16, 9, 10],
    height=[1.5, 1.6, 1.7, 1.8, 1.9, 1.5],
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,age,height
0,10,1.5
1,12,1.6
2,11,1.7
3,16,1.8
4,9,1.9
5,10,1.5


In [21]:
# Column-oriented nested list: the first inner list holds every age,
# the second every height.
l = [
    [10, 12, 11, 16, 9, 10],
    [1.5, 1.6, 1.7, 1.8, 1.9, 1.5]
]
# Transpose the (2, 6) array to (6, 2) so that each row is one (age, height)
# record. A NumPy array has a single dtype, so the integer ages are upcast to
# float and show up as 10.0, 12.0, ... in the resulting frame.
arr = np.array(l).T
df = pd.DataFrame(arr, columns=['age', 'height'])
df

Unnamed: 0,age,height
0,10.0,1.5
1,12.0,1.6
2,11.0,1.7
3,16.0,1.8
4,9.0,1.9
5,10.0,1.5


In [22]:
# Row-oriented construction: every inner list is one [age, height] record,
# so the data can be passed straight to the constructor without transposing.
l = [[10, 1.5], [12, 1.6], [11, 1.7], [16, 1.8], [9, 1.9], [10, 1.5]]

df = pd.DataFrame(data=l, columns=['age', 'height'])
df

Unnamed: 0,age,height
0,10,1.5
1,12,1.6
2,11,1.7
3,16,1.8
4,9,1.9
5,10,1.5


In [25]:
# First frame for the row-wise concatenation example: six (age, height) rows.
d = dict(
    age=[10, 12, 11, 16, 9, 10],
    height=[1.5, 1.6, 1.7, 1.8, 1.9, 1.5],
)
df1 = pd.DataFrame(data=d)
df1

Unnamed: 0,age,height
0,10,1.5
1,12,1.6
2,11,1.7
3,16,1.8
4,9,1.9
5,10,1.5


In [24]:
# 5 evenly spaced values from 0 to 10, with both endpoints included.
np.linspace(0, 10, 5)

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

In [26]:
# Second frame for the concatenation example, built from NumPy arrays:
# ten consecutive integer ages and ten evenly spaced heights from 4 to 5.
d = dict(
    age=np.arange(50, 60),
    height=np.linspace(4, 5, 10),
)
df2 = pd.DataFrame(data=d)
df2

Unnamed: 0,age,height
0,50,4.0
1,51,4.111111
2,52,4.222222
3,53,4.333333
4,54,4.444444
5,55,4.555556
6,56,4.666667
7,57,4.777778
8,58,4.888889
9,59,5.0


In [28]:
# Stack df2's rows beneath df1's (axis=0). ignore_index=True renumbers the
# combined rows from 0 instead of keeping each frame's own row labels.
pd.concat([df1, df2], axis=0, ignore_index=True)

Unnamed: 0,age,height
0,10,1.5
1,12,1.6
2,11,1.7
3,16,1.8
4,9,1.9
5,10,1.5
6,50,4.0
7,51,4.111111
8,52,4.222222
9,53,4.333333


In [29]:
# Seeded generator so the random column is identical on every
# Restart Kernel -> Run All (the unseeded global RNG changed it on each run).
rng = np.random.default_rng(seed=42)

d1 = {
    'a': np.arange(5),
    'b': rng.random(5)  # 5 uniform floats in [0, 1)
}
d2 = {
    'c': np.linspace(10, 19, 5),      # 5 evenly spaced values, 10 to 19 inclusive
    'd': np.linspace(100, 1000, 5)    # 5 evenly spaced values, 100 to 1000 inclusive
}
d1, d2

({'a': array([0, 1, 2, 3, 4]),
  'b': array([0.7006708 , 0.32197063, 0.72427239, 0.49793997, 0.06774942])},
 {'c': array([10.  , 12.25, 14.5 , 16.75, 19.  ]),
  'd': array([ 100.,  325.,  550.,  775., 1000.])})

In [30]:
# Turn each dict of arrays into its own DataFrame (this rebinds df1 and df2).
df1, df2 = pd.DataFrame(data=d1), pd.DataFrame(data=d2)

In [32]:
# Preview only the first 2 rows.
df1.head(2)

Unnamed: 0,a,b
0,0,0.700671
1,1,0.321971


In [33]:
# Preview only the first 3 rows.
df2.head(3)

Unnamed: 0,c,d
0,10.0,100.0
1,12.25,325.0
2,14.5,550.0


In [36]:
# Column labels of each frame, returned as pandas Index objects.
df1.columns, df2.columns

(Index(['a', 'b'], dtype='object'), Index(['c', 'd'], dtype='object'))

In [38]:
# Collect every column label from both frames, first with explicit loops.
# The loop target is called `frame`, not `df`: a for-loop variable stays bound
# after the loop finishes, so using `df` here would silently overwrite the
# notebook-level `df` DataFrame.
cols = []
for frame in [df1, df2]:
    for col in frame.columns:
        cols.append(col)

print(cols)

# The same result as a nested list comprehension; the clauses read in the same
# order as the loops above, and the comprehension's variables do not leak.
columns = [col for frame in [df1, df2] for col in frame.columns]
columns

['a', 'b', 'c', 'd']


['a', 'b', 'c', 'd']

In [45]:
# Place the two frames side by side. ignore_index=True discards the original
# column labels and numbers the resulting columns 0..n-1.
df_cat = pd.concat(objs=[df1, df2], axis='columns', ignore_index=True)
df_cat

Unnamed: 0,0,1,2,3
0,0,0.700671,10.0,100.0
1,1,0.321971,12.25,325.0
2,2,0.724272,14.5,550.0
3,3,0.49794,16.75,775.0
4,4,0.067749,19.0,1000.0


In [46]:
# The concat above used ignore_index=True, so the column labels are now a RangeIndex.
df_cat.columns

RangeIndex(start=0, stop=4, step=1)

In [47]:
# rename() returns a new frame with column 0 relabelled 'age'.
# The result is only displayed, not assigned, so df_cat itself is unchanged.
df_cat.rename(columns={0:'age'})

Unnamed: 0,age,1,2,3
0,0,0.700671,10.0,100.0
1,1,0.321971,12.25,325.0
2,2,0.724272,14.5,550.0
3,3,0.49794,16.75,775.0
4,4,0.067749,19.0,1000.0


In [50]:
# Persist the new column labels by rebinding the frame returned by rename().
# rename(..., inplace=True) would give the same labels, but reassignment is
# preferred: it keeps the call chainable and makes the state change explicit.
df_cat = df_cat.rename(columns={0: 'age', 1: 'height', 2: 'grade', 3: 'roll num'})

In [53]:
# Swap the space for an underscore in the last column label.
df_cat = df_cat.rename({"roll num": "roll_num"}, axis="columns")
df_cat

Unnamed: 0,age,height,grade,roll_num
0,0,0.700671,10.0,100.0
1,1,0.321971,12.25,325.0
2,2,0.724272,14.5,550.0
3,3,0.49794,16.75,775.0
4,4,0.067749,19.0,1000.0


In [57]:
# Single-column frame containing two missing values (NaN), used by the
# fill examples that follow. Transposing a 1-D array returns it unchanged,
# so the former `.T` was a no-op and has been removed.
df = pd.DataFrame(
    np.array([1, 2, 4, np.nan, 100, np.nan, 7, 8])
)
df

Unnamed: 0,0
0,1.0
1,2.0
2,4.0
3,
4,100.0
5,
6,7.0
7,8.0


In [58]:
# Backward fill: each NaN takes the next valid value below it.
# bfill() replaces the deprecated Series.backfill() alias.
df[0].bfill()

  df[0].backfill()


0      1.0
1      2.0
2      4.0
3    100.0
4    100.0
5      7.0
6      7.0
7      8.0
Name: 0, dtype: float64

In [59]:
# Forward fill: each NaN takes the last valid value above it.
df[0].ffill()

0      1.0
1      2.0
2      4.0
3      4.0
4    100.0
5    100.0
6      7.0
7      8.0
Name: 0, dtype: float64

In [60]:
# Re-display the concatenated frame with its renamed columns.
df_cat

Unnamed: 0,age,height,grade,roll_num
0,0,0.700671,10.0,100.0
1,1,0.321971,12.25,325.0
2,2,0.724272,14.5,550.0
3,3,0.49794,16.75,775.0
4,4,0.067749,19.0,1000.0


In [68]:
# Selecting a single column with a string key returns a Series.
df_cat['height']

0    0.700671
1    0.321971
2    0.724272
3    0.497940
4    0.067749
Name: height, dtype: float64

In [70]:
# A list of labels selects several columns, in the order given, and returns a DataFrame.
df_cat[['height', 'age', 'grade']]

Unnamed: 0,height,age,grade
0,0.700671,0,10.0
1,0.321971,1,12.25
2,0.724272,2,14.5
3,0.49794,3,16.75
4,0.067749,4,19.0


In [64]:
# Attribute-style access returns the same Series as df_cat['height'].
# It only works when the label is a valid Python identifier that does not
# clash with an existing DataFrame attribute or method; prefer [] in general.
df_cat.height

0    0.700671
1    0.321971
2    0.724272
3    0.497940
4    0.067749
Name: height, dtype: float64