# Pandas Library

In [45]:
import numpy as np

In [1]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd

In [2]:
print(pd.__version__)

2.2.2


# Create Pandas Series

In [61]:
arr=[1,2,3,4,5]
s1=pd.Series(arr)

In [62]:
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

# Dictionary into Pandas

In [63]:
dict1={'a':4,'b':5,'c':8}

In [66]:
s2=pd.Series(dict1)

In [67]:
s2

a    4
b    5
c    8
dtype: int64

# Creating Pandas DataFrame (always use second bracket)

In [68]:
data={'name':['Asif','Parvez','Naharul','Asad','Rakib','Asifa'],
      'Age':[25,40,34,20,75,28],
      'Sex':['male','male','male','male','male','female'],
      }

In [3]:
data

{'name': ['Asif', 'Parvez', 'Naharul', 'Asad', 'Rakib', 'Asifa'],
 'Age': [25, 40, 34, 20, 75, 28],
 'Sex': ['male', 'male', 'male', 'male', 'male', 'female']}

# Convert DataFrame into Dictionary

In [69]:
df=pd.DataFrame(data)

In [70]:
df

Unnamed: 0,name,Age,Sex
0,Asif,25,male
1,Parvez,40,male
2,Naharul,34,male
3,Asad,20,male
4,Rakib,75,male
5,Asifa,28,female


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    6 non-null      object
 1   Age     6 non-null      int64 
 2   Sex     6 non-null      object
dtypes: int64(1), object(2)
memory usage: 276.0+ bytes


In [13]:
df.isnull()

Unnamed: 0,name,Age,Sex
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False


In [14]:
df.isnull().sum()

name    0
Age     0
Sex     0
dtype: int64

In [15]:
df.describe()

Unnamed: 0,Age
count,6.0
mean,37.0
std,19.879638
min,20.0
25%,25.75
50%,31.0
75%,38.5
max,75.0


# 66.67% Data lies on 1 Standard Deviation
# 88.7%% Data lies on 2 Standard Deviation
# 99.98% Data lies on 3 Standard Deviation

In [71]:
df['Age'].mean()

np.float64(37.0)

In [72]:
abs(df['Age']-37)

0    12
1     3
2     3
3    17
4    38
5     9
Name: Age, dtype: int64

In [19]:
abs(df['Age']-30)

0     5
1    10
2     4
3    10
4    45
5     2
Name: Age, dtype: int64

In [73]:
df['Age'].min()

np.int64(20)

In [74]:
df['Age'].max()

np.int64(75)

In [75]:
df['Cum_Age']=df['Age'].cumsum()

In [76]:
df

Unnamed: 0,name,Age,Sex,Cum_Age
0,Asif,25,male,25
1,Parvez,40,male,65
2,Naharul,34,male,99
3,Asad,20,male,119
4,Rakib,75,male,194
5,Asifa,28,female,222


In [77]:
df.head(3)

Unnamed: 0,name,Age,Sex,Cum_Age
0,Asif,25,male,25
1,Parvez,40,male,65
2,Naharul,34,male,99


In [78]:
df.tail(10)

Unnamed: 0,name,Age,Sex,Cum_Age
0,Asif,25,male,25
1,Parvez,40,male,65
2,Naharul,34,male,99
3,Asad,20,male,119
4,Rakib,75,male,194
5,Asifa,28,female,222


In [79]:
df['Age']>=30

0    False
1     True
2     True
3    False
4     True
5    False
Name: Age, dtype: bool

In [82]:
df1=[df['Age']>=30]

In [14]:
df['Sex'].value_counts()

Sex
male      5
female    1
Name: count, dtype: int64

In [29]:
df[df['Sex']=='male']

Unnamed: 0,name,Age,Sex
0,Asif,25,male
1,Parvez,40,male
2,Naharul,34,male
3,Asad,20,male
4,Rakib,75,male


In [30]:
df1=df['Age']

In [20]:
df1

0    25
1    40
2    34
3    20
4    75
5    28
Name: Age, dtype: int64

In [96]:
df1=df[['Age'], ['Sex']].corr()


InvalidIndexError: (['Age'], ['Sex'])

# We should exclude highly correlated values to avoid overfitting

In [98]:
df1=df['Dup_Age']=df['Age']

In [99]:
df1

0    25
1    40
2    34
3    20
4    75
5    28
Name: Age, dtype: int64

In [102]:
df['Dup_age'] = df['Age']


In [103]:
df

Unnamed: 0,name,Age,Sex,Cum_Age,Dup_Age,Dup_age
0,Asif,25,male,25,25,25
1,Parvez,40,male,65,40,40
2,Naharul,34,male,99,34,34
3,Asad,20,male,119,20,20
4,Rakib,75,male,194,75,75
5,Asifa,28,female,222,28,28


In [104]:
df1 = df[['Age', 'Dup_age']].corr()


In [105]:
df1

Unnamed: 0,Age,Dup_age
Age,1.0,1.0
Dup_age,1.0,1.0


# We use Median when values are Categorical and there is Outlier in the Dataset

In [86]:
df['Sex'].mode()

0    male
Name: Sex, dtype: object

In [87]:
df['Age'].mode()

0    20
1    25
2    28
3    34
4    40
5    75
Name: Age, dtype: int64

# When there is no Outlier in that case Mean gives us better result

In [106]:
df_sorted = df.sort_values(by='Age')


In [107]:
print(df_sorted)


      name  Age     Sex  Cum_Age  Dup_Age  Dup_age
3     Asad   20    male      119       20       20
0     Asif   25    male       25       25       25
5    Asifa   28  female      222       28       28
2  Naharul   34    male       99       34       34
1   Parvez   40    male       65       40       40
4    Rakib   75    male      194       75       75


In [108]:
df1.sort_values(by='Age',ascending=False)

Unnamed: 0,Age,Dup_age
Age,1.0,1.0
Dup_age,1.0,1.0


In [109]:
df1

Unnamed: 0,Age,Dup_age
Age,1.0,1.0
Dup_age,1.0,1.0


# Sorting

In [110]:
df1.sort_values(by='Age',ascending=False,inplace=True)

In [111]:
df1

Unnamed: 0,Age,Dup_age
Age,1.0,1.0
Dup_age,1.0,1.0


In [112]:
df1.reset_index()

Unnamed: 0,index,Age,Dup_age
0,Age,1.0,1.0
1,Dup_age,1.0,1.0


In [113]:
df1.reset_index(inplace=True,drop=True)

In [114]:
df1

Unnamed: 0,Age,Dup_age
0,1.0,1.0
1,1.0,1.0


In [115]:
data={'name':['Asif','Parvez','Naharul','Asad','Rakib','Asifa'],
      'Age':[25,40,34,20,75,28],
      'Sex':['male','male','male','male','male','female'],
      }

In [116]:
df2=pd.DataFrame(data)

In [117]:
df2

Unnamed: 0,name,Age,Sex
0,Asif,25,male
1,Parvez,40,male
2,Naharul,34,male
3,Asad,20,male
4,Rakib,75,male
5,Asifa,28,female


In [118]:
df2.sort_values(by='Age',ascending=False)

Unnamed: 0,name,Age,Sex
4,Rakib,75,male
1,Parvez,40,male
2,Naharul,34,male
5,Asifa,28,female
0,Asif,25,male
3,Asad,20,male


In [119]:
df2.reset_index(inplace=True,drop=True)

In [120]:
df2

Unnamed: 0,name,Age,Sex
0,Asif,25,male
1,Parvez,40,male
2,Naharul,34,male
3,Asad,20,male
4,Rakib,75,male
5,Asifa,28,female


In [60]:
df1.drop('index',axis=1)

KeyError: "['index'] not found in axis"

In [121]:
data={'name':['Asif','Parvez','Naharul',np.nan,'Asad','Rakib','Asifa'],
      'Age':[25,40,34,20,75,np.nan,28],
      'Sex':['male','male','male','female','male','male','female'],
      }

In [122]:
df3=pd.DataFrame(data)

In [123]:
df3

Unnamed: 0,name,Age,Sex
0,Asif,25.0,male
1,Parvez,40.0,male
2,Naharul,34.0,male
3,,20.0,female
4,Asad,75.0,male
5,Rakib,,male
6,Asifa,28.0,female


In [58]:
df3.isnull().sum()

name    1
Age     1
Sex     0
dtype: int64

In [59]:
df3.dropna()

Unnamed: 0,name,Age,Sex
0,Asif,25.0,male
1,Parvez,40.0,male
2,Naharul,34.0,male
4,Asad,75.0,male
6,Asifa,28.0,female


# Null Value with Mean
# Aggregation Function
# Group By
# Merge Function
# Concatenate
# Pivot Table