In [1]:
import pandas as pd
import numpy as np

### Pandas Series
##### A Series is a one-dimensional labeled array capable of holding any data type. The axis labels are collectively called the index.

In [2]:
# Sample inputs for the Series examples: index labels, a plain list,
# a NumPy array and a dict, all carrying the same three values.
labels = list("abc")
my_list = [10, 20, 30]
arr = np.array(my_list)
d = dict(zip(labels, my_list))

In [3]:
# Series from a plain list; with no index given, pandas numbers the rows 0..n-1.
pd.Series(data=my_list)

0    10
1    20
2    30
dtype: int64

In [4]:
# Same values, but labelled with the custom index instead of 0..n-1.
pd.Series(data=my_list, index=labels)

a    10
b    20
c    30
dtype: int64

In [5]:
# A NumPy array is accepted the same way a list is.
pd.Series(data=arr)

0    10
1    20
2    30
dtype: int64

In [6]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

### Creating a DataFrame

In [7]:
# Column-oriented records: each key becomes a column, each list its values.
data = dict(
    Name=['John', 'Anna', 'Peter', 'Linda'],
    Age=[28, 34, 29, 42],
    City=['New York', 'Paris', 'Berlin', 'London'],
    Salary=[65000, 70000, 62000, 85000],
)
df = pd.DataFrame(data=data)

In [8]:
# Render the DataFrame built from the dict of column lists.
df

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [9]:
# Row-oriented records: one inner list per person (name, age, city, salary).
data_list = [
    ['John',  28, 'New York', 65000],
    ['Anna',  34, 'Paris',    70000],
    ['Peter', 29, 'Berlin',   62000],
    ['Linda', 42, 'London',   85000],
]
# No column names are supplied here.
df2 = pd.DataFrame(data=data_list)

In [10]:
# Built without a `columns` argument, so the columns carry the default labels 0..3.
df2

Unnamed: 0,0,1,2,3
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [11]:
# Rebuild df2 from the same rows, this time with explicit column names.
columns = ["Name", "Age", "City", "Salary"]
df2 = pd.DataFrame(data=data_list, columns=columns)
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


### Selection and Indexing of Columns

In [12]:
# Starting point for the column-selection examples.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [13]:
# Selecting a single column label returns a Series.
df2.loc[:, "Name"]

0     John
1     Anna
2    Peter
3    Linda
Name: Name, dtype: object

In [14]:
# Selecting a list of column labels returns a DataFrame with just those columns.
df2.loc[:, ["Name", "Age"]]

Unnamed: 0,Name,Age
0,John,28
1,Anna,34
2,Peter,29
3,Linda,42


### Creating a new column

In [15]:
# One value per existing row; assigning to a new label adds the column to df2.
designations = ["Doctor", "Eng.", "Doctor", "Eng."]
df2["Designation"] = designations
df2

Unnamed: 0,Name,Age,City,Salary,Designation
0,John,28,New York,65000,Doctor
1,Anna,34,Paris,70000,Eng.
2,Peter,29,Berlin,62000,Doctor
3,Linda,42,London,85000,Eng.


### Removing Columns

In [16]:
# df2 now includes the Designation column; starting point for the drop examples.
df2

Unnamed: 0,Name,Age,City,Salary,Designation
0,John,28,New York,65000,Doctor
1,Anna,34,Paris,70000,Eng.
2,Peter,29,Berlin,62000,Doctor
3,Linda,42,London,85000,Eng.


In [17]:
# Returns a new frame without Designation; df2 itself is left untouched.
df2.drop(columns="Designation")

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [19]:
# Several columns can be dropped at once by passing a list of labels.
df2.drop(columns=["Designation", "Age"])

Unnamed: 0,Name,City,Salary
0,John,New York,65000
1,Anna,Paris,70000
2,Peter,Berlin,62000
3,Linda,London,85000


In [20]:
# The drop() calls above returned new frames; df2 itself still has every column.
df2

Unnamed: 0,Name,Age,City,Salary,Designation
0,John,28,New York,65000,Doctor
1,Anna,34,Paris,70000,Eng.
2,Peter,29,Berlin,62000,Doctor
3,Linda,42,London,85000,Eng.


In [21]:
# drop() returns a new frame, so assign the result back to make the removal
# stick. Reassignment is preferred over inplace=True: it keeps the step
# chainable and makes the change to df2 visible in the code.
df2 = df2.drop(columns="Designation")

In [22]:
# Designation is no longer present: the previous cell changed df2 itself.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


### Removing rows

In [23]:
# Starting point for the row-removal examples.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [24]:
# Drop the row whose index label is 0; the result is a new frame.
df2.drop(index=0)

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [25]:
# Drop several rows at once by index label.
df2.drop(index=[2, 1])

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
3,Linda,42,London,85000


In [26]:
# The row drops above were not assigned back, so df2 still has all four rows.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


### Selecting Rows

In [27]:
# Starting point for the row-selection examples.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [28]:
# Label-based row selection: rows labelled 0 and 1, all columns.
df2.loc[[0, 1], :]

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000


In [29]:
# Position-based row selection: iloc[3] returns the fourth row as a Series.
# df2 is the frame this section works with; `df` from the dict example only
# happens to hold the same values.
df2.iloc[3]

Name       Linda
Age           42
City      London
Salary     85000
Name: 3, dtype: object

### Selecting Subsets of Rows and Columns

In [30]:
# Starting point for selecting a subset of rows and columns.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [31]:
# Pick rows and columns in a single .loc[rows, columns] call instead of
# chaining two indexing operations, and read from df2, the frame this section
# works with.
df2.loc[[0, 1], ["City", "Salary"]]

Unnamed: 0,City,Salary
0,New York,65000
1,Paris,70000


### Conditional Selection

In [32]:
# Starting point for the boolean-mask filtering examples.
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [35]:
# I only want to see those people whose age is above 30

In [36]:
# Boolean mask: True for the rows whose Age exceeds 30.
age_over_30 = df2["Age"] > 30
# Indexing with the mask keeps only the rows marked True.
df2[age_over_30]

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000
3,Linda,42,London,85000


In [37]:
# I only want people whose age is above 30 and whose city is Paris

In [38]:
# Build one mask per condition, then combine them element-wise with `&`.
age_over_30 = df2["Age"] > 30
in_paris = df2["City"] == 'Paris'
df2[age_over_30 & in_paris]

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000


### Finding Missing Data

In [40]:
# Small frame with NaN placed in columns A, C and D; column B is complete.
# Note: this rebinds `data`, which earlier held the people records.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[1, 2, 3, 4, 5],
    C=[1, 2, 3, np.nan, np.nan],
    D=[1, np.nan, np.nan, np.nan, 5],
)
df3 = pd.DataFrame(data=data)

In [41]:
# Missing entries are NaN (shown blank in this rendering); columns A, C and D
# display as floats while the complete column B stays integer.
df3

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [44]:
# Same-shaped frame of booleans: True wherever a value is missing.
pd.isna(df3)

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [45]:
# Summing the boolean mask down each column counts the missing values per column.
df3.isna().sum(axis=0)

A    1
B    0
C    2
D    3
dtype: int64

In [46]:
# True for each column that has at least one missing value.
df3.isna().any(axis=0)

A     True
B    False
C     True
D     True
dtype: bool

### Removing Missing Data

In [47]:
# Defaults spelled out: drop every row (axis=0) that has any missing value.
df3.dropna(axis=0, how="any")

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


In [48]:
# dropna() returned a new frame; df3 itself is unchanged.
df3

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [51]:
# thresh=1 keeps the rows that have at least one non-missing value. Column B
# has no NaN, so every row qualifies and nothing is dropped here.
df3.dropna(axis=0, thresh=1)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


### Filling the missing Data

In [52]:
# Starting point for the fill examples; df3 still contains its NaN values.
df3

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [53]:
# Replace every missing value with 0; the result is a new frame.
df3.fillna(value=0)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,0.0
2,0.0,3,3.0,0.0
3,4.0,4,0.0,0.0
4,5.0,5,0.0,5.0
