A pandas DataFrame is a two-dimensional, size-mutable, tabular data structure with labeled axes (rows and columns).

A DataFrame consists of three principal components: the data, the rows, and the columns.

# Creating a Pandas DataFrame

In [1]:
# Bring in pandas under its conventional alias.
import pandas as pd

# Plain list of strings; each entry becomes one row of the frame.
lists = [
    "football",
    "cricket",
    "tennis",
    "hand-ball",
]

# A flat list produces a single column (labelled 0) with a default
# 0..n-1 row index.
df = pd.DataFrame(data=lists)

print(df)

           0
0   football
1    cricket
2     tennis
3  hand-ball


# Creating a DataFrame from a dict of ndarrays/lists

In [2]:
import pandas as pd

# Column-oriented input: every keyword is a column label and every list
# holds that column's values (all lists have the same length).
data = dict(
    Name=["Tom", "Nick", "Hassy", "krish"],
    Value=[12, 34, 56, 67],
)

# Build the frame from the mapping and show its text representation.
df = pd.DataFrame(data)
print(df)

    Name  Value
0    Tom     12
1   Nick     34
2  Hassy     56
3  krish     67


# Dealing with Rows and Columns

In [3]:
import pandas as pd

# Four columns of equal length, keyed by column label.
data = dict(
    Name=["Tom", "Nick", "Hassy", "krish"],
    Value=[12, 34, 56, 67],
    Group=["H", "T", "V", "M"],
    Id=["f", "g", "h", "y"],
)
df = pd.DataFrame(data)

# Indexing with a list of labels returns a DataFrame restricted to those
# columns, in the order given.
print(df[["Name", "Id"]])

    Name Id
0    Tom  f
1   Nick  g
2  Hassy  h
3  krish  y


# Row selection
The `DataFrame.loc[]` (label-based) and `DataFrame.iloc[]` (integer-position-based) indexers are used to retrieve rows from a pandas DataFrame.

In [4]:
# pandas provides read_csv and the label-based .loc indexer used below.
import pandas as pd

# Load the CSV and use its "Name" column as the row index, so rows can
# be looked up by name.
data = pd.read_csv(
    "/content/nba.csv",
    index_col="Name",
)

# .loc selects rows by index label; each lookup below is printed as a
# Series whose entries are that row's column values.
first, second = data.loc["Avery Bradley"], data.loc["Raul Neto"]
print(first, "\n\n", second)

Team        Boston Celtics
Number                 0.0
Position                PG
Age                   25.0
Height                 6-2
Weight               180.0
College              Texas
Salary           7730337.0
Name: Avery Bradley, dtype: object 

 Team        Utah Jazz
Number           25.0
Position           PG
Age              24.0
Height            6-1
Weight          179.0
College           NaN
Salary       900000.0
Name: Raul Neto, dtype: object


In [5]:
import pandas as pd

# Same CSV as before, again indexed by the "Name" column.
data = pd.read_csv(
    "/content/nba.csv",
    index_col="Name",
)

# .iloc selects by integer position (0-based), so position 3 is the
# fourth row of the file regardless of its index label.
row2 = data.iloc[3]
print(row2)

Team        Boston Celtics
Number                28.0
Position                SG
Age                   22.0
Height                 6-5
Weight               185.0
College      Georgia State
Salary           1148640.0
Name: R.J. Hunter, dtype: object


# Indexing and Selecting Data

# Selecting a single Column

In [6]:
# pandas is needed for read_csv.
import pandas as pd

# Read the CSV with the "Name" column as the row index.
data = pd.read_csv(
    "/content/nba.csv",
    index_col="Name",
)

# The indexing operator with a single column label returns that column;
# note that `df` therefore holds one column (printed as a Series), not a
# full DataFrame.
df = data["Age"]

print(df)

Name
Avery Bradley    25.0
Jae Crowder      25.0
John Holland     27.0
R.J. Hunter      22.0
Jonas Jerebko    29.0
                 ... 
Shelvin Mack     26.0
Raul Neto        24.0
Tibor Pleiss     26.0
Jeff Withey      26.0
NaN               NaN
Name: Age, Length: 458, dtype: float64


# Working with missing data

In [8]:
# numpy supplies the NaN marker, pandas the DataFrame.
import numpy as np
import pandas as pd

# Three columns of scores, each containing exactly one missing value.
dicts = {
    "First score": [100, 90, np.nan, 95],
    "Second Score": [23, 56, 82, np.nan],
    "Third Sore": [np.nan, 3, 4, 5],
}

# Build the frame from the dictionary of lists.
df = pd.DataFrame(dicts)

# isnull() returns a boolean frame of the same shape: True marks a
# missing value. Left as the last expression so the notebook displays it.
df.isnull()

Unnamed: 0,First score,Second Score,Third Sore
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


# Filling missing values using fillna(), replace() and interpolate()

In [10]:
# numpy supplies the NaN marker, pandas the DataFrame.
import numpy as np
import pandas as pd

# Three columns of scores, each containing exactly one missing value.
dicts = {
    "First score": [100, 90, np.nan, 95],
    "Second Score": [23, 56, 82, np.nan],
    "Third Sore": [np.nan, 3, 4, 5],
}

# Build the frame from the dictionary of lists.
df = pd.DataFrame(dicts)

# fillna(0) returns a new frame with every missing value replaced by 0.
# The result is not assigned, so `df` itself still contains the NaNs.
df.fillna(0)

Unnamed: 0,First score,Second Score,Third Sore
0,100.0,23.0,0.0
1,90.0,56.0,3.0
2,0.0,82.0,4.0
3,95.0,0.0,5.0


In [12]:
# numpy supplies the NaN marker, pandas the DataFrame.
import numpy as np
import pandas as pd

# Four score columns; "Third Score" is complete, the others contain
# missing values.
dicts = {
    "First Score": [100, 90, np.nan, 95],
    "Second Score": [30, np.nan, 45, 56],
    "Third Score": [52, 40, 80, 98],
    "Fourth Score": [np.nan, np.nan, np.nan, 65],
}

# Build the frame from the dictionary of lists.
df = pd.DataFrame(dicts)

# Last expression of the cell: the notebook displays the frame.
df

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,65.0


Now we drop every row that contains at least one NaN (null) value.

In [14]:
# importing pandas as pd
# (this cell builds a DataFrame, so it imports pandas itself instead of
# relying on an earlier cell having been executed)
import pandas as pd

# importing numpy as np
import numpy as np

# dictionary of lists; only the last row has a value in every column
dicts = {'First Score': [100, 90, np.nan, 95],
         'Second Score': [30, np.nan, 45, 56],
         'Third Score': [52, 40, 80, 98],
         'Fourth Score': [np.nan, np.nan, np.nan, 65]}

# creating a dataframe from dictionary
df = pd.DataFrame(dicts)

# dropna() with default arguments returns a new frame without the rows
# that contain at least one missing value; `df` itself is unchanged
df.dropna()

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65.0


# Iterating over rows and columns

In [16]:
# importing pandas as pd
# (this cell builds a DataFrame, so it imports pandas itself instead of
# relying on an earlier cell having been executed)
import pandas as pd

# importing numpy as np
import numpy as np

# dictionary of lists; np.nan marks the missing scores
dicts = {'First Score': [100, 90, np.nan, 95],
         'Second Score': [30, np.nan, 45, 56],
         'Third Score': [52, 40, 80, 98],
         'Fourth Score': [np.nan, np.nan, np.nan, 65]}

# creating a dataframe from dictionary
df = pd.DataFrame(dicts)
print(df)

   First Score  Second Score  Third Score  Fourth Score
0        100.0          30.0           52           NaN
1         90.0           NaN           40           NaN
2          NaN          45.0           80           NaN
3         95.0          56.0           98          65.0


In [18]:
# pandas supplies the DataFrame and its iterrows() method.
import pandas as pd

# Three records stored column-wise: name, score and degree.
dicts = dict(
    name=["masud", "mostafiz", "rahman"],
    score=[75, 80, 93],
    degree=["bs", "ll", "gst"],
)

# Build the frame from the dictionary of lists.
df = pd.DataFrame(dicts)

# iterrows() yields one (index label, row) pair per row, where the row
# is a Series keyed by column name. A blank-line separator is printed
# after each row.
for row_label, row in df.iterrows():
    print(row_label, row)
    print("\n")

0 name      masud
score        75
degree       bs
Name: 0, dtype: object


1 name      mostafiz
score           80
degree          ll
Name: 1, dtype: object


2 name      rahman
score         93
degree       gst
Name: 2, dtype: object




# Iterating over columns

In [19]:
# pandas supplies the DataFrame.
import pandas as pd

# Three records stored column-wise: name, score and occupation.
dicts = dict(
    name=["masud", "rahman", "mostafizur"],
    score=[23, 53, 59],
    occu=["student", "employee", "businessman"],
)

# Build the frame and show its text representation.
df = pd.DataFrame(dicts)
print(df)


         name  score         occu
0       masud     23      student
1      rahman     53     employee
2  mostafizur     59  businessman


In [24]:
# Iterating a DataFrame yields its column labels, so list(df) is the
# list of column names.
columns = list(df)
print(columns)
for column_name in columns:

    # Print the third element of the column. .iloc[2] selects by position,
    # which is what "third element" means; the chained form df[col][2]
    # looks up the *label* 2 and only gives the same result while the frame
    # has its default 0..n-1 index.
    print(df[column_name].iloc[2])


['name', 'score', 'occu']
mostafizur
59
businessman


# Index (row labels): creating a Series with custom row labels

In [28]:
import pandas as pd

# Values of the Series (date strings) and the label for each value.
Date = ["1/1/2018", "2/1/2018", "3/1/2018", "4/1/2018"]
Index_name = ["Day 1", "Day 2", "Day 3", "Day 4"]

# Date supplies the values and Index_name the row labels. Note that
# `df` holds a Series here, not a DataFrame.
df = pd.Series(data=Date, index=Index_name)

print(df)

Day 1    1/1/2018
Day 2    2/1/2018
Day 3    3/1/2018
Day 4    4/1/2018
dtype: object
