In [319]:
import pandas as pd

# Read File to DataFrame
A DataFrame is a two-dimensional data structure: data is arranged in a table of rows and columns.

In [320]:
# read_csv parses comma-separated files by default, so a .csv file needs no extra arguments
df = pd.read_csv('data/friend_list.csv')

In [321]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [322]:
# a .txt file can be read the same way, as long as its values are comma-separated
df = pd.read_csv('data/friend_list.txt')

In [323]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [324]:
# if the file uses a delimiter other than a comma, pass it with the `delimiter` keyword argument (a tab here)
df = pd.read_csv('data/friend_list_tab.txt', delimiter = "\t")

In [325]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [326]:
# if the data file has no header row, pass header=None so the first data row is not consumed as column names;
# the columns are then labelled 0, 1, 2, ...
df = pd.read_csv('data/friend_list_no_head.csv', header = None)

In [327]:
df.head()

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [328]:
# column names can be assigned after the DataFrame has been created (this modifies df in place)
df.columns = ['name', 'age', 'job']

In [329]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [330]:
# for headerless data, the column names can also be supplied while reading, via the `names` argument
df = pd.read_csv('data/friend_list_no_head.csv', header = None, names=['name', 'age', 'job'])

In [331]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


# Create DataFrame
Use the approaches below when you want to create a DataFrame directly from your Python code.

## from dictionary

In [332]:
# Each dict describes one row; the dict keys become the column names.
friend_dict_list = [
    dict(name='Jone', age=20, job='student'),
    dict(name='Jenny', age=30, job='developer'),
    dict(name='Nate', age=30, job='teacher'),
]
df = pd.DataFrame(data=friend_dict_list)

In [333]:
df.head()

Unnamed: 0,age,job,name
0,20,student,Jone
1,30,developer,Jenny
2,30,teacher,Nate


In [334]:
# selecting with a list of names also fixes the column order
# (the frame built from dicts above was displayed as age, job, name)
df = df[['name', 'age', 'job']]

In [335]:
df.head()

Unnamed: 0,name,age,job
0,Jone,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## from OrderedDict

In [336]:
from collections import OrderedDict

In [337]:
# Each key is a column name and each value is that column's data;
# the insertion order of the OrderedDict determines the column order.
friend_ordered_dict = OrderedDict()
friend_ordered_dict['name'] = ['John', 'Jenny', 'Nate']
friend_ordered_dict['age'] = [20, 30, 30]
friend_ordered_dict['job'] = ['student', 'developer', 'teacher']
df = pd.DataFrame.from_dict(friend_ordered_dict)

In [338]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## from list

In [339]:
# Each inner list is one row; column_name labels the positions within a row.
column_name = ['name', 'age', 'job']
friend_list = [
    ['John', 20, 'student'],
    ['Jenny', 30, 'developer'],
    ['Nate', 30, 'teacher'],
]
df = pd.DataFrame.from_records(data=friend_list, columns=column_name)

In [340]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [341]:
# Build the frame from [column name, column values] pairs.
# DataFrame.from_items was deprecated in pandas 0.23 and removed in pandas 1.0,
# so the pairs are passed through an OrderedDict instead; this keeps the
# columns in the order the pairs are listed.
friend_list = [
                ['name',['John', 'Jenny', 'Nate']],
                ['age',[20,30,30]],
                ['job',['student', 'developer', 'teacher']]
              ]
df = pd.DataFrame.from_dict(OrderedDict(friend_list))

In [342]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


# Write DataFrame to File

In [343]:
# write the DataFrame to a file; to_csv separates values with commas by default
df.to_csv('friend_list_from_df.csv')

In [344]:
# pass `sep` to write with a different delimiter (a tab here)
df.to_csv('friend_list_from_df.txt', sep='\t')

In [345]:
# `encoding` sets the text encoding of the output; this writes to the same path as the previous cell
df.to_csv('friend_list_from_df.txt', sep='\t', encoding='utf-8')

# Filter Row

## by index

In [346]:
# Rebuild the small example frame used by the row-filtering cells below.
# DataFrame.from_items was deprecated in pandas 0.23 and removed in pandas 1.0,
# so the [column name, column values] pairs are passed through an OrderedDict
# instead; this keeps the columns in the order the pairs are listed.
friend_list = [
                ['name',['John', 'Jenny', 'Nate']],
                ['age',[20,30,30]],
                ['job',['student', 'developer', 'teacher']]
              ]
df = pd.DataFrame.from_dict(OrderedDict(friend_list))

In [347]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [348]:
# a slice selects rows by position; the stop is exclusive, so 1:2 returns only row 1
df[1:2]

Unnamed: 0,name,age,job
1,Jenny,30,developer


In [349]:
# .loc with a list selects the rows whose index labels are in the list (rows 0 and 2 here)
df.loc[[0,2]]

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,30,teacher


In [350]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## by column condition

In [351]:
# boolean mask: keep only the rows whose age is greater than 25
df_filtered = df[df.age > 25]

In [352]:
df_filtered

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [353]:
# the same filter written as a query expression string
df_filtered = df.query('age>25')

In [354]:
df_filtered

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [355]:
# combine conditions with & (element-wise AND); each condition must be wrapped in its own parentheses
df_filtered = df[(df.age >25) & (df.name == 'Nate')]

In [356]:
df_filtered

Unnamed: 0,name,age,job
2,Nate,30,teacher


In [357]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


# Filter Column

## by index

In [358]:
# No column names are passed, so the columns get the default integer labels 0, 1, 2.
friend_list = [
    ['John', 20, 'student'],
    ['Jenny', 30, 'developer'],
    ['Nate', 30, 'teacher'],
]
df = pd.DataFrame.from_records(data=friend_list)
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [359]:
# .iloc selects by position: all rows, columns 0 and 1 (the stop of the slice is exclusive)
df.iloc[:, 0:2]

Unnamed: 0,0,1
0,John,20
1,Jenny,30
2,Nate,30


In [360]:
# a single integer position returns that one column as a Series rather than a DataFrame
df_filtered = df.iloc[:, 1]
df_filtered

0    20
1    30
2    30
Name: 1, dtype: int64

In [361]:
# a list of positions picks non-adjacent columns (the first and third here)
df.iloc[:,[0,2]]

Unnamed: 0,0,2
0,John,student
1,Jenny,developer
2,Nate,teacher


In [362]:
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


## by column name

In [363]:
# you can create column header for no header data at once
# reload the headerless file, naming the columns while reading, so the cells below can select columns by name
df = pd.read_csv('data/friend_list_no_head.csv', header = None, names=['name', 'age', 'job'])
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [364]:
# indexing with a list of column names returns a DataFrame with just those columns
df_filtered = df[['name', 'age']]
df_filtered

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [365]:
# filter(items=...) keeps the columns whose names are listed
df.filter(items=['age', 'job'])

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher
3,40,dentist
4,45,manager
5,25,intern


In [366]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [367]:
# select the columns whose name contains the substring 'a' (axis=1 applies the match to column labels)
df.filter(like='a',axis=1)

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [368]:
# select the columns whose name matches a regex; 'b$' matches names that end with 'b'
df.filter(regex='b$',axis=1)

Unnamed: 0,job
0,student
1,developer
2,teacher
3,dentist
4,manager
5,intern


A notebook for practice is available here:  
https://github.com/minsuk-heo/python_tutorial/blob/master/data_science/pandas/Pandas_Cheatsheet.ipynb

The full repository can be downloaded or cloned from:  
https://github.com/minsuk-heo/python_tutorial