In [2]:
import numpy as np
import pandas as pd
# Show which pandas version is installed in this kernel.
print(pd.__version__)

1.0.1


In [None]:
# DataFrame Initialization
# Each dict key becomes a column label; each value supplies that column's data
# (plain lists and a pandas Series can be mixed).
dic = {
    "col 1": [1, 2, 3],
    "col 2": [10, 20, 30],
    "col 3": ["x", "y", "z"],
    "col 4": pd.Series(range(3)),
}

df = pd.DataFrame(data=dic)
df

In [None]:
# Column Rename
# Way 1: map old labels to new labels; columns not named in the mapping keep
# their current label.
rename_col = {"col 1": "x", "col 2": "10x"}
df.rename(columns=rename_col, inplace=True)
df

# Way 2: assign the complete list of labels. Only the first two are replaced,
# the remaining labels are carried over unchanged.
df.columns = ["x(new)", "10x(new)", *df.columns[2:]]
df

In [None]:
# Create Random DataFrame
# pd.util.testing (makeDataFrame / makeMixedDataFrame) is deprecated since
# pandas 1.0 and no longer available in pandas 2.x, so frames of the same
# shapes are built explicitly here. The generator is seeded so that re-running
# the cell yields the same values.
random_state = np.random.RandomState(9527)
random_df = pd.DataFrame(random_state.randn(30, 4), columns=list("ABCD"))
print(random_df.shape)

# Create Random DataFrame with Mixed DataType
# One float column, one 0/1 float column, one string column, one business-day
# datetime column.
mixed_df = pd.DataFrame(
    {
        "A": [0.0, 1.0, 2.0, 3.0, 4.0],
        "B": [0.0, 1.0, 0.0, 1.0, 0.0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": pd.bdate_range("1/1/2009", periods=5),
    }
)
print(mixed_df.shape)

In [None]:
# I/O of DataFrame
# df.to_csv("file.csv")
# df = pd.read_csv("file path or http address")

In [None]:
# Memory Usage of DataFrame
# memory_usage="deep" asks pandas to inspect the actual Python objects held in
# object columns instead of reporting a shallow estimate.
df.info(memory_usage="deep")

In [None]:
# DataFrame Combination
# Part 1 - stack frames vertically (default axis = 0): [df1, df2, df3...]
# `file_list` holds the CSV paths to combine. It is empty here so the cell runs
# on a fresh kernel; fill it in to use this part.
file_list = []
if file_list:  # pd.concat raises ValueError when given no objects
    # ignore_index=True renumbers the rows 0..n-1 instead of keeping every
    # file's own index (same result as reset_index(drop=True) afterwards).
    df = pd.concat([pd.read_csv(f) for f in file_list], ignore_index=True)

# Part 2 - place frames side by side (axis = 1); rows are matched on the index.
dic1 = {
    "PassengerID": [1, 2, 3],
    "Survived": [0, 1, 1],
    "Pclass": [8, 1, 8],
    "Name": ["Braund", "Cumings", "Heikkinen"],
}
df1 = pd.DataFrame(dic1)

dic2 = {
    "Sex": ["male", "female", "female"],
    "Age": [22, 88, 26],
}
df2 = pd.DataFrame(dic2)

df = pd.concat([df1, df2], axis=1)
df  # last expression of the cell -> rich table display

In [None]:
# Global Display Option
# These settings affect how every DataFrame is rendered in this kernel.
pd.set_option("display.max_columns", None)      # None = no cap on the number of columns shown
pd.get_option("display.max_colwidth")           # read the current per-cell display width
for option_name, option_value in (
    ("display.max_colwidth", 10),               # truncate cell text to 10 characters
    ("display.precision", 1),                   # show floats with 1 decimal place
):
    pd.set_option(option_name, option_value)
pd.reset_option("all")                          # restore every option to its default
pd.describe_option()                            # print the catalogue of available options

In [27]:
# To change the look of one specific DataFrame (rather than the global
# options), use DataFrame.style.
df_sample = pd.read_csv("Titanic.csv")

styler = (
    df_sample.style
    .format('{:.1f}', subset='Fare')                 # one decimal place for Fare
    .set_caption('★五顏六色の鐵達尼號數據集☆')
)
# Styler.hide_index() was deprecated in pandas 1.4 in favour of
# Styler.hide(axis="index") and removed in pandas 2.0; call whichever one the
# installed pandas provides.
styler = styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()
(
    styler
    .bar('Age', vmin=0)                              # in-cell bar chart for Age
    .highlight_max('Survived')                       # mark the maximum of Survived
    .background_gradient('Greens', subset='Fare')    # colour scale on Fare
    .highlight_null()                                # mark missing values
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.3,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.1,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.5,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.9,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.1,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.1,,C


In [None]:
# Random Sampling
# - n is capped at the number of rows: sampling without replacement raises
#   ValueError when n is larger than the frame.
# - Column labels are case-sensitive; the column is "Name", and dropping a
#   label that does not exist ("name") raises KeyError.
# random_state fixes the seed so the same rows are drawn on every run.
df_sample = df.sample(n=min(10, len(df)), random_state=9527).drop(columns="Name")

In [None]:
# Transpose
# Swap rows and columns; `df` itself is not modified.
df.transpose()

In [None]:
# Fill NaN/None
# fillna returns a new DataFrame each time; `df` itself is left unchanged.
df.fillna(value=0)            # replace missing values with a number
df.fillna(value="unknown")    # replace missing values with a text marker

In [None]:
# Delete Column/Row
# Dropping a label that is not in the frame raises KeyError, so the example
# uses labels that `df` has at this point. Without inplace, drop returns a new
# frame and leaves `df` untouched.
columns = ["Survived", "Pclass"]
df.drop(columns=columns)

# Rebind `df` instead of using inplace=True, and ignore labels that are already
# gone so the cell can be re-run without raising KeyError.
df = df.drop(columns="Name", errors="ignore")   # remove a column
df = df.drop(index=0, errors="ignore")          # remove the row labelled 0

In [21]:
# Split String in Column into New Columns
df = pd.DataFrame({
    "name": ["大雄", "胖虎"],
    "feature": ["膽小, 翻花繩", "粗魯, 演唱會"],
})

# The items are separated by ", ". Splitting on "," alone leaves a leading
# space in the second piece, so the separator pattern also consumes the
# whitespace that follows the comma.
# expand=True returns the pieces as separate columns instead of a list per row.
df[["性格", "特技"]] = df["feature"].str.split(r",\s*", expand=True)
df

Unnamed: 0,name,feature,性格,特技
0,大雄,"膽小, 翻花繩",膽小,翻花繩
1,胖虎,"粗魯, 演唱會",粗魯,演唱會


In [8]:
# Split List in Column into New Columns
df = pd.DataFrame(
    {
        "name": ["大雄", "胖虎"],
        "feature": [["膽小", "翻花繩"], ["粗魯", "演唱會"]],
    }
)

cols = ["性格", "特技"]
# Turn the lists into a helper frame (one list -> one row) and assign its
# columns to the new labels in `cols`.
df[cols] = pd.DataFrame(df["feature"].tolist())
# Alternative: df[cols] = df["feature"].apply(pd.Series)
df

Unnamed: 0,name,feature,性格,特技
0,大雄,"[膽小, 翻花繩]",膽小,翻花繩
1,胖虎,"[粗魯, 演唱會]",粗魯,演唱會


In [28]:
# Slice of DataFrame
df = pd.read_csv("Titanic.csv")

# .loc slices by label and includes both endpoints:
# rows labelled 0 through 3, columns "Pclass" through "Ticket".
df.loc[0:3, "Pclass":"Ticket"]
# .iloc slices by integer position and excludes the end:
# the first 3 rows, columns at positions 2 through 6.
df.iloc[0:3, 2:7]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp
0,3,"Braund, Mr. Owen Harris",male,22.0,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1
2,3,"Heikkinen, Miss. Laina",female,26.0,0


In [45]:
# Mask of DataFrame

# Rows matching a condition - method 1: build a boolean Series and index with it
bool_series = df["Sex"].eq("male") & df["Age"].gt(70)
df.loc[bool_series]

# Rows matching a condition - method 2: query string
age = 70
df.query("Age > @age & Sex == 'male'")    # @name reads a Python variable defined beforehand

# Rows that contain at least one missing value
df.loc[df.isna().any(axis=1)].head()
# isna() returns a boolean DataFrame of the same shape;
# any(axis=1) reduces across the columns, giving one boolean per row
# (True when any cell in that row is missing).

# Keep or leave out columns by dtype
df.select_dtypes(include=["number"]).head()    # numeric columns only
df.select_dtypes(exclude=["object"]).head()    # everything except object (string) columns

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.25
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.925
3,4,1,1,35.0,1,0,53.1
4,5,0,3,35.0,0,0,8.05
