## 说明

这是一个Jupyter练习。加载本地数据。

DataFrame：二维数据，整个表格，多行多列

df.columns 查询列标签（列名）

df.index 查询行标签（行索引）

Series：一维数据，一行或者一列

![pandas base](../../images/pandas-base-1.png)

In [None]:
import numpy as np
import pandas as pd
import sys
import os

# '__file__' is a string literal here (not the __file__ variable), so abspath()
# resolves it against the working directory and current_directory ends up being
# the working directory itself.
current_directory = os.path.dirname(os.path.abspath('__file__'))
# The parent of the working directory goes on sys.path before `base` is imported.
parent_directory = os.path.dirname(current_directory)
root_path = os.path.abspath(parent_directory + os.path.sep + ".")
sys.path.append(root_path)

from base import testBase as tb

# An explicit index is supplied; without one the index would be
# RangeIndex(start=0, stop=4, step=1).
s1 = pd.Series(data=[10, 'A', 5.2, 26], index=list('dbac'))
tb.myprint("数据列表")
print(s1)

# The index labels of the Series.
tb.myprint("数据索引")
series_index = s1.index
print(series_index)

# The underlying values of the Series.
tb.myprint("数据")
series_values = s1.values
print(series_values)

# Look up a single label via attribute access, then several labels at once.
tb.myprint("查询数据")
value_a = s1.a
print(value_a)
print(type(value_a))
print(s1[['b', 'a']])


数据字典

In [None]:
import numpy as np
import pandas as pd
import sys
import os

# '__file__' is a string literal, so this resolves to the working directory.
current_directory = os.path.dirname(os.path.abspath('__file__'))
# Add the parent of the working directory to sys.path before importing `base`.
parent_directory = os.path.dirname(current_directory)
root_path = os.path.abspath(parent_directory + os.path.sep + ".")
sys.path.append(root_path)

from base import testBase as tb

# Build a Series from a dict: the keys become the index labels.
sdata = dict(ohio=3500, Texas=72000, Oregs=16000, Ggrqg=5000)
s1 = pd.Series(data=sdata)

tb.myprint("数据列表")
print(s1)

多个字典序列创建dataframe


In [None]:
import numpy as np
import pandas as pd
import sys
import os

# '__file__' is a string literal (not the __file__ variable), so abspath()
# resolves it against the working directory; current_directory is therefore
# the working directory.
current_directory = os.path.dirname(os.path.abspath('__file__'))
# Append the parent of the working directory to sys.path before importing `base`.
root_path = os.path.abspath(os.path.dirname(current_directory) + os.path.sep + ".")
sys.path.append(root_path)

from base import testBase as tb

# A dict of equal-length lists: each key becomes a column of the DataFrame.
data = {
    'state': ['ofjg', 'sdfg', 'werw', 'wrgwer', 'rgwg'],
    'year': [2000, 3000, 5000, 6000, 9000],
    'pop': [1.5, 1.7, 1.6, 5.3, 3.5],
}
df = pd.DataFrame(data)

tb.myprint("数据列表")
print(df)

# A single column label returns a Series.
tb.myprint("数据一列")
print(df['year'])

# A list of column labels returns a DataFrame.
tb.myprint("数据多列")
print(df[['year', 'pop']])

# df.loc[1] selects the ROW whose index label is 1 (the second row), not a
# column, so the heading now says "one row" instead of "last column".
tb.myprint("数据一行")
print(df.loc[1])

常规操作

In [None]:
import numpy as np
import pandas as pd
import sys
import os

# '__file__' is a string literal, so this resolves to the working directory.
current_directory = os.path.dirname(os.path.abspath('__file__'))
# Add the parent of the working directory to sys.path before importing `base`.
parent_directory = os.path.dirname(current_directory)
root_path = os.path.abspath(parent_directory + os.path.sep + ".")
sys.path.append(root_path)

from base import testBase as tb

# Six consecutive daily timestamps serve as the row index of a 6x4 frame
# filled with random normal values.
dates = pd.date_range(start='20130101', periods=6)
random_values = np.random.randn(6, 4)
df = pd.DataFrame(random_values, index=dates, columns=['A', 'B', 'C', 'D'])

tb.myprint("原数据")
print(df)

# Swap rows and columns.
tb.myprint("转置数据（列与行转置）")
print(df.transpose())

# Sort the column labels in descending order.
tb.myprint("按照指定轴排序")
print(df.sort_index(axis='columns', ascending=False))

# Sort the rows by the values in column B.
tb.myprint("按照值排序")
print(df.sort_values('B'))


# Selecting one column yields a Series; df['A'] is the same as df.A.
tb.myprint("选择单列，产生 Series")
column_a = df['A']
print(column_a)

# Plain [] with a slice selects rows: first by position ...
tb.myprint("用 [ ] 切片行")
print(df[:3])

# ... then by date label.
tb.myprint("")
print(df['20130102':'20130104'])

# Explanatory headings about loc (label based) versus iloc (position based).
tb.myprint("Series.loc[indexer]，返回Series对象。DataFrame.loc[row_indexer,column_indexer]返回DataFrame")
tb.myprint("loc 在 index 的标签上进行索引，范围包括 start 和 end。")
tb.myprint("iloc 在 index 的位置上进行索引，不包括 end。")

# One row: by its label, then by its position.
tb.myprint("按标签选择：提取一行数据")
tb.myprint("loc")
first_date = dates[0]
print(df.loc[first_date])
tb.myprint("iloc")
print(df.iloc[0])

# Several columns: by label list, then by position list.
tb.myprint("按标签选择：提取多列数据")
tb.myprint("loc")
print(df.loc[:, ['A', 'B']])
tb.myprint("iloc")
print(df.iloc[:, [0, 2]])

# A single scalar cell.
tb.myprint("按标签选择：提取标量值")
tb.myprint("loc")
print(df.loc[first_date, 'A'])  # same cell as df.at[dates[0], 'A']
tb.myprint("iloc")
print(df.iloc[1, 1])

# Slices: a label slice with loc, a positional slice with iloc.
tb.myprint("按位置选择：切片方式")
tb.myprint("loc")
print(df.loc['20130102':'20130104', ['A', 'B']])
tb.myprint("iloc")
print(df.iloc[2:4])


tb.myprint("布尔索引选择")
# Keep only the rows whose value in column A is positive.
tb.myprint("用单列的值选择数据")
positive_a = df['A'] > 0
print(df[positive_a])

# A frame-shaped boolean mask keeps matching cells and blanks out the rest as NaN.
tb.myprint("选择 DataFrame 里满足条件的值")
positive_cells = df > 0
print(df[positive_cells])

# Work on a copy so the extra text column E does not touch df.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
tb.myprint("新DataFrame")
print(df2)

# Keep the rows whose E value is one of the listed strings.
tb.myprint("用 isin() 选择")
wanted_rows = df2['E'].isin(['two', 'four'])
print(df2[wanted_rows])


tb.myprint("赋值")

# A Series indexed by six daily timestamps starting at 20130102.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
tb.myprint("新增列")
print(s1)

# Assigning a Series as a new column aligns it on the index labels: rows of df
# with no matching label in s1 receive NaN.
tb.myprint("用索引自动对齐新增列的数据")
df['F'] = s1
# .at sets a single cell addressed by row label and column label.
tb.myprint("按标签赋值")
df.at[dates[0], 'A'] = 0
# .iat sets a single cell addressed by integer position, so the heading says
# "by position" (it previously repeated the "by label" heading of .at).
tb.myprint("按位置赋值")
df.iat[0, 1] = 0
print(df)

# Conditional assignment on a copy: every positive cell is replaced by its
# negation; df itself is left untouched.
tb.myprint("用 where 条件赋值")
df3 = df.copy()
df3[df3 > 0] = -df3
print(df3)


tb.myprint("Pandas 主要用 np.nan 表示缺失数据。 计算时，默认不包含空值。")
tb.myprint("重建索引（reindex）可以更改、添加、删除指定轴的索引，并返回数据副本，即不更改原数据。")

# Keep the first four dates and append a new, initially empty column E;
# then fill E for the first two dates only.
extended_columns = list(df.columns) + ['E']
df1 = df.reindex(index=dates[0:4], columns=extended_columns)
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)

# Drop every row that has at least one missing value.
tb.myprint("删除所有含缺失值的行")
print(df1.dropna(axis=0, how='any'))

# Replace every missing value with 5.
tb.myprint("填充缺失值")
print(df1.fillna(5))

# Boolean frame that is True wherever a value is missing.
tb.myprint("提取 nan 值的布尔掩码")
print(df1.isna())

读取文件，将文件内容转换成dataFrame

In [None]:
import numpy as np
import pandas as pd
import sys
import os

# '__file__' is a string literal, so this resolves to the working directory.
current_directory = os.path.dirname(os.path.abspath('__file__'))
# Add the parent of the working directory to sys.path before importing `base`.
parent_directory = os.path.dirname(current_directory)
root_path = os.path.abspath(parent_directory + os.path.sep + ".")
sys.path.append(root_path)

from base import testBase as tb

# Load a local CSV file into a DataFrame.
# Alternative input kept for reference:
#   pd.read_csv("D:/wangrun/data/theme_click_log.csv", sep = ",", header = 0)
click_series = pd.read_csv("D:/wangrun/data/theme_item_pool.csv")

# First rows (head() defaults to 5 rows).
tb.myprint("查看前几行")
first_rows = click_series.head()
print(first_rows)

# Last 10 rows.
tb.myprint("查看后几行")
last_rows = click_series.tail(10)
print(last_rows)

# (row count, column count)
tb.myprint("查看数据的形状，返回（行数、列数）")
print(click_series.shape)

# Column labels.
tb.myprint("查看列")
print(click_series.columns)

# Row index.
tb.myprint("查看索引")
print(click_series.index)

# Data type of each column.
tb.myprint("查看每一列的数据类型")
print(click_series.dtypes)

# Summary statistics.
tb.myprint("查看统计概要")
summary = click_series.describe()
print(summary)

## 连接MySQL

看例子

In [None]:
import pymysql
import numpy as np
import pandas as pd
import sys
import os

# '__file__' is a string literal (not the __file__ variable), so abspath()
# resolves it against the working directory; current_directory is therefore
# the working directory.
current_directory = os.path.dirname(os.path.abspath('__file__'))
# Append the parent of the working directory to sys.path before importing `base`.
root_path = os.path.abspath(os.path.dirname(current_directory) + os.path.sep + ".")
sys.path.append(root_path)

from base import testBase as tb

# Open a MySQL connection with the module imported above as `pymysql`
# (the call was previously spelled `pmysql`, an undefined name) and pass the
# encoding through the `charset` keyword (previously misspelled `charest`).
# NOTE(review): host and credentials are hard-coded; fine for a local demo,
# move them to configuration before reusing this elsewhere.
conn = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='123456',
    database='test',
    charset='utf8',
)
try:
    # "表名" is a placeholder table name; replace it with a real table.
    # NOTE(review): recent pandas versions may emit a UserWarning for raw DBAPI
    # connections other than sqlite3 and recommend SQLAlchemy — confirm.
    mysql_page = pd.read_sql("select * from 表名", con=conn)
finally:
    # Always release the connection, even when the query fails.
    conn.close()
print(mysql_page)