## 说明

这是一个Jupyter练习。加载本地数据。

DataFrame:二维数据，整个表格，多行多列

`df.columns`：查询列标签（列名）

`df.index`：查询行索引（行标签）

Series：一维数据，一行或者一列

![pandas base](../../../assets/materials/pandas/pandas_dataframe1.png)

![pandas base](../../../assets/materials/pandas/pandas_dataframe1.png)


In [None]:
import numpy as np
import pandas as pd
import sys
import os

# Notebook bootstrap: the literal string '__file__' is resolved against the
# working directory, so current_directory is the working directory and
# root_path is its parent, which is then made importable.
current_directory = os.path.dirname(os.path.abspath('__file__'))
parent_directory = os.path.dirname(current_directory)
root_path = os.path.abspath(os.path.sep.join([parent_directory, "."]))
sys.path.append(root_path)

# Explicit labels are supplied here; without them the index would be
# RangeIndex(start=0, stop=4, step=1).
labels = ['d', 'b', 'a', 'c']
s1 = pd.Series([10, 'A', 5.2, 26], index=labels)

# Whole Series (labels + values).
print("==数据列表==", s1, sep="\n")

# Index labels only.
print("数据索引", s1.index, sep="\n")

# Underlying values only.
print("数据", s1.values, sep="\n")

# Lookups: a single label yields a scalar, a list of labels yields a Series.
print("查询数据")
single_value = s1['a']
print(single_value)
print(type(single_value))
print(s1.loc[['b', 'a']])


用字典创建 Series（字典的键成为索引）

In [None]:
import numpy as np
import pandas as pd
import sys
import os

# Notebook bootstrap: current_directory is the working directory (the literal
# '__file__' is only a placeholder file name); its parent is added to sys.path.
current_directory = os.path.dirname(os.path.abspath('__file__'))
parent_directory = os.path.dirname(current_directory)
root_path = os.path.abspath(os.path.sep.join([parent_directory, "."]))
sys.path.append(root_path)

# A plain dict: its keys become the Series index, its values the data.
sdata = {
    'ohio': 3500,
    'Texas': 72000,
    'Oregs': 16000,
    'Ggrqg': 5000,
}
s1 = pd.Series(data=sdata)

print("数据列表", s1, sep="\n")

用“列名 → 列表”的字典创建 DataFrame（每个键对应一列）


In [None]:
import numpy as np
import pandas as pd
import sys
import os

# Notebook bootstrap: the literal string '__file__' is resolved against the
# working directory, so current_directory is the working directory; its parent
# directory is appended to sys.path.
current_directory = os.path.dirname(os.path.abspath('__file__'))
root_path = os.path.abspath(os.path.dirname(current_directory) + os.path.sep + ".")
sys.path.append(root_path)

# A dict of equal-length lists: each key becomes a column, and the rows get
# the default integer labels 0..4.
data = {
    'state': ['ofjg', 'sdfg', 'werw', 'wrgwer', 'rgwg'],
    'year': [2000, 3000, 5000, 6000, 9000],
    'pop': [1.5, 1.7, 1.6, 5.3, 3.5],
}
df = pd.DataFrame(data)

print("数据列表")
print(df)

# A single column name returns a Series.
print("数据一列")
print(df['year'])

# A list of column names returns a DataFrame.
print("数据多列")
print(df[['year', 'pop']])

# df.loc[1] selects the ROW whose index label is 1 (returned as a Series).
# The label previously claimed this was "the last column", which did not
# match what the code prints.
print("数据一行（索引标签为 1 的行）")
print(df.loc[1])

常规操作

In [None]:
import numpy as np
import pandas as pd
import sys
import os

# Notebook bootstrap: the literal string '__file__' is resolved against the
# working directory, so current_directory is the working directory; its parent
# directory is appended to sys.path.
current_directory = os.path.dirname(os.path.abspath('__file__'))
root_path = os.path.abspath(os.path.dirname(current_directory) + os.path.sep + ".")
sys.path.append(root_path)

# Six consecutive daily timestamps form the row index. The values are random
# draws, so the printed numbers differ on every run.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

print("原数据")
print(df)

print("转置数据（列与行转置）")
print(df.T)

# axis=1 sorts by column labels; ascending=False yields D, C, B, A.
print("按照指定轴排序")
print(df.sort_index(axis=1, ascending=False))

# Rows reordered by the values of column B.
print("按照值排序")
print(df.sort_values(by='B'))


print("选择单列，产生 Series")
print(df['A'])  # same as df.A

# An integer slice inside [] selects rows by position (end excluded).
print("用 [ ] 切片行")
print(df[0:3])

# A label slice inside [] selects rows by index label (end included).
print("")
print(df['20130102':'20130104'])

print("Series.loc[indexer]，返回Series对象。DataFrame.loc[row_indexer,column_indexer]返回DataFrame")
print("loc 在 index 的标签上进行索引，范围包括 start 和 end。")
print("iloc 在 index 的位置上进行索引，不包括 end。")

# Both calls below return the first row: by label, then by position.
print("按标签选择：提取一行数据")
print("loc")
print(df.loc[dates[0]])
print("iloc")
print(df.iloc[0])

# NOTE: the two calls select different columns. loc picks A and B by label;
# iloc picks positions 0 and 2, i.e. A and C.
print("按标签选择：提取多列数据")
print("loc")
print(df.loc[:, ['A', 'B']])
print("iloc")
print(df.iloc[:, [0, 2]])

# NOTE: different cells. loc reads (first row, column A); iloc reads the
# second row, second column (column B).
print("按标签选择：提取标量值")
print("loc")
print(df.loc[dates[0], 'A'])  # df.at[dates[0], 'A'] is the scalar-only form
print("iloc")
print(df.iloc[1, 1])

# NOTE: different ranges. The loc slice covers 2013-01-02..2013-01-04
# inclusive, columns A and B; the iloc slice covers positions 2 and 3
# (end excluded), all columns.
print("按位置选择：切片方式")
print("loc")
print(df.loc['20130102':'20130104', ['A', 'B']])
print("iloc")
print(df.iloc[2:4, :])


print("布尔索引选择")
# A boolean Series keeps only the rows where the condition holds.
print("用单列的值选择数据")
print(df[df.A > 0])

# A boolean DataFrame keeps the shape and masks non-matching cells with NaN.
print("选择 DataFrame 里满足条件的值")
print(df[df > 0])

# Work on a copy so the extra string column does not leak into df.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print("新DataFrame")
print(df2)

print("用 isin() 选择")
print(df2[df2['E'].isin(['two', 'four'])])


print("赋值")

# This Series starts one day later than df's index, so after alignment the
# first row of the new column is NaN and the last Series value is dropped.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
print("新增列")
print(s1)

print("用索引自动对齐新增列的数据")
df['F'] = s1
# .at addresses a single cell by (row label, column label).
print("按标签赋值")
df.at[dates[0], 'A'] = 0
# .iat addresses a single cell by integer position, so this message says
# "by position". It previously repeated the "by label" message.
print("按位置赋值")
df.iat[0, 1] = 0
print(df)

# Every strictly positive cell is replaced by its negation; NaN cells fail
# the `> 0` comparison and are left untouched.
print("用 where 条件赋值")
df3 = df.copy()
df3[df3 > 0] = -df3
print(df3)


print("Pandas 主要用 np.nan 表示缺失数据。 计算时，默认不包含空值。")
print("重建索引（reindex）可以更改、添加、删除指定轴的索引，并返回数据副本，即不更改原数据。")

# Keep the first four rows and add a new, initially all-NaN column E, then
# fill E for the first two rows only so some missing values remain.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)

# how='any' drops a row as soon as one of its cells is missing.
print("删除所有含缺失值的行")
print(df1.dropna(how='any'))

print("填充缺失值")
print(df1.fillna(value=5))

print("提取 nan 值的布尔掩码")
print(pd.isna(df1))

读取文件，将文件内容转换成 DataFrame

In [None]:
import numpy as np
import pandas as pd
import sys
import os

# Notebook bootstrap. Unlike the other cells, this one climbs with "..", so
# root_path ends up two levels above the working directory; the data folder
# is looked up relative to it.
current_directory = os.path.dirname(os.path.abspath('__file__'))
parent_directory = os.path.dirname(current_directory)
root_path = os.path.abspath(os.path.sep.join([parent_directory, ".."]))
sys.path.append(root_path)

print(root_path)


# Workbook location: <root_path>/data/mingri/4-22_sales.xlsx
path_parts = [root_path, "data", "mingri", "4-22_sales.xlsx"]
file_excel = os.sep.join(path_parts)
click_series = pd.read_excel(file_excel)
# Earlier drafts of this cell (kept as commented-out code in the notebook)
# loaded the same workbook via a path built from os.path.abspath('../..'),
# or a CSV file via pd.read_csv(..., sep=",", header=0).

# Each entry pairs the printed caption with a callable producing the view, so
# every view is computed only at the moment it is printed.
inspections = (
    ("查看前几行", lambda: click_series.head()),
    ("查看后几行", lambda: click_series.tail(10)),
    ("查看数据的形状，返回（行数、列数）", lambda: click_series.shape),
    ("查看列", lambda: click_series.columns),
    ("查看索引", lambda: click_series.index),
    ("查看每一列的数据类型", lambda: click_series.dtypes),
    ("查看统计概要", lambda: click_series.describe()),
)
for caption, make_view in inspections:
    print(caption)
    print(make_view())

## 连接MySQL

看例子

In [None]:
import pymysql
import numpy as np
import pandas as pd
import sys
import os

# Notebook bootstrap: the literal string '__file__' is resolved against the
# working directory, so current_directory is the working directory; its parent
# directory is appended to sys.path.
current_directory = os.path.dirname(os.path.abspath('__file__'))
root_path = os.path.abspath(os.path.dirname(current_directory) + os.path.sep + ".")
sys.path.append(root_path)

# NOTE(review): host and credentials are hard-coded for a local demo database.
# Do not reuse this pattern with real credentials; load them from the
# environment or a config file that is kept out of version control.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='1234',
    database='test',
    charset='utf8mb4',
)
try:
    # Load the whole `test` table into a DataFrame.
    mysql_page = pd.read_sql("select * from test", con=conn)
finally:
    # Release the connection even when the query fails; the original cell
    # never closed it.
    conn.close()
print(mysql_page)

## 连接Presto

看例子

In [None]:
import numpy as np
import pandas as pd
from pyhive import presto

# The statement to run. The notebook ships with an empty placeholder; fill it
# in before executing this cell.
hql = ""
if not hql.strip():
    # Fail early with a clear message instead of sending an empty statement
    # to the server.
    raise ValueError("hql 为空：请先填写要执行的 Presto 查询语句")

presto_conn = presto.connect(
    host='emr-bi',
    port=9090,
    catalog='hive',
    schema='default',
)

# The object returned by .cursor() is a DB-API cursor, so it is named
# `cursor` here (the original cell called it `conn`).
cursor = presto_conn.cursor()
try:
    cursor.execute(hql)
    # fetchall() returns every result row; print them one per line.
    for result in cursor.fetchall():
        print(result)
finally:
    # Always release the cursor and the connection, including on query errors.
    cursor.close()
    presto_conn.close()