# chapter 8 特殊なデータ加工・可視化10ノック


## ノック91: 大容量CSVデータを扱ってみよう

CSVデータを一定の行数ごとに区切って処理し、結果を1つのファイルに追記しながら保存する

In [None]:
# Original approach: read the whole CSV in a single call
import pandas as pd
df = pd.read_csv('data/person_count_out_0001_2021011509.csv')
df

Unnamed: 0,id,place,receive_time,sensor_num,in1,out1,state1,in2,out2,state2
0,0,1,2021-01-15 09:00:00.144,2,508,73,0,73,508,0
1,1,1,2021-01-15 09:00:01.146,2,508,73,0,73,508,0
2,2,1,2021-01-15 09:00:02.161,2,508,73,0,73,508,0
3,3,1,2021-01-15 09:00:03.176,2,508,73,0,73,508,0
4,4,1,2021-01-15 09:00:04.192,2,508,73,0,73,508,0
...,...,...,...,...,...,...,...,...,...,...
3535,3535,1,2021-01-15 09:59:55.054,2,782,156,0,156,782,0
3536,3536,1,2021-01-15 09:59:56.07,2,782,156,0,156,782,0
3537,3537,1,2021-01-15 09:59:57.085,2,782,156,0,156,782,0
3538,3538,1,2021-01-15 09:59:58.101,2,782,156,0,156,782,0


In [None]:
# Read the CSV in chunks (chunksize=512 rows) and print the shape of each chunk
for df in pd.read_csv('data/person_count_out_0001_2021011509.csv', chunksize=512):
  print(df.shape)

(512, 10)
(512, 10)
(512, 10)
(512, 10)
(512, 10)
(512, 10)
(468, 10)


In [None]:
# Process the CSV chunk by chunk and save all results into a single file.
# The first chunk creates/overwrites the output (mode 'w') and writes the
# header; every later chunk is appended (mode 'a') without a header.
# Truncating on the first chunk keeps this cell idempotent: re-running it no
# longer appends a second copy of the data to an already existing output file.
for i, df in enumerate(
    pd.read_csv('data/person_count_out_0001_2021011509.csv', chunksize=64)
):
  is_first_chunk = i == 0
  df['processd_per_chunk'] = True  # placeholder for the real per-chunk processing
  df.to_csv('data/processed_big_data.csv',
            mode='w' if is_first_chunk else 'a',
            index=False,
            header=is_first_chunk  # only the first chunk carries the header row
            )

In [4]:
# Load the file written chunk by chunk to inspect the result
df = pd.read_csv('data/processed_big_data.csv')
df

Unnamed: 0,id,place,receive_time,sensor_num,in1,out1,state1,in2,out2,state2,processd_per_chunk
0,0,1,2021-01-15 09:00:00.144,2,508,73,0,73,508,0,True
1,1,1,2021-01-15 09:00:01.146,2,508,73,0,73,508,0,True
2,2,1,2021-01-15 09:00:02.161,2,508,73,0,73,508,0,True
3,3,1,2021-01-15 09:00:03.176,2,508,73,0,73,508,0,True
4,4,1,2021-01-15 09:00:04.192,2,508,73,0,73,508,0,True
...,...,...,...,...,...,...,...,...,...,...,...
3535,3535,1,2021-01-15 09:59:55.054,2,782,156,0,156,782,0,True
3536,3536,1,2021-01-15 09:59:56.07,2,782,156,0,156,782,0,True
3537,3537,1,2021-01-15 09:59:57.085,2,782,156,0,156,782,0,True
3538,3538,1,2021-01-15 09:59:58.101,2,782,156,0,156,782,0,True


## ノック92: JSON形式のファイルを扱ってみよう

In [5]:
# Read a JSON file without specifying orient
pd.read_json('data/column_oriented.json')

Unnamed: 0,id,value
0,1,1
1,2,10
2,3,100


In [8]:
# Same call without orient, for comparison with the orient='index' read of this file
pd.read_json('data/index_oriented.json')

Unnamed: 0,0,1,2
id,1,2,3
value,1,10,100


In [None]:
# Pass the orient option so the file is read correctly
pd.read_json('data/index_oriented.json', orient='index')

Unnamed: 0,id,value
0,1,1
1,2,10
2,3,100


In [11]:
# Pass the orient option so the file is read correctly
pd.read_json('data/table_oriented.json', orient='table')

Unnamed: 0,id,value
0,1,1
1,2,10
2,3,100


## ノック93: *Webからデータを取得してみよう*

In [None]:
import requests
# Keep TLS certificate verification enabled (the requests default) rather than
# passing verify=False, set a timeout so the call cannot hang indefinitely,
# and fail fast on HTTP error responses instead of processing an error body.
response = requests.get('https://worldtimeapi.org/api/timezone/Asia/Tokyo',
                        timeout=10)
response.raise_for_status()
response.content

In [None]:
# Convert the JSON response to a dict
result = response.json()
result

In [None]:
# Convert the result to a pandas Series
pd.Series(result)

In [None]:
# Save the result as a JSON file
import json

with open('data/response.json', mode='w') as f:
  json.dump(result, f)

In [None]:
# Call the API periodically and append each result to a file
import time

for _ in range(4):
  # A timeout keeps one stalled request from blocking the loop forever;
  # raise_for_status() stops on HTTP error responses instead of logging them.
  response = requests.get('https://worldtimeapi.org/api/timezone/Asia/Tokyo',
                          timeout=10)
  response.raise_for_status()
  # Parse before opening the file so the handle is held only for the write.
  res = response.json()
  with open('data/responses.txt', mode='a') as f:
    f.write(f'{json.dumps(res)}\n')  # one JSON document per line
  time.sleep(1)

## ノック94: configファイルを扱ってみよう

In [21]:
import yaml
# safe_load only builds plain Python types (dict, list, str, bool, ...), which
# is all a config file needs, and it refuses YAML tags that would construct
# arbitrary Python objects.
with open('config.yml', mode='r') as f:
  config = yaml.safe_load(f)
config

{'dataset': {'name': 'pseudo', 'path': 'data/images_by_py/'}, 'use_gpu': True}

In [24]:
import toml
# Load the TOML config file from an open file object
with open('config.toml', mode='r') as f:
  config = toml.load(f)
config

{'use_gpu': True, 'dataset': {'name': 'pseudo', 'path': 'data/images_by_py/'}}

## ノック95 : 動画ファイルを音声ファイルへ変換してみよう

In [27]:
from moviepy import VideoFileClip

# Use the clip as a context manager so it is closed (and its reader resources
# released) once the audio track has been exported, even if the export fails.
with VideoFileClip('data/sample_video.mp4') as video_clip:
  video_clip.audio.write_audiofile('data/audio_by_py.mp3')

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'creation_time': '2017-01-31T18:19:07.000000Z'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [576, 320], 'bitrate': 490, 'fps': 29.97002997002997, 'codec_name': 'h264', 'profile': '(Constrained Baseline)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 72, 'metadata': {'Metadata': '', 'creation_time': '2017-01-31T18:19:07.000000Z', 'handler_name': 'IsoMedia File Produced by Google, 5-11-2011', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 13.03, 'bitrate': 563, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile':

                                                        

MoviePy - Done.




## ノック96 : 動画ファイルを画像ファイルへ分割してみよう

In [None]:
import cv2
from tqdm import trange
import os

cap = cv2.VideoCapture('data/sample_video.mp4')
img_dir = 'data/images_by_py/'
os.makedirs(img_dir, exist_ok=True)
# Frame count reported by OpenCV for this video; used to size the loop.
n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

try:
  for i in trange(n):
    success, img = cap.read()
    if not success:
      # Skip frames that could not be read instead of writing an empty image.
      continue
    # os.path.join avoids the doubled separator that f'{img_dir}/...' produced,
    # because img_dir already ends with '/'.
    cv2.imwrite(os.path.join(img_dir, f'{i:04}.png'), img)
finally:
  # Always release the capture handle, even if reading or writing fails.
  cap.release()

100%|██████████| 389/389 [00:02<00:00, 180.72it/s]


## ノック97 : PowerPointやWordファイルを読み込んでみよう

In [None]:
!pip install python-pptx
!pip install python-docx

In [None]:
import pptx
# Open the sample presentation and count its slides
pptx_data = pptx.Presentation('data/サンプル_PowerPoint.pptx')
len(pptx_data.slides)

In [None]:
# Take the first slide and count the shapes on it
sld_0 = pptx_data.slides[0]
shp_sld_0 = sld_0.shapes
len(shp_sld_0)

In [None]:
# Print the first shape's text and its has_text_frame flag
print(shp_sld_0[0].text)
print(shp_sld_0[0].has_text_frame)

In [None]:
# Collect the text of every shape that has a text frame, slide by slide
pptx_data = pptx.Presentation('data/サンプル_PowerPoint.pptx')
texts = []
for slide in pptx_data.slides:
  texts.extend(shape.text for shape in slide.shapes if shape.has_text_frame)
print(texts)

In [None]:
import docx
# Open the sample Word document and count its paragraphs
docx_data = docx.Document('data/サンプル_Word.docx')
len(docx_data.paragraphs)

In [None]:
# Show the text of the first paragraph
docx_data.paragraphs[0].text

In [None]:
# Gather the text of every paragraph in the Word document, in order
texts = [paragraph.text for paragraph in docx_data.paragraphs]
print(texts)

## ノック98 : PDFデータを読み込んでみよう

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

In [30]:
txt_file = 'data/サンプル_PDF.txt'

rscmgr = PDFResourceManager()
laprms = LAParams()

# Context managers guarantee both files are closed even if pdfminer raises
# while processing a page; previously they were closed only on the success path.
with open('data/サンプル_PDF.pdf', 'rb') as pdf_data, \
        open(txt_file, mode='w') as out_data:
    device = TextConverter(rscmgr, out_data, laparams=laprms)
    itprtr = PDFPageInterpreter(rscmgr, device)
    try:
        # Run every page through the interpreter; the converter writes to out_data
        for page in PDFPage.get_pages(pdf_data):
            itprtr.process_page(page)
    finally:
        # Close the converter before the output file it writes to is closed.
        device.close()

In [31]:
# Read the extracted text file back and print its contents
with open('data/サンプル_PDF.txt', mode='r') as f:
  content = f.read()
print(content)

これは、サンプルテキストです。そして、これが一つめの段落になっています。いろいろ

読み込んでいきましょう。 

続いて、これが二つ目の段落になっています。 

これが三つめの段落です。 

 
 
ここから 2 ページ目です。 

2 ページ目二つ目の段落です。 




## ノック99 : インタラクティブなグラフを作成してみよう

In [32]:
import pandas as pd
# Load the CSV and preview the first rows
df = pd.read_csv('data/person_count_out_0001_2021011509.csv')
df.head()

Unnamed: 0,id,place,receive_time,sensor_num,in1,out1,state1,in2,out2,state2
0,0,1,2021-01-15 09:00:00.144,2,508,73,0,73,508,0
1,1,1,2021-01-15 09:00:01.146,2,508,73,0,73,508,0
2,2,1,2021-01-15 09:00:02.161,2,508,73,0,73,508,0
3,3,1,2021-01-15 09:00:03.176,2,508,73,0,73,508,0
4,4,1,2021-01-15 09:00:04.192,2,508,73,0,73,508,0


In [33]:
import plotly.express as px
# Interactive line chart with receive_time on x and in1 on y
fig = px.line(x=df['receive_time'], y=df['in1'])
fig.show()

In [34]:
# Reshape to long format: receive_time stays as the id column, while in1/out1
# are stacked into a variable-name column and a value column
df_v = pd.melt(
    df.loc[:, ['receive_time', 'in1', 'out1']],
    id_vars=['receive_time'],
    var_name="変数名",
    value_name="値",
)
df_v.head()

Unnamed: 0,receive_time,変数名,値
0,2021-01-15 09:00:00.144,in1,508
1,2021-01-15 09:00:01.146,in1,508
2,2021-01-15 09:00:02.161,in1,508
3,2021-01-15 09:00:03.176,in1,508
4,2021-01-15 09:00:04.192,in1,508


In [35]:
# Plot the long-format data, one colored line per variable name
fig = px.line(df_v, x='receive_time', y='値', color='変数名')
fig.show()

## ノック100: 3次元グラフを作成してみよう

In [36]:
import seaborn as sns
# Load seaborn's iris dataset and preview the first rows
df_iris = sns.load_dataset('iris')
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [37]:
# 2D scatter plot of sepal length vs. sepal width, colored by species
fig = px.scatter(df_iris, x='sepal_length', y='sepal_width', color='species')
fig.show()

In [38]:
# 3D scatter plot adding petal width as the z axis, colored by species
fig = px.scatter_3d(df_iris, x='sepal_length', y='sepal_width', z='petal_width',color='species')
fig.show()