## 讀取 txt / json / npy 檔
### 讀取 txt 檔

In [4]:
file_path = "data\\example.txt"
with open(file_path, 'r') as f:
    data = f.readlines()
print(data)

['id,sex,age,score\n', '001,F,20,77\n', '002,F,25,90\n', '003,M,22,80\n', '004,F,30,66\n', '005,M,40,60\n', '006,M,29,87']


In [5]:
import pandas as pd

In [6]:
#把 txt 轉成 pandas dataframe
data = []
file_path = "data\\example.txt"
with open(file_path, 'r') as f:
    for line in f:
        line = line.replace('\n', '').split(',') # 將每句最後的 /n 取代成空值後，再以逗號斷句
        data.append(line)
data

[['id', 'sex', 'age', 'score'],
 ['001', 'F', '20', '77'],
 ['002', 'F', '25', '90'],
 ['003', 'M', '22', '80'],
 ['004', 'F', '30', '66'],
 ['005', 'M', '40', '60'],
 ['006', 'M', '29', '87']]

In [7]:
df = pd.DataFrame(data[1:])
df.columns = data[0]
df

Unnamed: 0,id,sex,age,score
0,1,F,20,77
1,2,F,25,90
2,3,M,22,80
3,4,F,30,66
4,5,M,40,60
5,6,M,29,87


### 讀取 json 檔
可以檢視看看json格式與txt是否有差異(以id為key)<br><br>
如果是DataFrame轉json，預設的orient是'columns'，orient可選引數有{‘split','records','index','columns','values'}
- split，樣式為 {index -> [index], columns -> [columns], data -> [values]}
- records，樣式為[{column -> value}, … , {column -> value}]
- index ，樣式為 {index -> {column -> value}}
- columns，樣式為 {index -> {column -> value}}
- values，陣列樣式
- table，樣式為{‘schema': {schema}, ‘data': {data}}，和records類似

In [9]:
import json
# 把 dataframe轉存成json檔， default = column
df.to_json('data\\example01.json')

In [10]:
# 上面的存入方式，會將 column name 做為主要的 key, row name 做為次要的 key
with open('data\\example01.json', 'r') as f:
    j1 = json.load(f)
j1

{'id': {'0': '001',
  '1': '002',
  '2': '003',
  '3': '004',
  '4': '005',
  '5': '006'},
 'sex': {'0': 'F', '1': 'F', '2': 'M', '3': 'F', '4': 'M', '5': 'M'},
 'age': {'0': '20', '1': '25', '2': '22', '3': '30', '4': '40', '5': '29'},
 'score': {'0': '77', '1': '90', '2': '80', '3': '66', '4': '60', '5': '87'}}

In [11]:
df.set_index('id', inplace=True) # 以ID為key
df

Unnamed: 0_level_0,sex,age,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,F,20,77
2,F,25,90
3,M,22,80
4,F,30,66
5,M,40,60
6,M,29,87


In [12]:
# 把 dataframe轉存成json檔， index
df.to_json('data\\example02.json', orient='index')

In [13]:
with open('data\\example02.json', 'r') as f:
    j2 = json.load(f)
j2

{'001': {'sex': 'F', 'age': '20', 'score': '77'},
 '002': {'sex': 'F', 'age': '25', 'score': '90'},
 '003': {'sex': 'M', 'age': '22', 'score': '80'},
 '004': {'sex': 'F', 'age': '30', 'score': '66'},
 '005': {'sex': 'M', 'age': '40', 'score': '60'},
 '006': {'sex': 'M', 'age': '29', 'score': '87'}}

### 讀取 npy 檔
npy是一個專門儲存numpy array的檔案格式，讀取資料的速度快

In [14]:
import numpy as np
# 將 data 的數值部分轉成 numpy array
array = np.array(data[1:])
array

array([['001', 'F', '20', '77'],
       ['002', 'F', '25', '90'],
       ['003', 'M', '22', '80'],
       ['004', 'F', '30', '66'],
       ['005', 'M', '40', '60'],
       ['006', 'M', '29', '87']], dtype='<U3')

In [16]:
# 存檔
np.save(arr=array, file='data\\examplenpy.npy')

In [17]:
# 讀檔
data_npy = np.load('data\\examplenpy.npy')
data_npy

array([['001', 'F', '20', '77'],
       ['002', 'F', '25', '90'],
       ['003', 'M', '22', '80'],
       ['004', 'F', '30', '66'],
       ['005', 'M', '40', '60'],
       ['006', 'M', '29', '87']], dtype='<U3')

## 不用下載檔案到本機，使用Request 抓取資料
* 請讀取 [text file](https://raw.githubusercontent.com/vashineyu/slides_and_others/master/tutorial/examples/imagenet_urls_examples.txt)
* 懶人複製連結: https://raw.githubusercontent.com/vashineyu/slides_and_others/master/tutorial/examples/imagenet_urls_examples.txt<br><br>

- 使用 [Request](https://blog.gtwang.org/programming/python-requests-module-tutorial/) 抓取資料
- [字串分割](http://www.runoob.com/python/att-string-split.html)
- 例外處理: [Try-Except](https://pydoing.blogspot.com/2011/01/python-try.html)

In [33]:
## 假如我們不想把資料載到自己的電腦裡?
# 把連結填入
target_url = "https://raw.githubusercontent.com/vashineyu/slides_and_others/master/tutorial/examples/imagenet_urls_examples.txt"

In [34]:
import requests
response = requests.get(target_url)
data = response.text

# 用 request 傳送回來的資料不會認得斷行符號
print(len(data))
data[0:100]

784594


'n00015388_157\thttp://farm1.static.flickr.com/145/430300483_21e993670c.jpg\nn00015388_238\thttp://farm2'

In [35]:
# 找到換行符號，用該符號做字串分割後，把它拿掉
split_tag = "\n"

data = data.split(split_tag)
print(len(data))
data[0:3]

9996


['n00015388_157\thttp://farm1.static.flickr.com/145/430300483_21e993670c.jpg',
 'n00015388_238\thttp://farm2.static.flickr.com/1005/3352960681_37b9c1d27b.jpg',
 'n00015388_304\thttp://farm1.static.flickr.com/27/51009336_a9663af3dd.jpg']

## 將 txt 轉成 pandas dataframe

In [36]:
import pandas as pd

    
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0
0,n00015388_157\thttp://farm1.static.flickr.com/...
1,n00015388_238\thttp://farm2.static.flickr.com/...
2,n00015388_304\thttp://farm1.static.flickr.com/...
3,n00015388_327\thttp://farm4.static.flickr.com/...
4,n00015388_355\thttp://img100.imageshack.us/img...


## 讀取圖片，請讀取上面 data frame 中的前 5 張圖片

In [None]:
from PIL import Image
from io import BytesIO
import numpy as np
import matplotlib.pyplot as plt

# 請用 df.loc[...] 得到第一筆資料的連結
first_link = 

response = requests.get(first_link)
img = Image.open(BytesIO(response.content))

# Convert img to numpy array

plt.imshow(img)
plt.show()

In [None]:
def img2arr_fromURLs(url_list, resize = False):
    """
    請完成這個 Function
    Args
        - url_list: list of URLs
        - resize: bool
    Return
        - list of array
    """
    
    return img_list

In [None]:
result = img2arr_fromURLs(df[0:5][1].values)
print("Total images that we got: %i " % len(result)) # 如果不等於 5, 代表有些連結失效囉

for im_get in result:
    plt.imshow(im_get)
    plt.show()