In [24]:
import pandas as pd
import numpy as np
from pathlib import Path
from pandas.testing import assert_frame_equal
from zipfile import ZipFile
# Directory holding the input data files, resolved against the current working directory.
p1 = Path.cwd().joinpath('back_data')

In [3]:
# Parallel lists holding the data; each list becomes one DataFrame column.
fname = ['Paul', 'John', 'Richard', 'George']
lname = ['McCartney', 'Lennon', 'Starkey', 'Harrison']
birth = [1942, 1940, 1940, 1943]
# zip() pairs the lists up row by row. A bare zip object is a one-shot
# iterator that is empty after its first use, so it is materialised into a
# list here: `people` stays usable if this or a later cell reads it again.
people = list(zip(fname, lname, birth))
# pd.DataFrame() builds the frame from the row tuples; `columns` (and,
# optionally, `index`) supply the labels.
beatles = pd.DataFrame(people, columns=['first', 'last', 'birth'])
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [11]:
# StringIO is an in-memory text buffer with a file-like interface, so it can
# stand in for a real file when writing and reading CSV text.
from io import StringIO

fout = StringIO()
# Write the frame as CSV into the buffer; index=False leaves the row index out.
beatles.to_csv(path_or_buf=fout, index=False)
# Writing leaves the cursor at the end of the buffer, so move it back to the
# start before reading the contents.
fout.seek(0)
fout.read()

'first,last,birth\r\nPaul,McCartney,1942\r\nJohn,Lennon,1940\r\nRichard,Starkey,1940\r\nGeorge,Harrison,1943\r\n'

In [13]:
# The `dtype` argument fixes column types at load time, which can save memory.
# `usecols` / `nrows` restrict the read to chosen columns / a small sample.
# read_csv can also read a CSV file from the internet when given its URL.
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']
diamonds = pd.read_csv(
    p1 / 'diamonds.csv',
    usecols=cols,
    dtype={
        'carat': 'float32',
        'depth': 'float32',
        'table': 'float32',
        # 'x', 'y' and 'z' are given a dtype but are not listed in `cols`,
        # so they are not part of the loaded frame.
        'x': 'float32',
        'y': 'float32',
        'z': 'float32',
        'price': 'int16',
        'cut': 'category',
        'color': 'category',
        'clarity': 'category',
    },
)
# Note: when grouping by a 'category' column, pass observed=True to groupby.
diamonds.dtypes

carat       float32
cut        category
color      category
clarity    category
depth       float32
table       float32
price         int16
dtype: object

In [15]:
# Series.memory_usage(deep=True) reports the bytes used by a Series; the
# `deep` flag makes it inspect object-dtype values as well.
diamonds.loc[:, 'price'].memory_usage(deep=True)

108008

In [18]:
# pd.ExcelWriter() saves a DataFrame to an Excel workbook.
# The writer is used as a context manager so the workbook is closed even if
# to_excel() raises; a manual close() call would be skipped in that case.
with pd.ExcelWriter(Path.cwd() / 'result_data' / 'beatles.xlsx', engine='openpyxl') as xlsx:
    # index=False leaves the row index out of the sheet.
    beatles.to_excel(xlsx, sheet_name='Sheet1', index=False)

In [22]:
# read_csv can read a zipped CSV directly when the CSV is the only file
# inside the zip archive.
# Column names for parse_dates have to be passed inside a list.
# NOTE(review): the recorded output shows a warning raised on the read_csv
# line; its message was not captured - confirm what it is.
autos = pd.read_csv(
    p1 / 'vehicles.csv.zip',
    parse_dates=['modifiedOn'],
)
autos.loc[:, 'modifiedOn']

  autos = pd.read_csv(p1 / 'vehicles.csv.zip', parse_dates=['modifiedOn'])


0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
39096   2013-01-01
39097   2013-01-01
39098   2013-01-01
39099   2013-01-01
39100   2013-01-01
Name: modifiedOn, Length: 39101, dtype: datetime64[ns]

In [28]:
# zipfile.ZipFile() opens a zip archive that holds more than one file.
# Both the archive and the member opened from it are managed by the `with`
# statement, so each handle is closed as soon as the CSV has been parsed.
with ZipFile(p1 / 'kaggle-survey-2018.zip') as file, \
        file.open('multipleChoiceResponses.csv') as member:
    # Hand pd.read_csv() the open member we want to read from the archive.
    kag = pd.read_csv(member)
# Drop the first row of the parsed frame.
# NOTE(review): presumably that row holds the question text rather than a
# response - confirm against the raw file.
survey = kag.iloc[1:]
survey.head()

  kag = pd.read_csv(file.open('multipleChoiceResponses.csv'))


Unnamed: 0,Time from Start to Finish (seconds),Q1,Q1_OTHER_TEXT,Q2,Q3,Q4,Q5,Q6,Q6_OTHER_TEXT,Q7,...,Q49_OTHER_TEXT,Q50_Part_1,Q50_Part_2,Q50_Part_3,Q50_Part_4,Q50_Part_5,Q50_Part_6,Q50_Part_7,Q50_Part_8,Q50_OTHER_TEXT
1,710,Female,-1,45-49,United States of America,Doctoral degree,Other,Consultant,-1,Other,...,-1,,,,,,,,,-1
2,434,Male,-1,30-34,Indonesia,Bachelor’s degree,Engineering (non-computer focused),Other,0,Manufacturing/Fabrication,...,-1,,,,,,,,,-1
3,718,Female,-1,30-34,United States of America,Master’s degree,"Computer science (software engineering, etc.)",Data Scientist,-1,I am a student,...,-1,,Too time-consuming,,,,,,,-1
4,621,Male,-1,35-39,United States of America,Master’s degree,"Social sciences (anthropology, psychology, soc...",Not employed,-1,,...,-1,,,Requires too much technical knowledge,,Not enough incentives to share my work,,,,-1
5,731,Male,-1,22-24,India,Master’s degree,Mathematics or statistics,Data Analyst,-1,I am a student,...,-1,,Too time-consuming,,,Not enough incentives to share my work,,,,-1


In [36]:
# to_json() can lay the data out in several ways via `orient`:
# 'columns' (the DataFrame default) nests values by column and then index,
# while 'records' emits a list with one object per row.
(
    beatles.to_json(orient='columns'),
    beatles.to_json(orient='records'),
)

('{"first":{"0":"Paul","1":"John","2":"Richard","3":"George"},"last":{"0":"McCartney","1":"Lennon","2":"Starkey","3":"Harrison"},"birth":{"0":1942,"1":1940,"2":1940,"3":1943}}',
 '[{"first":"Paul","last":"McCartney","birth":1942},{"first":"John","last":"Lennon","birth":1940},{"first":"Richard","last":"Starkey","birth":1940},{"first":"George","last":"Harrison","birth":1943}]')

In [51]:
# For the Statiz site, pd.read_html() can be used instead of BeautifulSoup:
# it returns the tables found on the page, and the first one is kept.
statiz = pd.read_html('http://www.statiz.co.kr/stat.php?re=0&lr=')[0]
# Drop the top level of the column labels, then filter out rows whose '순'
# value is the header text '순'. The result is only displayed, not assigned.
statiz.droplevel(level=0, axis=1).query("순 != '순'")

Unnamed: 0,순,이름,팀,WAR*,G,타석,타수,득점,안타,2타,...,희타,희비,타율,출루,장타,OPS,wOBA,wRC+,WAR*.1,WPA
0,1,이정후,22키CF,5.77,92,401,348,48,118,21,...,0,2,0.339,0.421,0.566,0.988,0.435,180.0,5.77,4.73
1,2,나성범,22KRF,4.82,90,408,343,61,109,28,...,0,5,0.318,0.414,0.542,0.957,0.428,170.9,4.82,2.79
2,3,피렐라,22삼LF,4.35,89,394,350,65,117,18,...,0,0,0.334,0.409,0.549,0.957,0.426,165.6,4.35,1.73
3,4,소크라테스,22KCF,3.94,76,328,304,54,101,20,...,0,1,0.332,0.378,0.546,0.924,0.411,161.0,3.94,1.18
4,5,최정,22S3B,3.49,78,314,257,56,74,11,...,0,4,0.288,0.405,0.482,0.887,0.404,149.9,3.49,1.99
5,6,김현수,22LLF,3.49,89,382,338,57,97,17,...,0,0,0.287,0.368,0.512,0.879,0.391,153.0,3.49,2.1
6,7,최지훈,22SCF,3.46,92,407,356,65,109,22,...,9,3,0.306,0.372,0.433,0.804,0.363,127.2,3.46,1.65
7,8,박성한,22SSS,3.4,89,359,315,41,102,12,...,3,1,0.324,0.399,0.4,0.799,0.372,133.7,3.4,1.91
8,9,오지환,22LSS,3.27,90,363,321,47,82,11,...,3,2,0.255,0.331,0.461,0.792,0.353,126.1,3.27,1.52
9,10,한동희,22롯3B,3.19,80,315,288,31,93,21,...,0,2,0.323,0.375,0.51,0.885,0.399,149.4,3.19,1.67
