# 할당과 복사

In [1]:
# Bind the name my_list1 to a new list object holding 1, 2, 3.
my_list1 = [1, 2, 3]

my_list1은 1, 2, 3 원소를 가진 리스트([1, 2, 3])를 참조하고 있다.

In [2]:
# Plain assignment copies the reference, not the list:
# both names now refer to the same list object.
my_list2 = my_list1

my_list2는 my_list1이 참조하고 있는 리스트를 똑같이 참조한다. (리스트가 복사되는 것이 아니라 같은 객체를 가리킨다.)

In [3]:
# Both names print the same contents because they refer to one list.
print(my_list1)
print(my_list2)

[1, 2, 3]
[1, 2, 3]


my_list1과 my_list2는 같은 리스트 객체를 참조하기 때문에, 한쪽 이름으로 리스트를 변경하면 다른 쪽 이름으로 보아도 변경된 것처럼 보인다.

In [4]:
# Mutate the list through my_list2; my_list1 refers to the same object,
# so the change is visible through it as well.
my_list2.append(4)

In [5]:
# The appended 4 shows up under both names.
print(my_list1)
print(my_list2)

[1, 2, 3, 4]
[1, 2, 3, 4]


## 복사
* copy() 메서드를 사용
* slice [ : ] 를 사용
* list() 를 이용해 새로운 리스트를 직접 생성

In [6]:
# Source list that the copy examples below duplicate.
my_list_original = [10, 20, 30]

In [7]:
# 1. Copy the list with the list.copy() method.
my_list_copy = my_list_original.copy()
# Appending to the copy leaves my_list_original unchanged.
my_list_copy.append(4)

print(my_list_original)
print(my_list_copy)

[10, 20, 30]
[10, 20, 30, 4]


In [8]:
# 2. Copy the list with a full slice; [:] builds a new list with the same elements.
my_list_slice = my_list_original[:]
# Appending to the sliced copy leaves my_list_original unchanged.
my_list_slice.append(40)

print(my_list_original)
print(my_list_slice)

[10, 20, 30]
[10, 20, 30, 40]


# 딕셔너리
딕셔너리를 다루는 방법은 데이터 분석에 사용하는 pandas의 사용법과 매우 유사하다.

In [9]:
# Dictionary literal mapping string keys to values.
my_dict = {
    "name": "소민호",
    "age" : 33
}

print(my_dict)

{'name': '소민호', 'age': 33}


In [10]:
# Look up the value stored under the key 'name'.
print(my_dict['name'])

소민호


In [11]:
# Look up the value stored under the key 'age'.
print(my_dict['age'])

33


문법상 딕셔너리의 데이터 추가와 수정은 똑같다.

없으면 새로 만들고, 있으면 수정된다.

In [12]:
# 'email' is not yet a key, so this assignment adds a new entry.
my_dict['email'] = 'mhso.dev@gmail.com'
print(my_dict)

{'name': '소민호', 'age': 33, 'email': 'mhso.dev@gmail.com'}


In [13]:
# 'age' already exists, so the same assignment syntax overwrites its value.
my_dict['age'] = 34
print(my_dict)

{'name': '소민호', 'age': 34, 'email': 'mhso.dev@gmail.com'}


딕셔너리의 결합

In [14]:
# The two dicts share no keys.
dict_a = { 'a': 1, 'b': 2}
dict_b = { 'c': 3, 'd': 4}

# update() merges dict_b's entries into dict_a in place.
dict_a.update(dict_b)

print(dict_a)

{'a': 1, 'b': 2, 'c': 3, 'd': 4}


In [15]:
# Both dicts contain the key 'b'.
dict_a = { 'a': 1, 'b': 2}
dict_b = { 'b': 3, 'd': 4}

# On a shared key, update() overwrites dict_a's value with dict_b's ('b': 2 -> 3).
dict_a.update(dict_b)

print(dict_a)

{'a': 1, 'b': 3, 'd': 4}


In [16]:
# Rebind my_dict to a fresh two-entry dictionary for the examples below.
my_dict = { 
    'a': 10,
    'b': 20
}

In [17]:
'a' in my_dict # membership test against the keys: 'a' is a key of my_dict

True

In [18]:
# 'c' is not a key of my_dict, so the membership test is False.
'c' in my_dict

False

In [19]:
# Indexing with an existing key returns its value.
my_dict['a']

10

In [20]:
# Indexing with a missing key raises KeyError (demonstrated on purpose).
my_dict['c']

KeyError: 'c'

In [21]:
# View of the dictionary's keys.
my_dict.keys()

dict_keys(['a', 'b'])

In [22]:
# View of the dictionary's values.
my_dict.values()

dict_values([10, 20])

In [23]:
# View of the dictionary's (key, value) pairs.
my_dict.items()

dict_items([('a', 10), ('b', 20)])

In [24]:
# Delete the entry with key 'a' from the dictionary (the dict itself is kept).
del my_dict['a']
my_dict

{'b': 20}

In [25]:
# Remove every entry; my_dict itself remains as an empty dict.
my_dict.clear()

In [26]:
# Display my_dict after clear(): it is now empty.
my_dict

{}

# Set(집합)
* 중복된 자료의 저장 불가 ( 데이터의 unique 유지 )
* 데이터 저장의 순서가 없다. ( hash 알고리즘 )

In [27]:
# Duplicate values in a set literal are collapsed; only unique elements are kept.
set1 = {1,2,3,3,4,6,7,1,2,3,4,1,2,3,9,1,1,2,3}
print(set1)

{1, 2, 3, 4, 6, 7, 9}


In [28]:
my_list = [1, 5,1,2,3,4,1,20,1,2,3,1,2,3,5,4,6,7,8,9]
# Converting a list to a set drops its duplicate values.
set2 = set(my_list)
print(set2)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 20}


In [29]:
my_str = 'letter'
# set() of a string yields its unique characters;
# the repeated 'e' and 't' in 'letter' appear only once.
set3 = set(my_str)
print(set3)

{'l', 'e', 'r', 't'}


In [30]:
# Convert the set back to a list; the element order follows the set's
# iteration order, not the character order of the original string.
list(set3)

['l', 'e', 'r', 't']

데이터 분석을 하기 위한 자료구조

* 통계적, 수학적 분석 ( 보통 pandas )
* 머신러닝이나 딥러닝 분석 ( tensor 개념 )

In [32]:
# A list of lists used as a 2-dimensional table: two rows of three values.
list_2dim = [[10, 20, 30],
             [40, 50, 60]]

print(list_2dim)

[[10, 20, 30], [40, 50, 60]]


In [33]:
# Index 1 selects the second inner list (row).
list_2dim[1]

[40, 50, 60]

In [34]:
# Chained indexing: row 1 first, then element 1 within that row.
list_2dim[1][1]

50

# 판다스 사용하기
* 설치 : pip install pandas

In [35]:
import pandas as pd
import numpy as np

In [36]:
# Build a Series from a list; np.nan marks a missing value, and the
# resulting Series is displayed with dtype float64.
s = pd.Series([1, 2 ,3, np.nan, 6, 7])
s

0    1.0
1    2.0
2    3.0
3    NaN
4    6.0
5    7.0
dtype: float64

In [38]:
# DatetimeIndex of 10 consecutive days starting at 2020-08-28 (daily frequency).
dates = pd.date_range('20200828', periods=10)
dates

DatetimeIndex(['2020-08-28', '2020-08-29', '2020-08-30', '2020-08-31',
               '2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04',
               '2020-09-05', '2020-09-06'],
              dtype='datetime64[ns]', freq='D')

In [39]:
# 5x4 DataFrame of random floats with row labels 1-5 and column labels A-D.
# No seed is set in the visible cells, so the values differ on every run.
df = pd.DataFrame( np.random.rand(5, 4),
                   index = [1, 2, 3, 4, 5],
                   columns = ['A','B','C','D'] )

df

Unnamed: 0,A,B,C,D
1,0.26231,0.909602,0.211128,0.156121
2,0.021702,0.55097,0.298729,0.177286
3,0.147116,0.32199,0.166857,0.548681
4,0.741914,0.339404,0.604417,0.000643
5,0.097113,0.040754,0.501769,0.355234


In [40]:
# Select a single column by name; the result is a Series.
df['A']

1    0.262310
2    0.021702
3    0.147116
4    0.741914
5    0.097113
Name: A, dtype: float64

In [41]:
# Passing a list of column names selects several columns as a DataFrame.
df[['A','C']]

Unnamed: 0,A,C
1,0.26231,0.211128
2,0.021702,0.298729
3,0.147116,0.166857
4,0.741914,0.604417
5,0.097113,0.501769


In [42]:
# The list of column names can be held in a variable and reused as the selector.
column_names = ['A', 'D']
df[column_names]

Unnamed: 0,A,D
1,0.26231,0.156121
2,0.021702,0.177286
3,0.147116,0.548681
4,0.741914,0.000643
5,0.097113,0.355234


In [43]:
# .loc with a single index label returns that row (label 3) as a Series.
df.loc[3]

A    0.147116
B    0.321990
C    0.166857
D    0.548681
Name: 3, dtype: float64

In [44]:
# .loc[rows, columns]: ':' keeps every row, the list picks columns A and C.
df.loc[:, ['A', 'C']]

Unnamed: 0,A,C
1,0.26231,0.211128
2,0.021702,0.298729
3,0.147116,0.166857
4,0.741914,0.604417
5,0.097113,0.501769


In [45]:
# A label slice in .loc includes both endpoints, so 1:3 returns rows 1, 2 and 3.
df.loc[1:3, ['B', 'D']]

Unnamed: 0,B,D
1,0.909602,0.156121
2,0.55097,0.177286
3,0.32199,0.548681


In [46]:
# Element-wise comparison produces a boolean Series (one True/False per row).
df['A'] > 0.5

1    False
2    False
3    False
4     True
5    False
Name: A, dtype: bool

In [47]:
# Boolean indexing: keep only the rows where column A exceeds 0.3.
df[ df['A'] > 0.3 ]

Unnamed: 0,A,B,C,D
4,0.741914,0.339404,0.604417,0.000643


In [48]:
# A DataFrame-wide boolean mask keeps the original shape;
# values that are not greater than 0.3 become missing (NaN).
df[ df > 0.3 ]

Unnamed: 0,A,B,C,D
1,,0.909602,,
2,,0.55097,,
3,,0.32199,,0.548681
4,0.741914,0.339404,0.604417,
5,,,0.501769,0.355234


B 컬럼의 값이 0.3보다 큰 행에서 A, D 컬럼만 뽑아주세요.

In [50]:
# Rows where column B exceeds 0.3, restricted to columns A and D.
df.loc[df['B'] > 0.3, ['A','D']]

Unnamed: 0,A,D
1,0.26231,0.156121
2,0.021702,0.177286
3,0.147116,0.548681
4,0.741914,0.000643
