In [1]:
import pandas as pd

In [2]:
# Column names supplied by hand because the file is read with header=None.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# '::' is a multi-character separator, so the python parsing engine is requested explicitly.
users = pd.read_table(
    'users.dat',
    sep='::',
    engine='python',
    header=None,
    names=unames,
    index_col='user_id',
)

In [3]:
# Column names supplied by hand because the file is read with header=None.
mnames = ['movie_id', 'title', 'genres']
# Same '::'-separated layout as users.dat; rows are indexed by movie_id.
movies = pd.read_table(
    'movies.dat',
    sep='::',
    engine='python',
    header=None,
    names=mnames,
    index_col='movie_id',
)

In [4]:
# Comma-separated file with its own header row; the 'id' column becomes the index.
ratings = pd.read_csv('ratings.csv', index_col='id')

# assign

assign은 원본 데이터를 건드리지 않고, 새로운 컬럼이 추가된 데이터 프레임을 새로 만들어 돌려주는 함수예요.  
assign(column_name = lambda 함수)의 형식으로 주로 씁니다.  
lambda 함수가 받는 인자는 그 데이터 프레임 전체예요. 그걸로 어떤 계산이든 해서 컬럼 하나(Series)만 돌려주면 돼요.

In [5]:
# Titles in the rows shown end with "(YYYY)", so [-5:-1] keeps the four year
# characters and drops the closing parenthesis. The year stays a string here.
movies.assign(
    year=lambda frame: frame['title'].map(lambda title: title[-5:-1])
).head()

Unnamed: 0_level_0,title,genres,year
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Animation|Children's|Comedy,1995
2,Jumanji (1995),Adventure|Children's|Fantasy,1995
3,Grumpier Old Men (1995),Comedy|Romance,1995
4,Waiting to Exhale (1995),Comedy|Drama,1995
5,Father of the Bride Part II (1995),Comedy,1995


method chaining으로 여러번 쓸 수 있어요.

In [6]:
# Two chained assign calls: each one returns a new frame that the next call receives.
(
    movies
    .assign(year=lambda frame: frame['title'].map(lambda title: title[-5:-1]))
    # genres is a '|'-delimited string; count its pieces.
    .assign(num_genres=lambda frame: frame['genres'].map(lambda genres: len(genres.split('|'))))
    .head()
)

Unnamed: 0_level_0,title,genres,year,num_genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Animation|Children's|Comedy,1995,3
2,Jumanji (1995),Adventure|Children's|Fantasy,1995,3
3,Grumpier Old Men (1995),Comedy|Romance,1995,2
4,Waiting to Exhale (1995),Comedy|Drama,1995,2
5,Father of the Bride Part II (1995),Comedy,1995,1


movies는 전혀 변하지 않았죠.

In [7]:
# Show the first five rows of the original frame to confirm it has no new columns.
movies.head(n=5)

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


의미있는 단계마다 변수를 만들어 저장해두면 좋아요.

In [8]:
# Store the enriched frame under its own name so later cells can build on it;
# `movies` itself is left as loaded.
movies_added = movies.assign(
    # Characters [-5:-1] of the title, kept as a string.
    year=lambda frame: frame['title'].map(lambda title: title[-5:-1]),
).assign(
    # Number of '|'-separated entries in the genres string.
    num_genres=lambda frame: frame['genres'].map(lambda genres: len(genres.split('|'))),
)
movies_added.head()

Unnamed: 0_level_0,title,genres,year,num_genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Animation|Children's|Comedy,1995,3
2,Jumanji (1995),Adventure|Children's|Fantasy,1995,3
3,Grumpier Old Men (1995),Comedy|Romance,1995,2
4,Waiting to Exhale (1995),Comedy|Drama,1995,2
5,Father of the Bride Part II (1995),Comedy,1995,1


컬럼 이름을 기존 컬럼과 똑같이 쓰면, 돌려받는 데이터 프레임에서 그 컬럼이 덮어써집니다(원본 데이터 프레임은 그대로예요). 하지만 기존 컬럼의 값이 바뀌는 것이니 꼭 필요할 때만 쓰세요.

movies_added의 year를 string에서 int로 바꿔보겠습니다.

In [9]:
# Reusing the name 'year' replaces that column in the returned frame.
# The result is only displayed, not stored: movies_added keeps its string 'year'.
movies_added.assign(
    year=lambda frame: frame['year'].astype(int)
).head()

Unnamed: 0_level_0,title,genres,year,num_genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Animation|Children's|Comedy,1995,3
2,Jumanji (1995),Adventure|Children's|Fantasy,1995,3
3,Grumpier Old Men (1995),Comedy|Romance,1995,2
4,Waiting to Exhale (1995),Comedy|Drama,1995,2
5,Father of the Bride Part II (1995),Comedy,1995,1


이제 year의 중앙값을 구할 수 있겠네요.

In [10]:
# The int cast in the previous cell was displayed but never assigned, so
# movies_added['year'] still holds strings. Cast explicitly before taking the
# median; recent pandas refuses to compute a median over string data.
movies_added['year'].astype(int).median()

1994.0

# Q

Q. ratings에서 rating 컬럼을 1~5점에서 0~100점으로 변환해서 'rating_100'이라는 컬럼에 넣어보세요.
> hint : (x.rating - 1) * 25

In [11]:
# A0 = 
### BEGIN SOLUTION
# Linear rescale of the 1..5 rating: 1 maps to 0 and 5 maps to 100.
A0 = ratings.assign(rating_100=lambda frame: (frame['rating'] - 1) * 25)
### END SOLUTION

In [12]:
# Checks for A0: the new column exists and its values span exactly 0..100.
assert 'rating_100' in A0.columns, "A0 is missing the 'rating_100' column"
assert A0.rating_100.max() == 100, "rating_100 maximum should be 100"
assert A0.rating_100.min() == 0, "rating_100 minimum should be 0"
# The mean is a float, so compare with a tolerance: exact equality can fail
# on rounding error alone even when the answer is right.
assert abs(A0.rating_100.mean() - 64.6) < 1e-6, "rating_100 mean should be 64.6"