In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the mobile-phone dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("mobiles.csv")
# Preview the first two rows to check the column layout.
df.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


### Q1.

In [4]:
# Q1: upper outlier cutoff for "sales", defined as mean + 2 standard deviations.
stat_mean = df["sales"].mean()
stat_std  = df["sales"].std()
stat_out  = stat_mean + 2 * stat_std
stat_out

146.55150129273215

In [6]:
# Keep only the rows whose "sales" exceed the outlier cutoff stat_out.
df_q1 = df[df["sales"] > stat_out]
df_q1.shape

(16, 11)

In [None]:
# Composite spec index: ROM / 32 + RAM / 2 + total camera count
# + battery_capacity / 1000.
# `assign` returns a new frame instead of writing a column into a filtered
# slice of `df`, which avoids pandas' SettingWithCopyWarning and keeps the
# cell safe to re-run.
df_q1 = df_q1.assign(
    idx = (df_q1["ROM"] / 32)
        + (df_q1["RAM"] / 2)
        + (df_q1["num_front_camera"] + df_q1["num_rear_camera"])
        + (df_q1["battery_capacity"] / 1000)
)

In [9]:
# Q1 answer: mean of the composite index over the outlier rows, rounded to 2 decimals.
round(df_q1["idx"].mean(), 2)

11.01

### Q2.

In [12]:
# Q2: inspect the distinct rear-camera counts before filtering.
df["num_rear_camera"].unique()

array([1, 2, 3, 4], dtype=int64)

In [None]:
# Rows with a rear-camera count other than 1; columns from "battery_capacity"
# through the last column (label-based slice, so it depends on column order).
df_q2 = df.loc[df["num_rear_camera"].ne(1), "battery_capacity":]
df_q2.head(1)

In [18]:
# Pairwise correlation matrix of the selected columns.
df_corr = df_q2.corr()
# Confirm the result type before indexing it by column name.
type(df_corr)

pandas.core.frame.DataFrame

In [23]:
# Absolute correlation of every column with "sales", rounded to 2 decimals.
df_corr["sales"].abs().round(2) # largest value other than "sales" itself: 0.95

In [25]:
# Q2 answer: strongest absolute correlation with "sales", ignoring the
# trivial self-correlation entry.
df_corr.loc[df_corr.index != "sales", "sales"].abs().round(2).max()

0.95

### Q3.

`pd.get_dummies()`는 다른 메서드/함수/클래스와 다르게 "columns" 인자에 단일 값을 할당하는 경우에도 반드시 🌟**리스트**🌟 객체를 사용하여 할당해야 한다. 단순 문자열을 할당할 경우 에러가 난다.

그리고 원핫인코딩을 실시하면 더미변수명에 띄어쓰기가 포함될 수 있다. `statsmodels` 라이브러리 기반 모델링에서 formula를 사용하는 경우, 변수명의 띄어쓰기를 제거하지 않은 채로 formula를 작성하면 에러가 발생한다. 이전 시험에서 응시자가 이 문제로 어려움을 겪은 사례가 있다.  
※ 다음 코드의 결과에서는 "screen_size_Very Large" 변수명에 띄어쓰기가 포함되어 있다.  
※ formula를 사용하려면 변수명을 변경해야 한다(예: "screen_size_Very Large" -> "screen_size_Very_Large").

In [None]:
# df_q3_dum = pd.get_dummies(df, columns = ["screen_size"]) # exam version
# One-hot encode "screen_size"; `columns` is passed as a list, and
# dtype = "int" requests integer indicator columns.
df_q3_dum = pd.get_dummies(df, columns = ["screen_size"], dtype = "int")
df_q3_dum.head(2)

In [None]:
# Check the frame's dimensions after one-hot encoding.
df_q3_dum.shape

In [None]:
# Move the target "sales" to column 0 and keep the other columns in their
# existing order; the scaled arrays are split into y = [:, 0] and X = [:, 1:].
df_q3_dum = df_q3_dum[["sales", *df_q3_dum.columns.drop("sales")]].reset_index(drop = True)
df_q3_dum.head(1)

In [35]:
# 80/20 train/test split with a fixed random_state so the split is reproducible.
df_train, df_test = train_test_split(df_q3_dum, train_size = 0.8, random_state = 123)
df_train.shape[0], df_test.shape[0]

(344, 86)

In [36]:
# Learn the min-max ranges from the training split only, then apply that same
# scaling to both splits (every column, including the target "sales").
model_nor = MinMaxScaler()
model_nor.fit(df_train)
arr_train_nor, arr_test_nor = (model_nor.transform(part) for part in (df_train, df_test))

In [None]:
# Peek at the first scaled training row.
arr_train_nor[:1, ]

In [None]:
# Baseline k-NN regressor with k = 3: column 0 of the scaled arrays is the
# target, the remaining columns are the features.
k = 3
model_knn = KNeighborsRegressor(n_neighbors = k).fit(X = arr_train_nor[:, 1:],
                                                     y = arr_train_nor[:, 0])
pred = model_knn.predict(arr_test_nor[:, 1:])
# RMSE on the test split, measured on the min-max scaled target values.
mean_squared_error(arr_test_nor[:, 0], pred) ** 0.5

In [46]:
# Compare the test RMSE (on the scaled target) across candidate neighbour counts.
ls_k = [3, 5, 7, 9, 11]
ls_rmse = []
for k in ls_k:
    model_knn = KNeighborsRegressor(n_neighbors = k)
    model_knn.fit(X = arr_train_nor[:, 1:],
                  y = arr_train_nor[:, 0])
    pred = model_knn.predict(arr_test_nor[:, 1:])
    val_rmse = mean_squared_error(y_true = arr_test_nor[:, 0], y_pred = pred) ** 0.5
    # Append in place instead of rebuilding the list on every iteration.
    ls_rmse.append(val_rmse)

In [50]:
ls_rmse # the first entry (k = 3) is the smallest RMSE

[0.08186677375964535,
 0.09879109824384892,
 0.107669855645971,
 0.11232111394853059,
 0.1136902366621185]

In [48]:
# Index the RMSE values by their k so the best k can be looked up by label.
ser_rmse = pd.Series(ls_rmse, index = ls_k)
ser_rmse # k = 3 has the lowest RMSE

3     0.081867
5     0.098791
7     0.107670
9     0.112321
11    0.113690
dtype: float64

In [49]:
# Q3 answer: the k whose RMSE is smallest.
best_k = ser_rmse.idxmin()
best_k

3

### Q3. \[추가 지시사항\] 다음은 저번달에 신규 출시된 경쟁사의 스마트폰 정보이다. 해당 스마트폰의 판매지수는 얼마로 예상되는가?  
> **단계 1)** 주어진 데이터는 기존 원핫인코딩 규칙을 기반으로 더미변수를 생성하시오.  
> **단계 2)** 기존 정규화 규칙을 기반으로 주어진 데이터를 정규화 하시오.  
> **단계 3)** 기존에 정제한 학습 데이터 세트와, 직전에 최적이라고 판단한 k값을 이웃 개수로 사용하여 k-NN 모델을 준비하시오.  
> **단계 4)** 준비된 k-NN 모델에 "단계 2"에서 산출한 데이터 세트를 입력하고 그 결과를 확인하시오.  
> **단계 5)** "단계 4"의 결과물을 기존 정규화 규칙을 기반으로 역변환 하시오.  

※ 정답은 반올림하여 소수 첫째 자리까지 출력하시오.  
(정답 예시: 0.1)
* ROM: 256
* RAM: 6
* num_rear_camera: 4
* num_front_camera: 1
* battery_capacity: 4000
* ratings: 4.3
* num_of_ratings: 25000
* sales_price: 85000
* discount_percent: 0.05
* screen_size: "Large"


In [54]:
# Build the one-row input for the new competitor phone.
# A test row is borrowed only as a template, so the frame has exactly the
# columns (and column order) that the fitted scaler expects. Every feature is
# then set explicitly, so nothing depends on the values the template row
# happens to contain.
df_t1 = df_test.head(1).reset_index(drop = True)
df_t1["ROM"] = 256
df_t1["RAM"] = 6
df_t1["num_rear_camera"] = 4
df_t1["num_front_camera"] = 1
df_t1["battery_capacity"] = 4000
df_t1["ratings"] = 4.3
df_t1["num_of_ratings"] = 25000
df_t1["sales_price"] = 85000
df_t1["discount_percent"] = 0.05

# One-hot encoding of screen_size = "Large": clear every dummy column first,
# then switch on the matching level.
ls_dummy = [col for col in df_t1.columns if col.startswith("screen_size_")]
df_t1[ls_dummy] = 0
df_t1["screen_size_Large"] = 1

# NOTE: "sales" keeps the template row's value. It is only a placeholder: the
# prediction uses the feature columns, and the slot is overwritten with the
# model output before the inverse transform.

In [59]:
# Inspect the assembled input row before scaling.
df_t1

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,5.9,256,6,4,1,4000,4.3,25000,85000,0.05,1,0,0,0,0


In [56]:
# Counter-example: fitting a fresh scaler on this single row maps every column
# to 0, so the scaler fitted on the training data (model_nor) must be reused.
MinMaxScaler().fit_transform(df_t1)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [64]:
# model_nor.transform(df_t1.drop(columns = "sales"))
# ^ not used: model_nor was fitted on every column including "sales", so the
#   input row is passed with the same full set of columns.
arr_t1_nor = model_nor.transform(df_t1)
arr_t1_nor

array([[0.0119438 , 0.49206349, 0.45454545, 1.        , 0.        ,
        0.42307692, 0.625     , 0.05308122, 0.51815842, 0.09302326,
        1.        , 0.        , 0.        , 0.        , 0.        ]])

In [69]:
# Refit k-NN on the scaled training data with the selected best_k, then
# predict the scaled target of the new phone from its feature columns.
model_knn_b = KNeighborsRegressor(n_neighbors = best_k).fit(X = arr_train_nor[:, 1:],
                                                            y = arr_train_nor[:, 0])
pred_t1 = model_knn_b.predict(arr_t1_nor[:, 1:])
pred_t1

array([0.00132259])

In [None]:
# model_nor.inverse_transform(pred_t1)
# model_nor.inverse_transform([pred_t1])
# ^ not usable: the scaler was fitted on every column, so it cannot invert the
#   one-element prediction on its own.

# Put the scaled prediction into the target slot (column 0) of the scaled row
# and invert the whole row. pred_t1 is a one-element array, so its scalar is
# taken explicitly with [0]: assigning a size-1 array to a single element is
# deprecated since NumPy 1.25.
arr_t1_nor[0, 0] = pred_t1[0]
arr_t1_inv = model_nor.inverse_transform(arr_t1_nor)
arr_t1_inv

In [76]:
# Final answer: the inverse-transformed value in the target slot, rounded to 1 decimal.
round(arr_t1_inv[0, 0], 1)

0.7

In [77]:
# Label the inverse-transformed row with the column names for readability.
df_t1_inv = pd.DataFrame(arr_t1_inv, columns = df_test.columns)
df_t1_inv

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,0.653333,256.0,6.0,4.0,1.0,4000.0,4.3,25000.0,85000.0,0.05,1.0,0.0,0.0,0.0,0.0


In [78]:
# Show the input row again for comparison; its "sales" value is the
# placeholder inherited from the template test row, not a prediction.
df_t1

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,5.9,256,6,4,1,4000,4.3,25000,85000,0.05,1,0,0,0,0
