In [1]:
import pandas as pd
import numpy as np

## 생존여부 예측모델 만들기 - 타이타닉

### 학습용 데이터 (X_train, y_train)을 이용하여 생존 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측값을 다음과 같은 형식의 CSV파일로 생성하시오(제출한 모델의 성능은 accuracy 평가지표에 따라 채점)

### 데이터 불러오기 

In [15]:
train = pd.read_csv('./data/Titanic_train.csv')

In [16]:
X_train = train.drop('Survived',axis=1)
y_train = train['Survived']

X_test = pd.read_csv('./data/Titanic_test.csv')

In [17]:
X_train.shape, y_train.shape, X_test.shape

((891, 11), (891,), (418, 11))

### 결측치 처리

In [18]:
X_train.info() # cabin 삭제, age는 평균값으로 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [19]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [20]:
X_train.drop('Cabin',axis=1, inplace=True)
X_test.drop('Cabin',axis=1, inplace=True)

In [21]:
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].mean())
X_test['Age'] = X_test['Age'].fillna(X_train['Age'].mean())

In [22]:
X_train['Age'].isnull().sum() , X_test['Age'].isnull().sum()

(0, 0)

In [23]:
# test데이터에 fare 1개의 결측치 존재
X_test['Fare'] = X_test['Fare'].fillna(X_train['Fare'].mean())

### 범주형 변수 인코딩

In [24]:
X_train.head(7)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S
5,6,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,Q
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,S


In [25]:
col_cat = ['Sex','Embarked']
col_num = ['Age','Fare']
col_drop = ['PassengerId','Name','Ticket']

In [26]:
# 버릴 컬럼 버리기
X_train.drop(col_drop,axis=1,inplace=True)
X_test.drop(col_drop,axis=1,inplace=True)

In [27]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

for i in col_cat:
    X_train[i] = encoder.fit_transform(X_train[i]) #레이블인코딩 이 형식으로 해라 바보야 
    X_test[i] = encoder.transform(X_test[i])

### 검증용 데이터 나누기 

In [13]:
from sklearn.model_selection import train_test_split

X_tr,X_val,y_tr,y_val = train_test_split(X_train,y_train, test_size = 0.2)

### 수치형 스케일링

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler() 

X_tr[col_num] = scaler.fit_transform(X_tr[col_num]) # 수치형 스케일링은 이런식으로
X_val[col_num] = scaler.transform(X_val[col_num])
X_test[col_num] = scaler.transform(X_test[col_num])

### 모델 만들기

In [40]:
from sklearn.ensemble import RandomForestClassifier

tree = RandomForestClassifier(n_estimators=200,max_depth=7,max_features=5)

from sklearn.tree import DecisionTreeClassifier

tree2 = DecisionTreeClassifier()

from sklearn.linear_model import LogisticRegression
lo = LogisticRegression()

In [41]:
from sklearn.model_selection import cross_val_score

print(np.mean(cross_val_score(tree,X_train,y_train,scoring='accuracy',cv=5)))
print(np.mean(cross_val_score(tree2,X_train,y_train,scoring='accuracy',cv=5)))

0.832803967108154
0.7676981984809491


In [17]:
tree.fit(X_train,y_train)
pre = tree.predict(X_val)

### 모델 성능 평가

In [20]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_val,pre)

In [21]:
score

0.7988826815642458

### 답 예측하기

In [184]:
y = tree.predict(X_test)

In [194]:
pd.DataFrame(y,columns=['Serviver']).to_csv('./data/answer/titanic.csv')

## 당뇨병 여부 판단¶
이상치 처리 (Glucose, BloodPressure, SkinThickness, Insulin, BMI가 0인 값)

In [44]:
df = pd.read_csv('./data/diabetes.csv')

In [45]:
df.head(1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1


### Train,Test데이터 나누기

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X = df.drop('Outcome',axis = 1)
y = df['Outcome']

In [48]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=2021)  # 문제랑 같은데이터로 맞추기

In [49]:
X_train.shape, X_test.shape

((614, 8), (154, 8))

### 결측치 확인

In [50]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 147 to 116
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               614 non-null    int64  
 1   Glucose                   614 non-null    int64  
 2   BloodPressure             614 non-null    int64  
 3   SkinThickness             614 non-null    int64  
 4   Insulin                   614 non-null    int64  
 5   BMI                       614 non-null    float64
 6   DiabetesPedigreeFunction  614 non-null    float64
 7   Age                       614 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 43.2 KB


In [51]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 258 to 174
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               154 non-null    int64  
 1   Glucose                   154 non-null    int64  
 2   BloodPressure             154 non-null    int64  
 3   SkinThickness             154 non-null    int64  
 4   Insulin                   154 non-null    int64  
 5   BMI                       154 non-null    float64
 6   DiabetesPedigreeFunction  154 non-null    float64
 7   Age                       154 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 10.8 KB


### 이상치 처리
(Glucose, BloodPressure, SkinThickness, Insulin, BMI가 0인 값)

In [52]:
#train
print(X_train['Glucose'][X_train['Glucose'] == 0].count())
print(X_train['BloodPressure'][X_train['BloodPressure'] == 0].count())
print(X_train['SkinThickness'][X_train['SkinThickness'] == 0].count())
print(X_train['Insulin'][X_train['Insulin'] == 0].count())
print(X_train['BMI'][X_train['BMI'] == 0].count())

5
31
175
287
9


In [53]:
print(X_test['Glucose'][X_test['Glucose'] == 0].count())
print(X_test['BloodPressure'][X_test['BloodPressure'] == 0].count())
print(X_test['SkinThickness'][X_test['SkinThickness'] == 0].count())
print(X_test['Insulin'][X_test['Insulin'] == 0].count())
print(X_test['BMI'][X_test['BMI'] == 0].count())

0
4
52
87
2


In [54]:
# Glucose가 0인값은 삭제
X_train_drop = X_train[X_train['Glucose'] != 0]
y_train_drop = y_train[X_train['Glucose'] != 0] # y값도 꼭 같이 버려야해!!

In [55]:
# 0값을 평균값으로 대체(한번에 가능)
cols = ['BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
cols_mean = X_train_drop[cols].mean()
X_train_drop[cols] = X_train_drop[cols].replace(0, cols_mean)

cols = ['BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
cols_mean = X_train_drop[cols].mean()
X_test[cols] = X_test[cols].replace(0, cols_mean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_drop[cols] = X_train_drop[cols].replace(0, cols_mean)


### 범주형 인코딩 

In [56]:
# 할거없음
X_train_drop.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
147,2,106,64.0,35.0,119.0,30.5,1.4,34
344,8,95,72.0,20.91133,82.866995,36.8,0.485,57
390,1,100,66.0,29.0,196.0,32.0,0.444,42
150,1,136,74.0,50.0,204.0,37.4,0.399,24
132,3,170,64.0,37.0,225.0,34.5,0.356,30


### 수치형 스케일

In [61]:
col_num = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(X_train_drop[col_num])
X_train_drop[col_num] = scaler.transform(X_train_drop[col_num])

X_test[col_num] = scaler.transform(X_test[col_num])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_drop[col_num] = scaler.transform(X_train_drop[col_num])


### 모델 생성

In [62]:
from sklearn.ensemble import RandomForestClassifier

tree = RandomForestClassifier(n_estimators=200,max_depth=5,max_features=5)

In [67]:
from sklearn.model_selection import cross_val_score

print(np.mean(cross_val_score(tree,X_train_drop,y_train_drop,scoring='accuracy',cv=5)))

0.7651944181005284


In [64]:
tree.fit(X_train_drop,y_train_drop)

pre = tree.predict(X_test)

### 모델 평가

In [65]:
from sklearn.metrics import accuracy_score

In [66]:
score = accuracy_score(y_test,pre)
score

0.7727272727272727

In [49]:
pd.DataFrame({'cust_id': X_test.index, 'gender': pre}).to_csv('./data/answer/am.csv', index=False)

In [50]:
X_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
258,1,193,50.0,16.000000,375.000000,25.9,0.655,24
220,0,177,60.0,29.000000,478.000000,34.6,1.072,21
438,1,97,70.0,15.000000,121.374975,18.2,0.147,21
130,4,173,70.0,14.000000,168.000000,29.7,0.361,33
730,3,130,78.0,23.000000,79.000000,28.4,0.323,34
...,...,...,...,...,...,...,...,...
678,3,121,52.0,26.920333,121.374975,36.0,0.127,25
500,2,117,90.0,19.000000,71.000000,25.2,0.313,21
256,3,111,56.0,39.000000,121.374975,30.1,0.557,30
80,3,113,44.0,13.000000,121.374975,22.4,0.140,22


## 성인 인구조사 소득 예측

In [458]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("./data/adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

In [459]:
X_train.head(3)

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States


In [460]:
y_train['income'].value_counts()

<=50K    19756
>50K      6292
Name: income, dtype: int64

### 결측치 처리 

In [461]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 21851 to 25716
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26048 non-null  int64 
 1   age             26048 non-null  int64 
 2   workclass       24592 non-null  object
 3   fnlwgt          26048 non-null  int64 
 4   education       26048 non-null  object
 5   education.num   26048 non-null  int64 
 6   marital.status  26048 non-null  object
 7   occupation      24585 non-null  object
 8   relationship    26048 non-null  object
 9   race            26048 non-null  object
 10  sex             26048 non-null  object
 11  capital.gain    26048 non-null  int64 
 12  capital.loss    26048 non-null  int64 
 13  hours.per.week  26048 non-null  int64 
 14  native.country  25587 non-null  object
dtypes: int64(7), object(8)
memory usage: 3.2+ MB


In [462]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6513 entries, 20901 to 25782
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              6513 non-null   int64 
 1   age             6513 non-null   int64 
 2   workclass       6133 non-null   object
 3   fnlwgt          6513 non-null   int64 
 4   education       6513 non-null   object
 5   education.num   6513 non-null   int64 
 6   marital.status  6513 non-null   object
 7   occupation      6133 non-null   object
 8   relationship    6513 non-null   object
 9   race            6513 non-null   object
 10  sex             6513 non-null   object
 11  capital.gain    6513 non-null   int64 
 12  capital.loss    6513 non-null   int64 
 13  hours.per.week  6513 non-null   int64 
 14  native.country  6391 non-null   object
dtypes: int64(7), object(8)
memory usage: 814.1+ KB


In [463]:
# 문자데이터이므로 최빈값으로 처리해줌
nan =['workclass','occupation','native.country']

for i in nan:
    X_train[i] = X_train[i].fillna(X_train[i].mode()[0])
    X_test[i] = X_test[i].fillna(X_train[i].mode()[0])

### 범주형 인코딩

In [464]:
X_train.head(4)

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States
14121,14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States


In [465]:
col_cat = ['workclass','education','marital.status','marital.status','relationship','race','sex','native.country','occupation']
col_num = ['age','fnlwgt','hours.per.week', 'capital.loss','capital.gain']

In [466]:
from sklearn.preprocessing import LabelEncoder

scaler = LabelEncoder()

for i in col_cat:
    scaler.fit(X_train[i])
    X_train[i] = scaler.transform(X_train[i])
    X_test[i] = scaler.transform(X_test[i])

### 수치형 스케일링

In [467]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train[col_num])
X_train[col_num] = scaler.transform(X_train[col_num])
X_test[col_num] = scaler.transform(X_test[col_num])

### 필요없는 컬럼 삭제

In [468]:
X_train.drop('id',axis=1,inplace=True)
X_test2 = X_test.drop('id',axis=1)

### 모델 생성

In [469]:
y = []
for i in y_train['income']:
    if i == '>50K':
        y.append(1)
    else :
        y.append(0)
        
y = np.array(y)

In [470]:
from sklearn.ensemble import RandomForestClassifier

tree = RandomForestClassifier()
tree.fit(X_train,y)

RandomForestClassifier()

In [471]:
pre = tree.predict(X_test2)

### 모델 평가

In [472]:
y2 = []
for i in y_test['income']:
    if i == '>50K':
        y2.append(1)
    else :
        y2.append(0)
        
y2 = np.array(y2)

In [473]:
from sklearn.metrics import accuracy_score
score = accuracy_score(pre,y2)

score

0.8572086596038692

In [482]:
a = []
for i in y2:
    if i == 1 :
        a.append('>50K')
    else :
        a.append('<=50K')
        
pd.DataFrame({'id':X_test.id,'income':a}).to_csv('./data/answer/adult.csv',index=False)

## 바이러스 예측 

In [92]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("./data/insurance.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='charges')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 7), (268, 7), (1070, 2), (268, 2))

### 결측치 확인

In [93]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1070 entries, 209 to 1140
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1070 non-null   int64  
 1   age       1070 non-null   int64  
 2   sex       1070 non-null   object 
 3   bmi       1070 non-null   float64
 4   children  1070 non-null   int64  
 5   smoker    1070 non-null   object 
 6   region    1070 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 66.9+ KB


In [94]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 1088 to 116
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        268 non-null    int64  
 1   age       268 non-null    int64  
 2   sex       268 non-null    object 
 3   bmi       268 non-null    float64
 4   children  268 non-null    int64  
 5   smoker    268 non-null    object 
 6   region    268 non-null    object 
dtypes: float64(1), int64(3), object(3)
memory usage: 16.8+ KB


### 범주형 인코딩 

In [95]:
col_cat = X_train.select_dtypes('object').columns # 이렇게 범주형 뽑아낼 수 있음
col_num = ['age','bmi']

In [96]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
for i in col_cat:
    encoder.fit(X_train[i])
    X_train[i] = encoder.transform(X_train[i])
    X_test[i] = encoder.transform(X_test[i])

### 검증용 분리

In [97]:
from sklearn.model_selection import train_test_split
X_tr,X_val,y_tr,y_val = train_test_split(X_train,y_train,test_size = 0.2)

### 수치형 스케일링

In [98]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(X_tr[col_num])
X_tr[col_num] = scaler.transform(X_tr[col_num])
X_val[col_num] = scaler.transform(X_val[col_num])
X_test[col_num] = scaler.transform(X_test[col_num])

### 컬럼 삭제

In [99]:
X_tr.drop('id',axis=1,inplace = True)
X_val.drop('id',axis=1,inplace = True)
X_test2 = X_test.drop('id',axis=1)

In [100]:
y_tr = y_tr['charges']
y_val = y_val['charges']

### 모델선택

In [101]:
from sklearn.ensemble import RandomForestRegressor

tree = RandomForestRegressor()

tree.fit(X_tr, y_tr)

pre = tree.predict(X_val)

### 모델 평가

In [102]:
from sklearn.metrics import mean_squared_error

score = mean_squared_error(y_val,pre,squared=False)
score

4915.542971860442

In [103]:
# y값 로그변환해서 예측 

y_tr_l = np.log1p(y_tr)

tree2 = RandomForestRegressor()

tree2.fit(X_tr,y_tr_l)
pre = tree2.predict(X_val)

pre2 = np.expm1(pre)

In [104]:
score = mean_squared_error(pre2,y_val,squared=False)
score

4818.457767926338

### 답 예측하기

In [106]:
y = tree2.predict(X_test2)

In [107]:
y2 = np.expm1(y)

In [109]:
pd.DataFrame({'id':X_test.id,'charges':y2}).to_csv('./data/answer/influ.csv',index=False)

In [111]:
score = mean_squared_error(y2,y_test['charges'],squared=False)
score

4746.068199745486

## 바이크쉐어링

In [196]:
train = pd.read_csv('./data/bike_train.csv')
test = pd.read_csv('./data/bike_test.csv')

In [197]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [198]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    6493 non-null   object 
 1   season      6493 non-null   int64  
 2   holiday     6493 non-null   int64  
 3   workingday  6493 non-null   int64  
 4   weather     6493 non-null   int64  
 5   temp        6493 non-null   float64
 6   atemp       6493 non-null   float64
 7   humidity    6493 non-null   int64  
 8   windspeed   6493 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 456.7+ KB


### 컬럼 삭제하기

In [199]:
train.drop('datetime',axis=1,inplace=True)

In [200]:
test.drop('datetime',axis=1, inplace=True)

### 데이터 나누기

In [201]:
X_train=train.drop(['casual','registered','count'],axis=1)
y_train = train[['casual','registered','count']]

In [202]:
from sklearn.model_selection import train_test_split
X_tr,X_val,y_tr,y_val = train_test_split(X_train,y_train,test_size=0.2)

### 스케일링

In [203]:
X_tr.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
5045,4,0,0,1,10.66,15.15,87,0.0
9368,3,0,1,1,29.52,32.575,42,12.998
1501,2,0,1,2,14.76,16.665,71,19.0012
3952,3,0,1,3,19.68,23.485,82,31.0009
1382,2,0,0,1,17.22,21.21,41,16.9979


In [204]:
col_cat = ['season','holiday','workingday','weather']
col_num = ['temp','atemp','humidity','windspeed']

In [205]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

scaler.fit(X_tr[col_num])
X_tr[col_num] = scaler.transform(X_tr[col_num])
X_val[col_num] = scaler.transform(X_val[col_num])
test[col_num] = scaler.transform(test[col_num])

### 모델생성

In [208]:
from sklearn.ensemble import RandomForestRegressor
tree = RandomForestRegressor()

tree.fit(X_tr,y_tr)

pre = tree.predict(X_val)

In [221]:
pre.shape

(2178, 3)

In [222]:
pre[:,-1]

array([ 93.435     , 378.945     , 104.28083333, ..., 286.06166667,
       419.7065    , 279.16233333])

### 검증하기

In [225]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(pre,y_val)

76.37276774099952

In [227]:
mean_absolute_error(pre[:,-1],y_val['count'])

112.27239609241641

### 답

In [229]:
y = tree.predict(test)

In [234]:
pd.DataFrame(y,columns=['casual','registered','count']).to_csv('./data/answer/bike.csv',index=False)

정리
```
분류 : Classifier
회귀 : Regressor

포레스트 : n_estimator / max_depth 조절하기

인코딩은 한 컬럼씩 for문을 돌려서 실시하기

스케일링은 한번에 실시(랜덤포레스트는 영향 많이 안받음) -> 하나를 넣어도 2차원으로 진행해줘야함

df.describe(include='object')를 이용하면 유니크한 value값 개수 확인가능(인코딩에 사용)

y데이터는 1차원으로 넣어주기 
```

평가지표 
```
from sklearn.metrics 

분류 
정확도 : import acurracy_score
roc_auc : import roc_auc_score
f1 score : import f1_scoer 
           (f1_score(y_true, y_pred, average=[‘micro’, ‘macro’, ‘samples’,’weighted’ 중 하나 선택])

회귀 
mae : import mean_absolute_error
mse : import mean_squared_error
rmse : mean_squared_error(squared=False)
mape : import mean_absolute_persentage_error
r2 : import r2_score
```