<a href="https://colab.research.google.com/github/lookinsight/ml/blob/main/20221113_ML%EC%A0%95%EB%A6%AC_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 머신러닝 정리

## 1. 선형회귀(Linear Regression) 

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.metrics import mean_squared_error

# Data collection: read the insurance CSV straight from the course repository.

file_name = "insurance.csv"
url = f'https://raw.githubusercontent.com/bigdata-young/bigdata_16th/main/data/{file_name}'
df = pd.read_csv(url)

# Quick look at the raw table (only rendered when run interactively).
df.head()
df.info()
df.describe()
# From here on floats are displayed with two decimals.
pd.options.display.float_format = '{:.2f}'.format

# Preprocessing (categorical columns)

df['smoker'].unique()
df['smoker'] == 'yes'
# Encode the smoker flag as 1 for 'yes' and 0 for anything else.
df.smoker = (df['smoker'] == 'yes') * 1
df['sex'].unique()
df['region'].unique()
df['region'].nunique()
df.dtypes

# One-hot encode the remaining categoricals; the first level of each column is
# dropped so the dummies are not perfectly collinear.
df_dummy = pd.get_dummies(df, columns=['sex', 'region'], drop_first=True)

# Preprocessing (feature matrix / target and train-test split)

df_dummy.columns
feature_cols = [
    'age', 'bmi', 'children', 'smoker', 'sex_male',
    'region_northwest', 'region_southeast', 'region_southwest',
]
X = df_dummy[feature_cols]
y = df_dummy['expenses']
# 80/20 split with a fixed seed so the results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

# Model training
# The estimator has to be created before it can be fitted; without this line
# `model` is undefined in this section and `model.fit` raises NameError.
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction on the held-out split
pred = model.predict(X_test)

# Model evaluation: put actual and predicted values side by side.
comparison = pd.DataFrame({'actual': y_test, 'pred': pred})

# Visual check: points close to the diagonal are good predictions.
plt.figure(figsize = (5, 5))
sns.scatterplot(x = 'actual', y = 'pred', data = comparison)

# RMSE (root of the mean squared error). The root is taken explicitly because
# the `squared=False` keyword of `mean_squared_error` is deprecated and has
# been removed from recent scikit-learn releases.
mean_squared_error(y_test, pred) ** 0.5
# R^2 measured on the training split.
model.score(X_train, y_train)
model.coef_
# Coefficients labelled with their feature names.
pd.Series(model.coef_, index = X.columns)
model.intercept_

## 2. 로지스틱 회귀(Logistic Regression) 

In [None]:
import pandas as pd 
pd.options.display.float_format = '{:.2f}'.format

# Load the training data; the first CSV column becomes the index.
file_url = "https://raw.githubusercontent.com/bigdata-young/bigdata_16th/main/data/titanic_train.csv"
df_train = pd.read_csv(file_url, index_col = 0)

df_train.head()
df_train.info()
df_train.describe(include=["O"])
# Inspect a categorical column
df_train.Embarked.unique()
# Correlations. The frame still holds string columns at this point, so the
# computation is restricted to numeric columns explicitly: pandas >= 2.0 raises
# on non-numeric data instead of silently dropping it.
corr_matrix = df_train.corr(numeric_only=True)

import matplotlib.pyplot as plt
import seaborn as sns
sns.heatmap(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1, annot=True)

# Missing-value handling
df_train.info()
df_train.Embarked.value_counts()
df_train['Embarked'].unique()
df_train.Embarked = df_train['Embarked'].fillna('S')
df_train.Cabin.unique()
df_train.drop(columns = ['Cabin'], inplace = True)
df_train.drop(columns = ['Ticket'], inplace = True)
df_train['Name'].dtype
df_train.Name
# Title = the first run of ASCII letters that is directly followed by a literal
# dot ("Mr.", "Miss.", ...). A raw string keeps `\.` from being treated as an
# (invalid) string escape; expand=False returns a Series instead of a
# one-column DataFrame.
df_train['Title'] = df_train.Name.str.extract(r'([A-Za-z]+)\.', expand=False)

df_train.Title.value_counts()
title_unique = df_train.Title.unique()

# Count every title once (instead of rescanning the whole column per title)
# and keep the counts in first-appearance order so the printout is unchanged.
title_counts = df_train.Title.value_counts(dropna=False).reindex(title_unique)

rarelist = []
for t, count in title_counts.items():
    print(count)
    if count < 10:
        rarelist.append(t)


# Titles seen fewer than 10 times are merged into a single "Rare" bucket.
df_train.Title = df_train['Title'].replace(rarelist, "Rare")
title_age_mean = df_train.groupby(['Title'])['Age'].mean()

# Fill each missing age with the mean age of the passenger's title group.
df_train['Age'] = df_train['Age'].fillna(df_train['Title'].map(title_age_mean))

# Name and Title were only needed to impute Age.
df_train.drop(columns = ['Name', 'Title'], inplace = True)

df_train2 = pd.get_dummies(df_train, columns = ['Sex', 'Embarked'], drop_first = True)
df_train2.info()

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Target is the `Survived` column; every other column is a feature.
y = df_train2['Survived']
X = df_train2.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

# NOTE(review): default solver settings on unscaled features - check whether a
# ConvergenceWarning is emitted when this runs.
model = LogisticRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Model evaluation
from sklearn.metrics import accuracy_score

accuracy_score(y_test, pred)
model.coef_

# Show coefficients with four decimals, labelled by feature name.
pd.options.display.float_format = '{:.4f}'.format
pd.Series(model.coef_[0], index=X.columns)


def pre_processing(df : pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, one-hot encoded copy of a passenger table.

    Steps, in order:

    1. Missing ``Embarked`` becomes ``"S"`` and missing ``Fare`` becomes ``0``.
    2. A title is taken from ``Name``: the first run of ASCII letters directly
       followed by a dot. Titles occurring fewer than 10 times in *df* (and
       names with no title at all) are pooled as ``"Rare"``.
    3. Missing ``Age`` is filled with the mean age of the row's title group.
    4. ``Name``, ``Ticket`` and ``Cabin`` are dropped and ``Sex`` / ``Embarked``
       are one-hot encoded with the first level removed.

    The caller's frame is left untouched; all work happens on a copy.

    Args:
        df: Frame with at least the columns ``Name``, ``Sex``, ``Age``,
            ``Fare``, ``Embarked``, ``Ticket`` and ``Cabin``.

    Returns:
        A new DataFrame with the remaining columns followed by the dummy
        columns for ``Sex`` and ``Embarked``.
    """
    df = df.copy()  # never mutate the frame handed in by the caller
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Fare'] = df['Fare'].fillna(0)

    # Raw string so that `\.` is a regex literal dot, not a string escape.
    titles = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Per-row frequency of the row's own title, computed in a single pass.
    title_freq = titles.map(titles.value_counts())
    titles = titles.where(title_freq >= 10, 'Rare')

    # Mean age per title, broadcast back to the rows, fills the missing ages.
    df['Age'] = df['Age'].fillna(df['Age'].groupby(titles).transform('mean'))

    df_clean = df.drop(columns=['Name', 'Ticket', 'Cabin'])
    return pd.get_dummies(df_clean,
                          columns = ['Sex', 'Embarked'], drop_first=True)
    
# Score the separate test file with the model fitted above.
file_url = "https://raw.githubusercontent.com/bigdata-young/bigdata_16th/main/data/titanic_test.csv"
submission = pd.read_csv(file_url, index_col=0)

# Same cleaning pipeline as the training data.
# NOTE(review): assumes pre_processing yields the same columns, in the same
# order, as the training matrix - confirm before trusting the predictions.
df_sub = pre_processing(submission)
pred_sub = model.predict(df_sub)

# One row per passenger: the frame index next to the predicted label.
result = pd.DataFrame({'PassengerId': df_sub.index, 'Survived': pred_sub})
# Writing the result to 'submission.csv' is intentionally left disabled.

## 3. K-최근접이웃(K-Nearest Neighbors)

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
file_url = 'https://raw.githubusercontent.com/bigdata-young/bigdata_16th/main/data/wine.csv'
df = pd.read_csv(file_url)

df.info()
df.describe()

# Display floats with four decimals and a thousands separator.
pd.options.display.float_format = '{:,.4f}'.format
df.describe()

# Class distribution, first as a plain bar chart ...
value_counts = df['Class'].value_counts()
sns.barplot(x=value_counts.index, y=value_counts)

# ... then drawn again with the height written above every bar.
bar = sns.barplot(x=value_counts.index, y=value_counts)
for patch in bar.patches:
    height = patch.get_height()
    x_center = patch.get_x() + patch.get_width() / 2.
    bar.text(x_center, height + 3, height, ha='center', size=9)
bar.set_ylim(-5, 100)
plt.show()

# Scaling
# Three scalers are imported (standardization, min-max, robust); only the
# min-max scaler is used below.
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Train / test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Class', axis = 1),    # axis = 0 drops rows, axis = 1 drops columns
    df.Class,
    test_size = 0.2, random_state = 100)

# Min-max scaler, two-step form: `fit` stores what it learned from the training
# data on the scaler object, `transform` then applies it.
mm_scaler = MinMaxScaler()
mm_scaler.fit(X_train)
X_train_scaled = mm_scaler.transform(X_train)

# One-step form: `fit_transform` does both at once. Because the fitted state
# lives on the scaler, the test split only needs `transform` - it must never be
# used for fitting.
mm_scaler = MinMaxScaler()
X_train_scaled = mm_scaler.fit_transform(X_train)
X_test_scaled = mm_scaler.transform(X_test)
X_test_scaled

# Modelling: k-NN with default settings on the scaled features.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
pred = knn.predict(X_test_scaled)

# Accuracy on the held-out split
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

# Baseline for comparison: the same classifier on the unscaled features.
knn2 = KNeighborsClassifier()
knn2.fit(X_train, y_train)
pred2 = knn2.predict(X_test)
accuracy_score(y_test, pred2)

# 코드 진행 관련 함수화 하기 
# n은 정수형 int 를 패러미터로 받고, 최종 return 으로 실수형 float 를 반환해준다.
def tuning(n: int) -> float:
    """Return the test accuracy of a k-NN classifier that uses *n* neighbours.

    The classifier is fitted on the module-level ``X_train_scaled`` /
    ``y_train`` and scored on ``X_test_scaled`` / ``y_test``.
    """
    classifier = KNeighborsClassifier(n_neighbors=n)
    classifier.fit(X_train_scaled, y_train)
    predictions = classifier.predict(X_test_scaled)
    return accuracy_score(y_test, predictions)

# Evaluate k = 1 .. 20. Each k is fitted exactly once; the scores are kept so
# that printing and ranking reuse them instead of refitting every model.
k_list = [(t, tuning(t)) for t in range(1, 21)]

for t, score in k_list:
    print(t, score)

# Best-scoring k first.
sorted(k_list, key = lambda x: x[1], reverse = True)

import matplotlib.pyplot as plt

# Toy data set written as (x, y, class) triples and unpacked into the three
# parallel lists the plot call expects.
points = [
    (4, 21, 0), (5, 19, 0), (10, 24, 1), (4, 17, 0), (3, 16, 0),
    (11, 25, 1), (14, 24, 1), (8, 22, 0), (10, 21, 1), (12, 21, 1),
]
x, y, classes = (list(column) for column in zip(*points))

# Colour every point by its class label.
plt.scatter(x, y, c=classes)

## 4. 나이브 베이즈(Naive Bayes) 

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

# Load the data; the first CSV column becomes the index.
file_url = 'https://raw.githubusercontent.com/bigdata-young/bigdata_16th/main/data/spam.csv'
df = pd.read_csv(file_url, index_col=0)
df.head()
df.info()
df['target'].value_counts()
df['text'].nunique()
df['text'].unique()

# Punctuation removal
import string
string.punctuation

# Use the row labelled 0 as a worked example.
first_text = df['text'].loc[0]
first_text

# Print the characters that are not punctuation, one per line.
for ch in first_text:
    if ch in string.punctuation:
        continue
    print(ch)

# `in` / `not in` on strings test for substring membership.
'r' in 'warrior', 'k' in 'warrior', 'k' not in 'warrior'

# Same filter, collected into a list and glued back into one string.
new_text = [ch for ch in first_text if ch not in string.punctuation]

"".join(new_text)

def remove_punc(text: str) -> str:
    """Return *text* without any character found in ``string.punctuation``.

    All other characters, including whitespace and non-ASCII symbols, are
    kept in their original order.
    """
    return "".join(filter(lambda ch: ch not in string.punctuation, text))

# Strip punctuation from every message. `remove_punc` works on ONE string, so
# it is applied element-wise with `.apply`; passing the whole column to it
# would iterate over messages rather than characters and only produce one
# giant concatenated string.
df.text = df['text'].apply(remove_punc)

import nltk  # natural-language processing toolkit
nltk.download('stopwords')
# corpus = a collection of texts / word lists
from nltk.corpus import stopwords
stopwords.words('english')

# Worked example: the first message in the frame.
first_text = df['text'].iloc[0]

# Stop-word removal starts from whitespace-separated tokens.
first_text.split()

stop_eng = stopwords.words('english')

# Tokens that ARE stop words
for token in first_text.split():
    if token not in stop_eng:
        continue
    print(token)

# Tokens that are NOT stop words
for token in first_text.split():
    if token in stop_eng:
        continue
    print(token)

# Tokens that are NOT stop words, printed in lower case
for token in first_text.split():
    if token in stop_eng:
        continue
    print(token.lower())

def remove_stop_words(text: str, stop_words=None) -> str:
    """Return *text* lower-cased with all stop words removed.

    The text is split on whitespace, every token is lower-cased FIRST and only
    then compared with the stop-word list. Comparing before lower-casing would
    let capitalised stop words such as "The" slip through against a
    lower-case list.

    Args:
        text: The message to clean.
        stop_words: Optional iterable of lower-case stop words. When omitted,
            the module-level ``stop_eng`` list is used.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    if stop_words is None:
        stop_words = stop_eng
    # Hashed lookup instead of scanning a list for every token.
    lookup = frozenset(stop_words)
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in lookup)

# Before / after for the example message.
first_text, remove_stop_words(first_text)

# Clean every message.
df.text = df['text'].apply(remove_stop_words)

# Encode the label column as integers.
df['target'].unique()
label_codes = {'ham': 0, 'spam':1}
df.target = df['target'].map(label_codes)
df['target'].unique()

# Count-based vectorisation
df.text
x = df.text     # independent variable (one Series of cleaned text)
y = df.target   # dependent variable

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Split the RAW text first, so that the vocabulary is learned from the
# training messages only and nothing about the test messages leaks into it.
text_train, text_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = 100)

cv = CountVectorizer()
x_train = cv.fit_transform(text_train)   # learn vocabulary + count
x_test = cv.transform(text_test)         # count with the training vocabulary
cv.vocabulary_
len(cv.vocabulary_)

# Full count matrix in the training vocabulary, kept under the name `x` for
# any later code that expects the vectorised form.
x = cv.transform(x)

# Model training
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train, y_train)
pred = model.predict(x_test)

from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, pred)
print(confusion_matrix(y_test, pred))

# Heatmap of the confusion matrix: rows (vertical axis) are the true labels,
# columns (horizontal axis) are the predicted labels.
ax = sns.heatmap(confusion_matrix(y_test, pred),
                 annot=True, fmt='.0f', cmap='coolwarm')
ax.set_title("Confusion Matrix :")
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
plt.show()

# A more detailed confusion-matrix plot: every cell shows its name, its raw
# count and its share of all predictions.
cf_matrix = confusion_matrix(y_test,pred)
# Cell names in row-major order of the 2x2 matrix.
group_names = ['TN','FP','FN','TP']
print(group_names)
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
print(group_counts)
group_percentages = ["{0:.2%}".format(value) for value in
                     cf_matrix.flatten()/np.sum(cf_matrix)]
print(group_percentages)
# Three-line label per cell, reshaped to match the matrix.
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
# fmt='' because the annotations are ready-made strings, not numbers.
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='coolwarm')
plt.title("Confusion Matrix :")
plt.ylabel('True')
plt.xlabel('Predicted')
plt.show()

## 5. 결정트리(Decision Tree)

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Load the data; whitespace directly after each delimiter is skipped so the
# string values carry no leading blanks.

file_url = 'https://raw.githubusercontent.com/bigdata-young/bigdata_16th/main/data/salary.csv'
df = pd.read_csv(file_url, skipinitialspace=True)
df.head()
df.info()
# Summary of the string columns only, then of every column.
df.describe(include=['O'])
df.describe(include='all')
# Line plot of the capital-gain column against the row index.
capital_gain = df['capital-gain']
capital_gain.plot()

# Preprocessing
df['class']
df['class'].value_counts()
# Binary target: 0 for '<=50K', 1 for '>50K'.
income_codes = {'<=50K': 0, '>50K': 1}
df['class'] = df['class'].map(income_codes)
df['age'].dtype

# Column name next to its dtype.
for c, col_dtype in df.dtypes.items():
    print(c, col_dtype)

# Names of the object-typed (string) columns, built with a comprehension ...
obj_list = [c for c, col_dtype in df.dtypes.items() if col_dtype == 'object']
print(obj_list)

# ... and the same list once more with an explicit loop.
obj_list2 = []
for c in df.columns:
    if df[c].dtype == 'object':
        obj_list2.append(c)
print(obj_list2)

# String columns with more than 10 distinct values.
for o in obj_list:
    n_levels = df[o].nunique()
    if n_levels > 10:
        print(o,'-', n_levels)

df['education'].value_counts()
df['education-num']

# Show which education labels belong to each education-num code.
for n in range(1, 17):
    same_code = df['education-num'] == n
    print(f"**{n}**", df.loc[same_code, 'education'].unique())

# The text column is dropped; the numeric education-num column stays.
df.drop(columns='education', inplace=True)
df.info()

df['occupation'].value_counts()
df['native-country'].value_counts()

# Share of class 1 per country, highest first, and per occupation for France.
df.groupby(['native-country'])['class'].mean().sort_values(ascending = False)
df[df['native-country'] == 'France'].groupby(['occupation'])['class'].mean()

# Replace the country name by the mean of `class` for that country.
# The column is selected BEFORE aggregating: averaging the whole frame would
# also try to average the string columns, which raises on pandas >= 2.0.
# NOTE(review): the mean is computed on the full data set, before the
# train/test split further down, so it also uses the labels of test rows.
country_group = df.groupby('native-country')['class'].mean()
country_group.index
country_group = country_group.reset_index()

# After the merge the original target is `class_x` and the per-country mean is
# `class_y`; the text column is dropped and the mean takes over its name.
df = df.merge(country_group, on = 'native-country', how = 'left')
df.drop('native-country', axis=1, inplace=True)
df = df.rename(columns = {'class_x': 'class', 'class_y':'native-country'})

# Missing-value handling and dummy encoding
# Each fill is assigned back to the column. `df[col].fillna(..., inplace=True)`
# works on an intermediate object: pandas 2.x warns about it and, with
# Copy-on-Write enabled, the frame is not updated at all.
df.isna().mean()
# -99 marks rows whose country (and therefore country mean) is unknown.
df['native-country'] = df['native-country'].fillna(-99)
df['workclass'].value_counts() / len(df)
df['workclass'] = df['workclass'].fillna('Private')
df['occupation'].value_counts()
df['occupation'] = df['occupation'].fillna('Unknown')
df.info()
# One-hot encode the remaining string columns, dropping the first level of each.
df2 = pd.get_dummies(df, drop_first=True)
df2.info()

# Modelling and evaluation
# Train / test split: 60 % train, 40 % test, fixed seed.
from sklearn.model_selection import train_test_split
y = df2['class']
X = df2.drop('class', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=100, test_size=0.4
)

from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings apart from the fixed seed.
model = DecisionTreeClassifier(random_state=100)
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred

from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

# Hyper-parameter tuning
from sklearn.tree import DecisionTreeClassifier

# Fit the same tree three times: without a depth limit (None is the default),
# then limited to depth 5 and to depth 7, printing train and test accuracy for
# each. The original author's note: limiting the depth makes the tree learn
# less, which in turn improved its predictions on new data.
# After the loop `model`, `train_pred` and `test_pred` belong to the depth-7 tree.
for depth_limit in (None, 5, 7):
    model = DecisionTreeClassifier(max_depth=depth_limit, random_state=100)
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    print('Train score :', accuracy_score(y_train, train_pred))
    print('Test score :', accuracy_score(y_test, test_pred))

# 함수화

def test_depth(depth: int):
    """Fit a decision tree limited to *depth* levels and print its accuracy.

    The tree is trained on the module-level ``X_train`` / ``y_train``; the
    accuracy on the training split and on ``X_test`` / ``y_test`` is printed
    below a ``**depth**`` header line. Nothing is returned.
    """
    tree = DecisionTreeClassifier(max_depth=depth, random_state=100)
    tree.fit(X_train, y_train)
    train_score = accuracy_score(y_train, tree.predict(X_train))
    test_score = accuracy_score(y_test, tree.predict(X_test))
    print(f'**{depth}**')
    print('Train score :', train_score)
    print('Test score :', test_score)

# Print train / test accuracy for every depth from 1 to 20.
for d in range(1, 21):
    test_depth(d)

# Tree plots. `model` here is the module-level tree fitted earlier in this
# section; test_depth only uses its own local tree.
from sklearn.tree import plot_tree
plt.figure(figsize=(30,10))  # figure size
plot_tree(model)
plt.show()

# Only the top three levels, with a readable font size.
plt.figure(figsize=(30,10))  # figure size
plot_tree(model, max_depth=3, fontsize = 15)
plt.show()

# Same plot with feature names. The names are passed as a plain list because
# some scikit-learn releases reject a pandas Index for `feature_names`.
plt.figure(figsize=(30,10))  # figure size
plot_tree(model, max_depth=3, fontsize = 15, feature_names = list(X_train.columns))
plt.show()
plt.show()