In [None]:
# Mount Google Drive in the Colab runtime at /content/drive; the CSV paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the training and test CSV files from the mounted Drive folder.
train=pd.read_csv('/content/drive/MyDrive/abalone/data/train.csv')
test=pd.read_csv('/content/drive/MyDrive/abalone/data/test.csv')

In [None]:
train.head() # 8 features and 1 target, not counting id

In [None]:
# (rows, columns) of the training frame
train.shape 

In [None]:
# Preview the first rows of the test frame
test.head()

In [None]:
# Check for missing values
def check_missing_col(dataframe):
    """Report which columns of ``dataframe`` contain missing values.

    For every column with at least one missing value, two messages are
    printed (column name, then the number of missing values). When no column
    has missing values, a single "no missing values" message is printed.

    Args:
        dataframe: pandas DataFrame to inspect.

    Returns:
        list: one ``[column_name, dtype]`` entry per column that has at least
        one missing value, in column order. Empty when nothing is missing.
    """
    missing_col = []
    for col in dataframe.columns:
        # Vectorised count; avoids iterating the Series element by element.
        missing_values = dataframe[col].isna().sum()
        if missing_values >= 1:
            print(f'결측치가 있는 컬럼은: {col}입니다')
            print(f'해당 컬럼에 총 {missing_values}개의 결측치가 존재합니다.')
            missing_col.append([col, dataframe[col].dtype])
    if not missing_col:
        print('결측치가 존재하지 않습니다')
    return missing_col

# Run the missing-value report on the training frame and keep the returned list.
missing_col = check_missing_col(train)

In [None]:
train.info() # no missing values

In [None]:
# Remove the unique-identifier column `id` from the training frame.
# `train` is overwritten here, so errors='ignore' keeps the cell safe to
# re-run once the column is already gone (a plain drop would raise KeyError).
train = train.drop(columns=['id'], errors='ignore')

In [None]:
# Display the training frame
train

In [None]:
# Distinct Target values, in order of first appearance
temp= train['Target'].unique()

In [None]:
# Show the distinct Target values
temp

In [None]:
# Same values in ascending order (returns a sorted copy; `temp` itself is unchanged)
np.sort(temp) 

In [None]:
# Largest Target value present in the training data
temp.max()

In [None]:
train.groupby('Target').count().iloc[:,0] # non-null count per Target value; iloc[:,0] takes the first column of the grouped counts


In [None]:
# Bar chart of the number of rows for each Target (age) value.
plt.rcParams['figure.figsize'] = [10,5]
# The column must be passed as the `x` keyword: from seaborn 0.12 on, `data` is
# the only positional parameter, so countplot('Target', data=train) raises TypeError.
sns.countplot(x='Target', data=train)
plt.title("Abalone age by count",fontsize=20)
plt.xlabel('target(age)',fontsize=15)
plt.ylabel('count',fontsize=15)
plt.show()

In [None]:
# Boolean masks for the two Target values of interest
temp1 = train['Target']==29
temp2 = train['Target']==23
train[temp1|temp2] # rows whose Target is 29 or 23 (element-wise OR of the two masks)

In [None]:
# Rows whose Target equals 3
train[train['Target']==3]

In [None]:
# Data type of every column
train.dtypes

In [None]:
# Distinct values of the Gender column
train["Gender"].unique()

In [None]:
# Non-null count per Gender value, read from the first column of the grouped counts
train.groupby('Gender').count().iloc[:,0]

In [None]:
# Bar chart of the number of rows for each Gender value.
plt.rcParams['figure.figsize'] = [8,5]
# The column must be passed as the `x` keyword: from seaborn 0.12 on, `data` is
# the only positional parameter, so countplot('Gender', data=train) raises TypeError.
sns.countplot(x='Gender', data=train)
plt.title('Abalone gender by count',fontsize=30)
plt.show()

In [None]:
# Density of Target for each Gender value. `x=` is required as a keyword because
# seaborn >= 0.12 treats the first positional argument as `data`.
# Author's observation: the age distributions shift upward in the order I, M, F.
sns.kdeplot(x="Target",hue='Gender',data=train);

In [None]:
# Summary statistics; the histogram cell below reads its 'mean' and '50%' rows from this frame.
data_description=train.describe()
data_description

In [None]:
# Histograms of the numeric columns, each annotated with its mean and median.
# Column 0 is skipped by position (assumed to be the categorical Gender column
# once `id` has been dropped — TODO confirm if the column order changes).
interest_coloumns = train.columns[1:]
plt.style.use('fivethirtyeight')
fig, ax =plt.subplots(2, 4, figsize = (25, 10))
fig.suptitle('Histogram of interesting features', fontsize=40)
# Pair each panel with a column instead of walking a manual counter over a fixed
# 2x4 index grid, which raised IndexError whenever fewer than eight columns remained.
for panel, column in zip(ax.ravel(), interest_coloumns):
    mean_value = data_description[column]['mean']
    median_value = data_description[column]['50%']
    panel.hist(train[column], bins=30, color='#eaa18a', edgecolor='#7bcabf')
    panel.set_title(column)
    panel.axvline(mean_value, c='#f55354', label = f"mean = {round(mean_value, 2)}")
    panel.axvline(median_value, c='#518d7d', label = f"median = {round(median_value, 2)}")
    panel.legend()

In [None]:
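# Observation: the abalone weight features are skewed, with most values concentrated on the left (low) side of the histograms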

In [None]:
# Relationship between each numeric feature and Target
def visualize(axx, yfield):
  """Draw a scatter plot with a fitted trend line of ``yfield`` against Target.

  Reads the module-level ``train`` frame.

  Args:
      axx: matplotlib Axes to draw on.
      yfield: name of the ``train`` column plotted on the y axis; also used
          as the panel title.
  """
  trend_line_style = {'color': '#f55354'}
  # regplot draws the scatter points and the regression (trend) line together.
  sns.regplot(
      data=train,
      x='Target',
      y=yfield,
      color='#eaa18a',
      line_kws=trend_line_style,
      ax=axx,
  )
  axx.set_title(yfield)

# One regression panel per feature between the first and last column of `train`.
figure, axes = plt.subplots(nrows=2, ncols=4)
figure.set_size_inches(20,12)
figure.suptitle('Correlation between target and features', fontsize=40)
feature_columns = train.columns[1:-1]
panels = axes.ravel()
# Iterate the axes array directly rather than resolving ax1..ax8 through eval().
for panel, feature in zip(panels, feature_columns):
  visualize(panel, feature)
# Hide any panel that did not receive a feature so no empty frame is drawn.
for unused_panel in panels[len(feature_columns):]:
  unused_panel.set_visible(False)

In [None]:
# Observation: every feature shows a positive correlation with age (Target)

In [None]:
# Input for the correlation heatmap: Pearson correlations of the numeric columns.
numeric_train = train.drop(columns=['Gender'])
scaler = MinMaxScaler()
# Note: Pearson correlation is unchanged by min-max scaling, so the scaling
# step affects `train_corr` but not the coefficients in `corr28`.
train_corr = pd.DataFrame(
    scaler.fit_transform(numeric_train),
    columns=numeric_train.columns,
    index=numeric_train.index,
)
corr28 = train_corr.corr(method='pearson')

In [None]:
# Annotated heatmap of the pairwise correlations in `corr28`.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12,10))
sns.heatmap(data = corr28, annot=True, fmt = '.2f', linewidths=.5, cmap='Blues', ax=heatmap_ax)
heatmap_ax.set_title('Correlation between features', fontsize=30)

In [None]:
# Correlation of every numeric column with Target, strongest first.
target_corr = corr28['Target'].sort_values(ascending=False)
df_temp28 = target_corr.to_frame(name='Target')
df_temp28.style.background_gradient(cmap='viridis')

In [None]:
# Rule of thumb used here: a correlation of 0.3 or higher is treated as meaningful

In [None]:
# Label encoding converts the string values of a categorical variable into numeric codes.
# This function builds the dictionary maps used for that encoding.
def make_label_map(dataframe):
    """Build a ``{column: {category: code}}`` map for every text column.

    Each text column gets a dict that starts with ``{'unknown': 0}``; the
    column's distinct non-missing values are then numbered 1, 2, ... in order
    of first appearance. Missing values are deliberately left out of the map
    so that they fall into the 'unknown' bucket when the map is applied.
    The finished map is also printed.

    Args:
        dataframe: pandas DataFrame whose object/string columns are mapped.
            Columns of any other dtype are ignored.

    Returns:
        dict: column name -> {category value -> integer code}.
    """
    label_maps = {}
    for col in dataframe.columns:
        column = dataframe[col]
        # Accept both the classic object dtype and pandas' dedicated string dtypes.
        is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
        if not is_text:
            continue
        label_map = {'unknown': 0}
        # Newly seen unique values receive codes starting at 1; NaN/None is
        # dropped so it does not get a code of its own.
        for code, key in enumerate(column.dropna().unique(), start=1):
            label_map[key] = code
        label_maps[col] = label_map
    print(label_maps)
    return label_maps

# Apply the integer codes from a label map to each categorical column.
def label_encoder(dataframe, label_map):
    """Return a copy of ``dataframe`` with its text columns replaced by codes.

    The input frame is not modified. Values missing from the map (including
    NaN/None) receive the map's 'unknown' code, and encoded columns are always
    returned with an integer dtype.

    Args:
        dataframe: pandas DataFrame to encode. Only object/string columns are
            touched.
        label_map: ``{column: {category: code}}`` dict; each per-column dict
            must contain an 'unknown' key.

    Returns:
        pandas.DataFrame: encoded copy of ``dataframe``.

    Raises:
        KeyError: if a text column has no entry in ``label_map``.
    """
    # Work on a copy so that slices such as train[['Gender']] are not mutated
    # (in-place assignment on a slice triggers SettingWithCopyWarning).
    encoded = dataframe.copy()
    for col in encoded.columns:
        column = encoded[col]
        is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
        if not is_text:
            continue
        codes = label_map[col]
        # Unmapped values become NaN after map(); fill them with the 'unknown'
        # code and cast back to int, since NaN forces a float column.
        encoded[col] = column.map(codes).fillna(codes['unknown']).astype(int)
    return encoded

In [None]:
# Build the Gender label map from the training data and encode the column with it.
gender_only = train[['Gender']]
train_le = make_label_map(gender_only)
gender_df = label_encoder(gender_only, train_le)
train['Gender'] = gender_df['Gender']

In [None]:
# Preview the training frame after Gender has been encoded
train.head()

In [None]:
# Separate the model inputs (every column except Target) from the label.
train_y = train['Target']
train_x = train.drop(columns=['Target'])

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fix the seed so that the fitted forest and the predictions built from it are
# reproducible across Restart & Run All.
model=RandomForestRegressor(random_state=42)

In [None]:
model.fit(train_x,train_y) # train the model

In [None]:
# Display the test frame before encoding
test

In [None]:
# Encode the test Gender column with the map built from the training data.
gender_df = label_encoder(test[['Gender']], train_le)
test['Gender'] = gender_df['Gender']
test.head()

In [None]:
# Model inputs for the test set: every column except the identifier.
test_x = test.drop(columns=['id'])

In [None]:
# Display the test inputs passed to the model
test_x

In [None]:
# Predict Target for every test row with the fitted model
prediction=model.predict(test_x)

In [None]:
# Show the predicted values
prediction

In [None]:
# Load the sample submission file from the mounted Drive folder
submission=pd.read_csv('/content/drive/MyDrive/abalone/data/sample_submission.csv')

In [None]:
# Preview the submission template
submission.head()

In [None]:
# Write the predictions into the Target column (assumes the template rows are in the same order as `test` — TODO confirm)
submission["Target"]=prediction

In [None]:
# Display the filled-in submission frame
submission

In [None]:
# Save the submission to submit.csv in the current working directory, without the index column
submission.to_csv('submit.csv', index=False)