In [None]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torchsummary import summary

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.set_option('mode.chained_assignment', None)

# Data preprocessing

In [None]:
# Read every raw CSV table in one pass (same files, same order), then preview
# the first rows of the report table.
birth, breed, report, spec, submission = map(
    pd.read_csv,
    (
        "data/birth.csv",
        "data/breed.csv",
        "data/report.csv",
        "data/spec.csv",
        "data/submission.csv",
    ),
)
report.head()

In [None]:
# Start train and test from independent copies of the raw report so that an
# in-place edit in a later cell can never reach `report` or the other frame.
train = report.copy()
test = report.copy()

# The two frames are still identical at this point, so a single summary
# describes both: dtypes / non-null counts, then missing values per column.
train.info()
train.isna().sum()

In [None]:
# Drop the training rows whose milk yield (column '11') is missing and
# renumber the remaining rows from 0.
train = train.dropna(subset=['11']).reset_index(drop=True)

In [None]:
# Keep only the rows that have no value in column '11' and renumber them
# from 0; these are the rows left without a target.
test = test.loc[test['11'].isna()].reset_index(drop=True)
test

In [None]:
# Fill the missing cow weights (column '6' of the birth table) with the mean
# weight; the mean is kept in `avg_weight` for reuse in a later cell.
avg_weight = birth['6'].mean()
birth['6'] = birth['6'].fillna(avg_weight)

In [None]:
# Columns '6' and '7' become strings, with the placeholder "NoData" where the
# value is missing, in both frames.
for frame in (train, test):
    for col in ('6', '7'):
        frame[col] = frame[col].fillna("NoData").astype("str")

# Training frame only: drop rows with a missing value in '6', '7' or '10',
# then drop every column that still contains a missing value.
train = train.dropna(subset=['6', '7', '10']).dropna(axis='columns')

In [None]:
# Merge `spec` into train/test: health = 1 when the row has a spec record in
# the same year and month, otherwise 0.
#
# A row is flagged when all of the following match one spec record:
#   column '5' == spec['1'], column '4' == spec['7'],
#   column '2' == year and column '3' == month of the timestamp in spec['4'].
# (presumably '5' is the animal id and '4' a farm/group code -- TODO confirm)
#
# The flag is written with a single `.loc[mask, column]` assignment.  The
# previous `frame['health'][j] = 1` form is chained indexing, which pandas
# does not guarantee to write through to the original frame.
train['health'] = 0
test['health'] = 0
for spec_id, spec_time, spec_group in zip(spec['1'], spec['4'], spec['7']):
    ym = datetime.strptime(spec_time, "%Y/%m/%d %H:%M")
    for frame in (train, test):
        matches = (
            (frame['5'] == spec_id)
            & (frame['2'] == ym.year)
            & (frame['3'] == ym.month)
            & (frame['4'] == spec_group)
        )
        frame.loc[matches, 'health'] = 1

In [None]:
# Attach a 'weight' column: the value of birth['6'] for the birth row whose
# column '1' equals the report row's column '5'; rows without a match (or with
# a missing weight) fall back to `avg_weight`.
#
# The lookup table keeps the LAST birth row per id, which is what the previous
# row-by-row loop produced when an id appeared more than once.  Using
# `Series.map` also avoids the chained assignment `frame['weight'][j] = ...`,
# which pandas does not guarantee to write through to the original frame.
weight_by_id = (
    birth.dropna(subset=['1'])
    .drop_duplicates(subset='1', keep='last')
    .set_index('1')['6']
)
train['weight'] = train['5'].map(weight_by_id).fillna(avg_weight)
test['weight'] = test['5'].map(weight_by_id).fillna(avg_weight)

In [None]:
# Add a 'season' column derived from the month stored in column '3'.
def _season_of(month):
    """Return the season name for a month number.

    The value is truncated with ``int()`` first.  3-5 -> 'Spring',
    6-8 -> 'Summer', 9-11 -> 'Autumn', anything else -> 'Winter'.
    """
    month = int(month)
    if 3 <= month <= 5:
        return 'Spring'
    if 6 <= month <= 8:
        return 'Summer'
    if 9 <= month <= 11:
        return 'Autumn'
    return 'Winter'


# One column-wise assignment per frame replaces the per-row chained assignment
# (`frame['season'][index] = ...`), which pandas does not guarantee to write
# through to the original frame.
train['season'] = train['3'].map(_season_of)
test['season'] = test['3'].map(_season_of)

In [None]:
# Show the training frame as it stands after the preprocessing cells.
train

In [None]:
# Show the test frame as it stands after the preprocessing cells.
test

# Create model

In [None]:
# Feature columns fed to the model, in this order, for both frames.
feature_columns = ['4', '5', '9', '10', '14', '18', 'health', 'season', 'weight']
new_train = pd.DataFrame(data=train, columns=feature_columns)
new_test = pd.DataFrame(data=test, columns=feature_columns)

## PyTorch

In [None]:
# (rows, columns) of the train and test feature frames.
new_train.shape, new_test.shape

In [None]:
# Turn the categorical columns (the ones that would be one-hot encoded) into
# integer codes.  Train and test are stacked first so each column is encoded
# with one shared set of codes, then split back at the original boundary.
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
n_train = len(new_train)
all_data = pd.concat([new_train, new_test])
for col in ('4', '5', 'season', 'health'):
    # The encoder is re-fitted for every column.
    all_data[col] = labelencoder.fit_transform(all_data[col])
new_train = all_data.iloc[:n_train]
new_test = all_data.iloc[n_train:]
all_data = pd.concat([new_train, new_test])

In [None]:
# Show the encoded training features.
new_train

In [None]:
# Feature matrix as a NumPy array, next to the target column '11' of train.
new_train.values, train['11']

In [None]:
# Convert the training features and the target column '11' to float32 tensors.
inputs = torch.tensor(new_train.values).float()

# The target becomes a column vector.  `-1` lets torch infer the row count
# from the data, so the cell keeps working when the number of rows changes.
targets = torch.tensor(train['11'].values).reshape(-1, 1).float()

In [None]:
# Inspect the tensors and their shapes.
inputs, targets, inputs.shape, targets.shape

In [None]:
# Pair each feature row with its target so they can be batched together.
train_ds = TensorDataset(inputs, targets)
# Peek at the first three (features, target) pairs.
train_ds[0:3]

In [None]:
# Mini-batch loader over the training set; the sample order is reshuffled on
# every pass.
batch_size = 64
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)

In [None]:
# Fully connected regressor: 9 input features -> four hidden layers of 256
# units (ReLU after each) -> Dropout(0.5) -> one output value.
hidden_width = 256

layers = [nn.Linear(9, hidden_width), nn.ReLU()]
for _ in range(3):
    layers += [nn.Linear(hidden_width, hidden_width), nn.ReLU()]
layers += [nn.Dropout(p=0.5), nn.Linear(hidden_width, 1)]

model = nn.Sequential(*layers)

In [None]:
# Layer-by-layer summary.  torchsummary expects the shape of ONE sample (the
# batch dimension is added internally), so the input size is the 9 features,
# not the whole dataset.  `device="cpu"` matches the model, which is never
# moved to a GPU in this notebook (the library default is "cuda").
summary(model, (9,), device="cpu")

In [None]:
# Generate predictions
# Forward pass of the still-untrained model over all training inputs.
preds = model(inputs)
preds

In [None]:
# Compare the predictions with the true targets.
preds, targets

In [None]:
# Mean-squared-error criterion; its square root (RMSE) on the current
# predictions is printed as a starting point.
loss_fn = nn.MSELoss()
loss = loss_fn(preds, targets).sqrt()
print(loss)

In [None]:
# Define optimizer
# Adam over all model parameters with a learning rate of 1e-5.
opt = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
# Utility function to train the model
def fit(num_epochs, model, loss_fn, opt, train_dl):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_dl``.

    Args:
        num_epochs: number of full passes over ``train_dl``.
        model: the ``nn.Module`` to optimise; it is switched to train mode.
        loss_fn: criterion called as ``loss_fn(pred, yb)``; the optimised
            quantity is its square root (RMSE when ``loss_fn`` is MSE).
        opt: optimizer already bound to ``model``'s parameters.
        train_dl: iterable yielding ``(xb, yb)`` batches.

    Every 10th epoch prints the mean of that epoch's per-batch losses
    (``nan`` if the loader produced no batches).  Returns ``None``.
    """
    # Layers such as Dropout behave differently in eval mode; make sure
    # training always runs with them active.
    model.train()

    # Repeat for given number of epochs
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        batch_count = 0

        # Train with batches of data
        for xb, yb in train_dl:
            # 1. Clear old gradients first, so anything accumulated before
            #    this call cannot leak into the first update.
            opt.zero_grad()

            # 2. Generate predictions
            pred = model(xb)

            # 3. Calculate loss
            loss = torch.sqrt(loss_fn(pred, yb))

            # 4. Compute gradients
            loss.backward()

            # 5. Update parameters using gradients
            opt.step()

            epoch_loss += loss.item()
            batch_count += 1

        # Print the progress
        if (epoch+1) % 10 == 0:
            mean_loss = epoch_loss / batch_count if batch_count else float('nan')
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, mean_loss))

In [None]:
# Train for 200 epochs; progress is printed by `fit`.
fit(200, model, loss_fn, opt, train_dl)

In [None]:
# Save the trained weights under a timestamped file name.  The target folder
# is created first so the save cannot fail just because it is missing.
time = datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
os.makedirs('weights', exist_ok=True)
torch.save(model.state_dict(), 'weights/' + time + '.pth')

In [None]:
# Generate predictions
# The submission needs predictions for the unlabeled rows, so the model is run
# on the encoded test features (`new_test`), not on the training inputs.
# eval() switches Dropout off so the output is deterministic, and no_grad()
# skips building the autograd graph.
# NOTE(review): any missing values left in `new_test` would yield NaN
# predictions -- confirm the test features are complete.
model.eval()
with torch.no_grad():
    preds = model(torch.tensor(new_test.values).float())
preds

In [None]:
# Write the predictions into column '1' of the submission template and save it
# under a timestamped file name.
preds = preds.detach().numpy()
preds = pd.DataFrame(preds)
time = datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
sub = pd.read_csv('data/submission.csv', sep=',')
# Assigning a one-column DataFrame matches rows by index label (0, 1, 2, ...).
sub['1'] = preds
# Create the output folder first so the export cannot fail on a missing path.
os.makedirs('output', exist_ok=True)
sub.to_csv('output/' + time + '.csv', index=False)