In [5]:
import torch
import pandas as pd
import torch.nn as nn
from torch.nn import functional as F
from d2l import torch as d2l
from tqdm import tqdm
import numpy as np
from torch.utils import data
import wandb

NUM_SAVE = 50
net_list = "in->256->64"

class MLP(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.layer1 = nn.Linear(in_features,256)
        self.layer2 = nn.Linear(256,64)
        self.out = nn.Linear(64,1)

    def forward(self, X):
        X = F.relu(self.layer1(X))
        X = F.relu(self.layer2(X))
        return self.out(X)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
print("train_data and test_data shape",train_data.shape,test_data.shape)

# 去掉冗余数据
redundant_cols = ['Address', 'Summary', 'City', 'State']
for c in redundant_cols:
    del test_data[c], train_data[c]

# 数据预处理
large_vel_cols = ['Lot', 'Total interior livable area', 'Tax assessed value', 'Annual tax amount', 'Listed Price', 'Last Sold Price']
for c in large_vel_cols:
    train_data[c] = np.log(train_data[c]+1)
    if c!='Sold Price':
        test_data[c] = np.log(test_data[c]+1)

# 把train和test去除id后放一起，train也要去掉label
all_features = pd.concat((train_data.iloc[:,2:],test_data.iloc[:,1:]))

# 时间数据赋日期格式
all_features['Listed On'] = pd.to_datetime(all_features['Listed On'], format="%Y-%m-%d")
all_features['Last Sold On'] = pd.to_datetime(all_features['Last Sold On'], format="%Y-%m-%d")

for in_object in all_features.dtypes[all_features.dtypes=='object'].index:
    print(in_object.ljust(20),len(all_features[in_object].unique()))

# 查询数字列 ->缺失数据赋0 -> 归一化
numeric_features = all_features.dtypes[all_features.dtypes == 'float64'].index
all_features = all_features.fillna(method='bfill', axis=0).fillna(0)
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))

features = list(numeric_features)
features.extend(['Type','Bedrooms'])   # 加上类别数相对较少的Type, ,'Cooling features'
all_features = all_features[features]

print('before one hot code',all_features.shape)
all_features = pd.get_dummies(all_features,dummy_na=True)
all_features.shape
print('after one hot code',all_features.shape)

n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
print('train feature shape:', train_features.shape)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
print('test feature shape:', test_features.shape)
train_labels = torch.tensor(train_data['Sold Price'].values.reshape(-1, 1), dtype=torch.float)
print('train label shape:', train_labels.shape)

criterion = nn.MSELoss()
in_features = train_features.shape[1]
net = MLP(in_features).to(device)

def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Construct a PyTorch data iterator."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

def log_rmse(net, features, labels):
    # 为了在取对数时进一步稳定该值，将小于1的值设置为1
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(criterion(torch.log(clipped_preds),
                           torch.log(labels)))
    return rmse.item()

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    wandb.watch(net)
    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels), batch_size)
    # 这里使用的是Adam优化算法
    optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate, weight_decay = weight_decay)
    for epoch in tqdm(range(num_epochs)):
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = net(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
        record_loss = log_rmse(net.to('cpu'), train_features, train_labels)
        wandb.log({'loss': record_loss,'epoch': epoch})
        train_ls.append(record_loss)
        if (epoch%NUM_SAVE==0 and epoch!=0) or (epoch==num_epochs-1):
            torch.save(net.state_dict(),'checkpoint_'+str(epoch))
            print('save checkpoints on:', epoch, 'rmse loss value is:', record_loss)
        del X, y
        net.to(device)
    wandb.finish()
    return train_ls, test_ls

k, num_epochs, lr, weight_decay, batch_size = 5, 2000, 0.005, 0.05, 256
wandb.init(project="kaggle_predict",
           config={ "learning_rate": lr,
                    "weight_decay": weight_decay,
                    "batch_size": batch_size,
                    "total_run": num_epochs,
                    "network": net_list}
          )
print("network:",net)

train_ls, valid_ls = train(net, train_features,train_labels,None,None, num_epochs, lr, weight_decay, batch_size)

# 使用现有训练好的net
net.to('cpu')
# 将网络应用于测试集。
preds = net(test_features).detach().numpy()

# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

net.to('cpu')
preds = net(test_features).detach().numpy()
# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

# 读取已有 继续进行训练
k, num_epochs, lr, weight_decay, batch_size = 5, 500, 0.0005, 0.08, 256
wandb.init(project="kaggle_predict",
           config={ "learning_rate": lr,
                    "weight_decay": weight_decay,
                    "batch_size": batch_size,
                    "total_run": num_epochs,
                    "network": net_list}
          )
net.load_state_dict(torch.load('checkpoint_19676'))
print("network:",net)
net.to(device)
train_ls, valid_ls = train(net, train_features,train_labels,None,None, num_epochs, lr, weight_decay, batch_size)
net.to('cpu')
preds = net(test_features).detach().numpy()
# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

# 读取网络参数应用于测试集
net = []
net = MLP(test_features.shape[1])
net.load_state_dict(torch.load('checkpoint_250'))
net.to('cpu')
preds = net(test_features).detach().numpy()
# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

print(len(all_features['Type'].unique()))
print(len(all_features['Heating'].unique()))
print(len(all_features['Cooling'].unique()))
print(len(all_features['Parking'].unique()))
print(len(all_features['Bedrooms'].unique()))
print(len(all_features['Region'].unique()))
print(len(all_features['Elementary School'].unique()))
print(len(all_features['Middle School'].unique()))
print(len(all_features['High School'].unique()))
print(len(all_features['Flooring'].unique()))
print(len(all_features['Heating features'].unique()))
print(len(all_features['Cooling features'].unique()))
print(len(all_features['Appliances included'].unique()))
print(len(all_features['Laundry features'].unique()))
print(len(all_features['Parking features'].unique()))
print(len(all_features['City'].unique()))







train_data and test_data shape (47439, 41) (31626, 40)
Type                 174
Heating              2660
Cooling              911
Parking              9913
Bedrooms             278
Region               1259
Elementary School    3568
Middle School        809
High School          922
Flooring             1740
Heating features     1763
Cooling features     596
Appliances included  11290
Laundry features     3031
Parking features     9695
before one hot code (79065, 19)
after one hot code (79065, 470)
train feature shape: torch.Size([47439, 470])
test feature shape: torch.Size([31626, 470])
train label shape: torch.Size([47439, 1])


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

network: MLP(
  (layer1): Linear(in_features=470, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


  3%|▎         | 51/2000 [00:40<25:21,  1.28it/s]

save checkpoints on: 50 rmse loss value is: 0.3635750412940979


  5%|▌         | 101/2000 [01:20<26:12,  1.21it/s]

save checkpoints on: 100 rmse loss value is: 0.23198802769184113


  8%|▊         | 151/2000 [02:02<24:51,  1.24it/s]

save checkpoints on: 150 rmse loss value is: 0.23053652048110962


 10%|█         | 201/2000 [02:42<24:09,  1.24it/s]

save checkpoints on: 200 rmse loss value is: 0.2288118153810501


 13%|█▎        | 251/2000 [03:22<23:37,  1.23it/s]

save checkpoints on: 250 rmse loss value is: 0.2326572984457016


 15%|█▌        | 301/2000 [04:02<22:39,  1.25it/s]

save checkpoints on: 300 rmse loss value is: 0.23784029483795166


 18%|█▊        | 351/2000 [04:41<21:14,  1.29it/s]

save checkpoints on: 350 rmse loss value is: 0.2391752302646637


 20%|██        | 401/2000 [05:20<20:35,  1.29it/s]

save checkpoints on: 400 rmse loss value is: 0.24509213864803314


 23%|██▎       | 451/2000 [05:59<19:50,  1.30it/s]

save checkpoints on: 450 rmse loss value is: 0.24015581607818604


 25%|██▌       | 501/2000 [06:38<20:15,  1.23it/s]

save checkpoints on: 500 rmse loss value is: 0.2559889256954193


 28%|██▊       | 551/2000 [07:17<19:01,  1.27it/s]

save checkpoints on: 550 rmse loss value is: 0.2999950051307678


 30%|███       | 601/2000 [07:56<18:03,  1.29it/s]

save checkpoints on: 600 rmse loss value is: 0.3024503290653229


 33%|███▎      | 651/2000 [08:35<17:10,  1.31it/s]

save checkpoints on: 650 rmse loss value is: 0.3513205051422119


 35%|███▌      | 701/2000 [09:14<16:38,  1.30it/s]

save checkpoints on: 700 rmse loss value is: 0.36101698875427246


 38%|███▊      | 751/2000 [09:53<16:00,  1.30it/s]

save checkpoints on: 750 rmse loss value is: 0.3727125823497772


 40%|████      | 801/2000 [10:32<15:49,  1.26it/s]

save checkpoints on: 800 rmse loss value is: 0.42057356238365173


 43%|████▎     | 851/2000 [11:14<16:52,  1.13it/s]

save checkpoints on: 850 rmse loss value is: 0.39280402660369873


 45%|████▌     | 901/2000 [11:56<16:06,  1.14it/s]

save checkpoints on: 900 rmse loss value is: 0.4425584077835083


 48%|████▊     | 951/2000 [12:38<14:58,  1.17it/s]

save checkpoints on: 950 rmse loss value is: 0.4918777644634247


 50%|█████     | 1001/2000 [13:21<14:34,  1.14it/s]

save checkpoints on: 1000 rmse loss value is: 0.4947948753833771


 53%|█████▎    | 1051/2000 [14:03<13:22,  1.18it/s]

save checkpoints on: 1050 rmse loss value is: 0.5482037663459778


 55%|█████▌    | 1101/2000 [14:46<13:32,  1.11it/s]

save checkpoints on: 1100 rmse loss value is: 0.5456303358078003


 58%|█████▊    | 1151/2000 [15:28<11:43,  1.21it/s]

save checkpoints on: 1150 rmse loss value is: 0.5294300317764282


 60%|██████    | 1201/2000 [16:10<11:03,  1.20it/s]

save checkpoints on: 1200 rmse loss value is: 0.5961485505104065


 63%|██████▎   | 1251/2000 [16:52<10:45,  1.16it/s]

save checkpoints on: 1250 rmse loss value is: 0.576357364654541


 65%|██████▌   | 1301/2000 [17:32<08:55,  1.30it/s]

save checkpoints on: 1300 rmse loss value is: 0.5946934223175049


 68%|██████▊   | 1351/2000 [18:14<09:07,  1.19it/s]

save checkpoints on: 1350 rmse loss value is: 0.5888828635215759


 70%|███████   | 1401/2000 [18:55<08:00,  1.25it/s]

save checkpoints on: 1400 rmse loss value is: 0.5864478349685669


 73%|███████▎  | 1451/2000 [19:36<07:22,  1.24it/s]

save checkpoints on: 1450 rmse loss value is: 0.615331768989563


 75%|███████▌  | 1501/2000 [20:17<06:44,  1.23it/s]

save checkpoints on: 1500 rmse loss value is: 0.5999572277069092


 78%|███████▊  | 1551/2000 [20:58<06:04,  1.23it/s]

save checkpoints on: 1550 rmse loss value is: 0.6292240023612976


 80%|████████  | 1601/2000 [21:39<05:17,  1.26it/s]

save checkpoints on: 1600 rmse loss value is: 0.6633264422416687


 83%|████████▎ | 1651/2000 [22:19<04:40,  1.24it/s]

save checkpoints on: 1650 rmse loss value is: 0.6990796327590942


 85%|████████▌ | 1701/2000 [23:00<04:08,  1.20it/s]

save checkpoints on: 1700 rmse loss value is: 0.6621972322463989


 88%|████████▊ | 1751/2000 [23:41<03:21,  1.24it/s]

save checkpoints on: 1750 rmse loss value is: 0.7108193635940552


 90%|█████████ | 1801/2000 [24:21<02:39,  1.25it/s]

save checkpoints on: 1800 rmse loss value is: 0.695854663848877


 93%|█████████▎| 1851/2000 [25:02<02:00,  1.24it/s]

save checkpoints on: 1850 rmse loss value is: 0.7372110486030579


 95%|█████████▌| 1901/2000 [25:43<01:19,  1.25it/s]

save checkpoints on: 1900 rmse loss value is: 0.709581732749939


 98%|█████████▊| 1951/2000 [26:23<00:39,  1.25it/s]

save checkpoints on: 1950 rmse loss value is: 0.6904146671295166


100%|██████████| 2000/2000 [27:03<00:00,  1.23it/s]

save checkpoints on: 1999 rmse loss value is: 0.7546101808547974





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,▅▂▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▃▅▅▄▆▅▆▅▆▆▅▅▆▆▅▇▇▇█▇▇▇█

0,1
epoch,1999.0
loss,0.75461


FileNotFoundError: [Errno 2] No such file or directory: 'checkpoint_19676'