In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random


In [86]:
# Load the feature table from CSV (the path is relative to the working directory).
df = pd.read_csv("../../feature1.csv")
# Display the column index to see which fields are available.
df.columns

Index(['Dati', 'Time', 'DEPTH', 'ENGINE_1_FLOWTEMPA',
       'ENGINE_1_FUEL_CONSUMPTION', 'ENGINE_2_FLOWTEMPA',
       'ENGINE_2_FUEL_CONSUMPTION', 'HEADING', 'LATITUDE', 'LONGITUDE',
       'PITCH_1', 'PITCH_2', 'POWER_1', 'POWER_2', 'SOG', 'SOG_SPEEDLOG_LONG',
       'SOG_SPEEDLOG_TRANS', 'SPEED_1', 'SPEED_2', 'STW', 'THRUST_1',
       'THRUST_2', 'TORQUE_1', 'TORQUE_2', 'WIND_ANGLE', 'WIND_SPEED',
       'WIND_ANGLE_TRUE', 'WIND_SPEED_TRUE', 'trip_id', 'MODE', 'datetime',
       'season', 'weekday', 'current', 'direction', 'pressure', 'rain',
       'snowfall', 'weathercode', 'is_weekday', 'effective_wind_factor',
       'effective_wind', 'resist_ratio1', 'resist_ratio2', 'adversarial'],
      dtype='object')

In [87]:
# Count non-null values per trip and show the trip with the smallest "Dati" count.
# In the recorded output below, the shortest trip has 94 rows.
df.groupby("trip_id").count().sort_values("Dati").head(1)

Unnamed: 0_level_0,Dati,Time,DEPTH,ENGINE_1_FLOWTEMPA,ENGINE_1_FUEL_CONSUMPTION,ENGINE_2_FLOWTEMPA,ENGINE_2_FUEL_CONSUMPTION,HEADING,LATITUDE,LONGITUDE,...,pressure,rain,snowfall,weathercode,is_weekday,effective_wind_factor,effective_wind,resist_ratio1,resist_ratio2,adversarial
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1643,94,94,94,94,94,94,94,94,94,94,...,94,94,94,94,94,94,94,94,94,94


In [88]:
# Collapse each pair of `_1` / `_2` channels into a single column holding the
# arithmetic mean of the two readings. New columns are appended in the order
# listed here.
df = df.assign(**{
    merged: (df[first] + df[second]) / 2
    for merged, (first, second) in {
        "POWER": ("POWER_1", "POWER_2"),
        "SPEED": ("SPEED_1", "SPEED_2"),
        "THRUST": ("THRUST_1", "THRUST_2"),
        "TORQUE": ("TORQUE_1", "TORQUE_2"),
        "PITCH": ("PITCH_1", "PITCH_2"),
        "resist_ratio": ("resist_ratio1", "resist_ratio2"),
        "FLOWTEMPA": ("ENGINE_1_FLOWTEMPA", "ENGINE_2_FLOWTEMPA"),
        "FC": ("ENGINE_1_FUEL_CONSUMPTION", "ENGINE_2_FUEL_CONSUMPTION"),
    }.items()
})

# First pass: remove the raw paired channels that were just averaged, together
# with the other raw signals listed here.
df = df.drop(columns=[
    'PITCH_1', 'PITCH_2', 'POWER_1', 'POWER_2', 'SOG_SPEEDLOG_LONG',
    'SOG_SPEEDLOG_TRANS', 'THRUST_1',
    'THRUST_2', 'TORQUE_1', 'TORQUE_2', 'WIND_ANGLE', 'WIND_SPEED',
    'WIND_ANGLE_TRUE', 'WIND_SPEED_TRUE', 'datetime',
    'resist_ratio1', 'resist_ratio2', 'SPEED_1', 'SPEED_2', "STW",
    'ENGINE_1_FLOWTEMPA', 'ENGINE_2_FLOWTEMPA',
    "ENGINE_1_FUEL_CONSUMPTION", "ENGINE_2_FUEL_CONSUMPTION",
])
# Second pass: remove a further set of columns, including several of the
# averages computed above.
df = df.drop(columns=[
    "FLOWTEMPA", "PITCH", "POWER", "effective_wind_factor", "THRUST",
    "pressure", "weekday", "Dati", "Time", "DEPTH",
])
# Encode direction as a binary flag: 1 where the value is "H-N", 0 otherwise.
df["direction"] = (df["direction"] == "H-N").astype("int64")

In [89]:
# Keep only the rows whose `adversarial` value equals 0, then discard that column.
df = df.loc[df["adversarial"].eq(0)].drop(columns="adversarial")

In [134]:
def one_hot(df, cols):
    """Replace each column named in `cols` with one-hot indicator columns.

    For every name in `cols`, indicator columns are built with
    `pd.get_dummies(..., prefix=name, drop_first=True)` (so the first category
    level gets no column of its own), appended on the right-hand side of the
    frame, and the source column is removed.

    Args:
        df: DataFrame containing every column listed in `cols`.
        cols: iterable of column names to encode, processed in order.

    Returns:
        The resulting DataFrame. The frame passed in is not modified.
    """
    encoded = df
    for name in cols:
        indicators = pd.get_dummies(encoded[name], prefix=name, drop_first=True)
        remaining = encoded.drop(name, axis=1)
        encoded = pd.concat([remaining, indicators], axis=1)
    return encoded

# One-hot encode the "season" and "weathercode" columns.
# Not idempotent: one_hot removes the source columns, so running this cell a
# second time on the same frame raises KeyError.
df = one_hot(df, ["season", "weathercode"])

In [135]:
# Display the column index of df at this point in the notebook.
df.columns

Index(['HEADING', 'LATITUDE', 'LONGITUDE', 'SOG', 'trip_id', 'MODE', 'current',
       'direction', 'rain', 'snowfall', 'is_weekday', 'effective_wind',
       'SPEED', 'TORQUE', 'resist_ratio', 'FC', 'season_1', 'season_2',
       'season_3', 'weathercode_1', 'weathercode_2', 'weathercode_3',
       'weathercode_51', 'weathercode_53', 'weathercode_55', 'weathercode_61',
       'weathercode_63', 'weathercode_71', 'weathercode_73', 'weathercode_75'],
      dtype='object')

In [93]:
# Standardise the continuous columns with StandardScaler (zero mean, unit variance).
# Left untouched:
#   * the one-hot indicator columns (names containing "season" / "weathercode"),
#   * the columns MODE, is_weekday and direction,
#   * trip_id, which is a grouping key rather than a measurement; scaling it
#     would turn the identifiers into arbitrary floats.
# The target column FC is scaled as well, so losses computed on it are in
# standardised units.
# NOTE(review): the scaler is fitted on every row before any train/test split
# is made, so held-out trips influence the scaling statistics. Consider fitting
# on the training trips only.
scaler = StandardScaler()
transform_cols = [
    col for col in df.columns
    if "season" not in col
    and "weathercode" not in col
    and col not in ("MODE", "is_weekday", "direction", "trip_id")
]
df[transform_cols] = scaler.fit_transform(df[transform_cols])

In [94]:
# Display the column index of df again.
df.columns

Index(['HEADING', 'LATITUDE', 'LONGITUDE', 'SOG', 'trip_id', 'MODE', 'current',
       'direction', 'rain', 'snowfall', 'is_weekday', 'effective_wind',
       'SPEED', 'TORQUE', 'resist_ratio', 'FC', 'season_1', 'season_2',
       'season_3', 'weathercode_1', 'weathercode_2', 'weathercode_3',
       'weathercode_51', 'weathercode_53', 'weathercode_55', 'weathercode_61',
       'weathercode_63', 'weathercode_71', 'weathercode_73', 'weathercode_75'],
      dtype='object')

In [127]:
class Cell_data(Dataset):
    """Dataset that yields one episode (a window of consecutive rows) per trip.

    The unique ``trip_id`` values of ``df`` are split into a training subset
    and its complement. Indexing the dataset returns ``(X, y)`` for one trip:
    two float32 tensors holding the ``x_cols`` and ``y_cols`` values of at most
    ``epi_size`` consecutive rows of that trip.

    Args:
        df (pandas.DataFrame): table with a ``trip_id`` column plus every
            column named in ``x_cols`` and ``y_cols``.
        x_cols (list[str]): names of the input feature columns.
        y_cols (list[str]): names of the target columns.
        epi_size (int): number of rows per episode. Trips with more rows are
            cropped to ``epi_size`` consecutive rows starting at a random
            offset; trips with fewer rows are returned whole, so batching
            with a DataLoader presumably requires every trip to have at least
            ``epi_size`` rows.
        train (bool): True selects the training trips, False the remaining ones.
        train_test_split (float): fraction of trips assigned to training
            (the trip count is rounded up).
        rand_seed (int): seed for the trip split. This reseeds the global
            ``random`` module, which also drives the random crop offsets.
            Building the train and test datasets with the same seed makes the
            two subsets complementary.
    """

    def __init__(self, df, x_cols, y_cols, epi_size=94, train = True, train_test_split = 0.8, rand_seed=1):
        super(Cell_data, self).__init__()
        # Keep the frame this dataset was built from, so that __getitem__ does
        # not depend on a module-level variable that happens to be named `df`.
        self.df = df
        self.train = train
        self.epi_size = epi_size
        self.x_cols = x_cols
        self.y_cols = y_cols

        # Trip-level train/test split: sample the training trips, the rest is test.
        trips = list(df.trip_id.unique())
        random.seed(rand_seed)
        train_size = int(np.ceil(len(trips)*train_test_split))
        train_trips = random.sample(trips, k=train_size)
        if train:
            self.trips_id = train_trips
        else:
            # Set membership avoids rescanning the training list for every trip.
            train_lookup = set(train_trips)
            self.trips_id = [x for x in trips if x not in train_lookup]

    def df_to_tensor(self, df):
        """Convert a DataFrame to a float32 tensor on cuda:0 if available, else on the CPU."""
        if torch.cuda.is_available():
            device = torch.device('cuda:0')
        else:
            device = torch.device('cpu')
        # astype("float") first so that non-float columns (e.g. indicator
        # columns) are converted to a numeric array before torch sees them.
        return torch.from_numpy(df.values.astype("float")).float().to(device)

    def __getitem__(self, idx):
        """Return the ``(X, y)`` tensors of one episode of the trip at position ``idx``."""
        # All rows of the selected trip, re-indexed from 0.
        trip_rows = self.df[self.df.trip_id == self.trips_id[idx]].reset_index(drop=True)

        # Crop long trips to a random window of epi_size consecutive rows.
        n_rows = len(trip_rows)
        if n_rows > self.epi_size:
            starting_idx = random.randint(0, n_rows - self.epi_size)
            end_idx = starting_idx + self.epi_size
            trip_rows = trip_rows.iloc[starting_idx:end_idx]

        X = trip_rows[self.x_cols]
        y = trip_rows[self.y_cols]
        return self.df_to_tensor(X), self.df_to_tensor(y)

    def __len__(self):
        """Number of trips in this subset (one episode per trip)."""
        return len(self.trips_id)

In [128]:
class Lstm(nn.Module):
    """LSTM followed by a linear read-out applied at every time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of values produced per time step.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size=50, output_size=1, num_layers=1):
        super(Lstm, self).__init__()

        # Recurrent part. batch_first=True means inputs and outputs are laid
        # out as (batch, seq_len, features).
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers=num_layers, batch_first=True,
        )
        # Linear layer mapping each time step's hidden state to the outputs.
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to (batch, seq_len, output_size)."""
        hidden_seq, _final_state = self.lstm(x)
        return self.fc(hidden_seq)

In [131]:
# ----------------- train -------------------
# Input features, in the order they are fed to the network: thirteen base
# columns followed by the one-hot season and weathercode indicator columns.
x_cols = (
    ['HEADING', 'LATITUDE', 'LONGITUDE', 'SOG', 'MODE', 'current',
     'direction', 'rain', 'snowfall', 'is_weekday', 'effective_wind',
     'TORQUE', 'resist_ratio']
    + ['season_%d' % level for level in (1, 2, 3)]
    + ['weathercode_%d' % code for code in (1, 2, 3, 51, 53, 55, 61, 63, 71, 73, 75)]
)
# Regression target.
y_cols = ['FC']
# Network input / output widths follow directly from the column lists.
input_size, output_size = len(x_cols), len(y_cols)
batch_size = 16
lr = 1e-6

In [None]:
# Run on the GPU when one is available. Model and batches are moved to this
# device explicitly, so they always end up on the same device wherever the
# dataset creates its tensors.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Lstm(input_size, 20, output_size=output_size, num_layers=1).to(device)  # 20 hidden units

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
trainset = Cell_data(df, x_cols, y_cols, epi_size=94, train = True, train_test_split = 0.8, rand_seed=1)
trainloader = DataLoader(trainset, batch_size = batch_size, shuffle=True, drop_last=True)

testset = Cell_data(df, x_cols, y_cols, epi_size=94, train = False, train_test_split = 0.8, rand_seed=1)
# Keep the final partial batch so every held-out trip counts in validation.
testloader = DataLoader(testset, batch_size = batch_size, drop_last=False)


n_epochs = 2000
for epoch in range(n_epochs):
    # ---- training pass ----
    model.train()
    epoch_loss = 0.0
    for X_batch, y_batch in trainloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    # MSELoss already averages within a batch, so the epoch figure is the mean
    # over batches (not over samples). max(..., 1) guards an empty loader.
    print('Epoch %d / %d --- Loss: %.4f' % (epoch + 1, n_epochs, epoch_loss / max(len(trainloader), 1)))

    # ---- validation pass (every 100th epoch) ----
    if epoch % 100 != 0:
        continue
    model.eval()
    squared_error_sum = 0.0
    n_values = 0
    with torch.no_grad():
        for X_valid, y_valid in testloader:
            X_valid, y_valid = X_valid.to(device), y_valid.to(device)
            y_pred = model(X_valid)
            # Undo the per-batch mean so that batches of different size are
            # weighted by the number of values they contain.
            squared_error_sum += criterion(y_pred, y_valid).item() * y_valid.numel()
            n_values += y_valid.numel()
    # Printed value is the RMSE over all held-out values (NaN if there are none).
    valid_rmse = (squared_error_sum / n_values) ** 0.5 if n_values else float('nan')
    print('Loss: %.4f' % valid_rmse)
