In [4]:
import pandas as pd
import geopandas as gpd
# Load the TensorBoard notebook extension
%load_ext tensorboard

import os
from dataloader import fishingDataLoader
import tensorflow as tf
from keras.callbacks import TensorBoard
import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [5]:
def emptyLog():
    """Clear the ``logs`` folder and build a TensorBoard callback for a new run.

    Every entry directly inside ``logs`` is removed. Sub-directories (for
    example ``logs/fit`` written by other cells) are deleted recursively,
    because ``os.remove`` only works on files. The folder is created first
    when it does not exist yet.

    Returns:
        The ``TensorBoard`` callback writing to ``logs/<YYYYmmdd-HHMMSS>``.
        Earlier versions returned ``None``; callers ignoring the return
        value are unaffected.
    """
    import shutil  # local import: only needed for recursive directory removal

    folder = "logs"
    os.makedirs(folder, exist_ok=True)
    for entry in os.listdir(folder):
        path = os.path.join(folder, entry)
        # Real directories need rmtree; files and symlinks are unlinked.
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)

    log_dir = f"{folder}/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,
        write_graph=True,
        write_images=False,
        update_freq="epoch",
    )
    return tensorboard_callback

In [6]:
#emptyLog()

In [7]:
def loadFishingData():
    """Fetch all training data via ``fishingDataLoader`` with a fresh row index.

    The frame returned by ``loadAllTrainingData()`` is re-indexed with
    ``reset_index(drop=True)``, so the previous index is discarded.
    """
    training_frame = fishingDataLoader().loadAllTrainingData()
    training_frame = training_frame.reset_index(drop=True)
    return training_frame

def loadIresData():
    """Read ``test/iris_original.csv`` (relative to the working directory) into a DataFrame."""
    return pd.read_csv("test/iris_original.csv")

def createHotVector(y):
    """One-hot encode integer class labels.

    Args:
        y: Either a single-column ``pd.DataFrame`` or any array-like of
            non-negative integer class ids (floats are truncated to int).

    Returns:
        A ``pd.DataFrame`` of dtype float32 with one row per label and
        ``max(label) + 1`` columns, or ``-1`` when ``y`` is a DataFrame that
        does not have exactly one column (sentinel kept from the original
        contract).
    """
    if isinstance(y, pd.DataFrame):
        # Only a single label column can be encoded; anything else is rejected.
        if len(y.columns) != 1:
            return -1
        y = y.to_numpy()

    class_ids = np.asarray(y, dtype="int64").reshape(-1)
    if class_ids.size == 0:
        return pd.DataFrame()

    n_classes = int(class_ids.max()) + 1
    # Implemented with numpy so the helper does not depend on the
    # `np_utils` module, which this file never imported.
    y_vector = np.zeros((class_ids.size, n_classes), dtype="float32")
    y_vector[np.arange(class_ids.size), class_ids] = 1.0
    return pd.DataFrame(y_vector)

def cleanrows(df: pd.DataFrame):
    """Return ``df`` without rows containing missing values and report how many were dropped.

    The input frame is not modified; the count of removed rows is printed.
    """
    rows_before = len(df)
    cleaned = df.dropna()
    removed = rows_before - len(cleaned)
    print(f"deletet {removed} rows")
    return cleaned

def generateClassWeightsFromHotVektor(lable: pd.DataFrame):
    """Compute a per-class weight from one-hot encoded labels.

    Each class that occurs in ``lable`` gets the weight
    ``n_samples / (n_classes * samples_of_class)``, where ``n_samples`` and
    ``n_classes`` are the row and column counts of ``lable``. Classes with no
    samples do not appear in the result.

    Returns:
        dict mapping class index (argmax position) to its weight.
    """
    n_samples, n_classes = lable.shape
    class_ids, _, occurrences = tf.unique_with_counts(tf.argmax(lable, axis=1))
    return {
        class_id: n_samples / (n_classes * n_in_class)
        for class_id, n_in_class in zip(class_ids.numpy(), occurrences.numpy())
    }
    

def normalizeColums(df: pd.DataFrame, name_of_cols):
    """Scale the given columns in place by their maximum absolute value.

    Args:
        df: Frame to modify. The selected columns are overwritten in ``df``.
        name_of_cols: A column label or a list of column labels.

    Returns:
        The same ``df`` object, with each selected column divided by its
        largest absolute value. Columns whose largest absolute value is 0 are
        left unchanged instead of becoming NaN through 0/0.
    """
    scale = df[name_of_cols].abs().max()
    if isinstance(scale, pd.Series):
        # Several columns: replace zero divisors column-wise.
        scale = scale.where(scale != 0, 1)
    elif scale == 0:
        # Single column selected by a plain label.
        scale = 1
    df[name_of_cols] = df[name_of_cols] / scale
    return df
    
def printFeatureDistribution(features):
    """Print how often each class occurs, as absolute count and percentage.

    Args:
        features: Class information in one of three layouts: a one-hot
            matrix (more than one column, reduced with argmax along axis 1),
            a single-column frame/array of class ids, or a 1-D sequence of
            class ids. DataFrames and array-likes are both accepted.

    The printed table has one row per distinct class, in order of first
    appearance, with the columns ``amount`` and ``percent %`` (rounded to two
    decimals).
    """
    if isinstance(features, pd.DataFrame):
        features = features.to_numpy()
    values = np.asarray(features)

    if values.ndim > 1 and values.shape[-1] != 1:
        # One-hot rows -> index of the active class.
        values = values.argmax(axis=1)
    # Single-column input arrives as (n, 1); counting needs a flat vector.
    values = values.reshape(-1)

    classes, first_seen, count = np.unique(values, return_index=True, return_counts=True)
    # np.unique sorts by value; restore order of first appearance.
    order = np.argsort(first_seen)
    classes, count = classes[order], count[order]

    percent = np.round((count / count.sum()) * 10000) / 100
    d = {
        "amount": count,
        "percent %": percent
    }
    dist = pd.DataFrame(d, index=classes)
    print(dist)
    
def prepareDataset(raw_data: pd.DataFrame, testSize = 0.2, randomState = None):
    """Clean, one-hot encode and split a dataset whose last column is the target.

    Steps:
        1. Rows containing NaN are dropped (via ``cleanrows``).
        2. The last column is one-hot encoded with ``pd.get_dummies``; the
           resulting dummy columns are placed after the feature columns.
        3. The frame is split with ``train_test_split``.

    Args:
        raw_data: Input frame; every column except the last is a feature.
        testSize: Share of rows used for validation (passed as ``test_size``).
        randomState: Optional seed passed as ``random_state`` to make the
            split reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns:
        ``(train_features, train_lable, val_features, val_lable)`` as DataFrames.
    """
    clean_data = cleanrows(raw_data)  # drop rows with NaN values

    n_features = len(clean_data.columns) - 1
    nameOfTargetCol = clean_data.columns[-1]
    encoded = pd.get_dummies(clean_data, columns=[nameOfTargetCol], dtype=float)

    train_data, val_data = train_test_split(
        encoded, test_size=testSize, random_state=randomState
    )

    # Slice by the number of feature columns counted from the left. A negative
    # slice based on the dummy-column count would turn into `:0` (no features)
    # if the encoding ever produced zero dummy columns.
    train_features, train_lable = train_data.iloc[:, :n_features], train_data.iloc[:, n_features:]
    val_features, val_lable = val_data.iloc[:, :n_features], val_data.iloc[:, n_features:]

    return train_features, train_lable, val_features, val_lable

def turnUnixTimeToDate(date: float):
    """Format a Unix timestamp (seconds since the epoch) as a UTC date string.

    Args:
        date: Seconds since 1970-01-01 UTC; ints and floats are accepted.

    Returns:
        The UTC time formatted as ``'%Y-%m-%d %H:%M:%S'``.
    """
    # fromtimestamp with an explicit UTC tz replaces the deprecated
    # datetime.utcfromtimestamp and yields the same wall-clock values.
    moment = datetime.datetime.fromtimestamp(date, tz=datetime.timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')


In [8]:
# Load all training data, clean/encode it and hold out 20 % for validation.
train_features, train_lables, val_features, val_lables = prepareDataset(
    loadFishingData(),
    testSize=0.2,
)

# Network dimensions are taken from the prepared frames.
INPUTS = len(train_features.columns)
TARGETS = train_lables.shape[1]

                 mmsi     timestamp  distance_from_shore  distance_from_port  \
0        1.252340e+12  1.325376e+09             0.000000             0.00000   
1        1.252340e+12  1.325378e+09             0.000000             0.00000   
2        1.252340e+12  1.325379e+09             0.000000             0.00000   
3        1.252340e+12  1.325380e+09             0.000000             0.00000   
4        1.252340e+12  1.325381e+09             0.000000             0.00000   
...               ...           ...                  ...                 ...   
1545318  4.393595e+13  1.480030e+09        132057.359375        507208.40625   
1545319  4.393595e+13  1.480030e+09        132057.359375        507208.40625   
1545320  4.393595e+13  1.480030e+09        132057.359375        507208.40625   
1545321  4.393595e+13  1.480030e+09        132057.359375        507208.40625   
1545322  4.393595e+13  1.480030e+09        132030.843750        506601.81250   

         speed      course        lat  

In [None]:
# Create the project's data loader and print its `file_list` attribute.
loader = fishingDataLoader()
print(loader.file_list)
# NOTE(review): presumably derives a reduced dataset limited to 60000 samples —
# confirm the exact behaviour (and where it writes) in dataloader.py.
loader.genSmalerDataset(sample=60000)

In [None]:
# Group the training features by the `mmsi` column and list every group key.
groups = train_features.groupby('mmsi')
print(type(groups))
# Iterating a groupby yields (key, sub-frame) pairs; only the key is printed.
for name,group in groups:
    print(name)

In [None]:
# Per-class weights computed from the one-hot training labels.
class_weights = generateClassWeightsFromHotVektor(train_lables)
print(class_weights)

# Show how many training samples fall into each class.
printFeatureDistribution(train_lables)

# Number of input features and of target classes.
print(INPUTS, TARGETS)

In [None]:
def genModel(output_bias=None):
    """Build and compile the small Sequential classifier.

    Architecture: Input(INPUTS) -> BatchNormalization -> Dense(10, relu)
    -> Dense(30, relu) -> Dense(TARGETS, softmax). ``INPUTS`` and ``TARGETS``
    are read from the notebook's globals.

    Args:
        output_bias: Optional constant used to initialise the bias of the
            output layer. When ``None`` the bias starts at zero.

    Returns:
        The compiled ``tf.keras.Sequential`` model (Adam, categorical
        cross-entropy, accuracy/recall/precision metrics).
    """
    if output_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        # Passing None would override Dense's default 'zeros' initializer,
        # so the default is spelled out explicitly.
        bias_initializer = "zeros"

    model = tf.keras.Sequential(
        name= "simpleModel",
        layers = [
        # The input shape has to be declared on the first element of the
        # stack; `input_dim` on a later layer does not define it.
        tf.keras.Input(shape=(INPUTS,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(10, activation=tf.nn.relu, name='dense1'),
        tf.keras.layers.Dense(30, activation=tf.nn.relu, name='dense2'),
        tf.keras.layers.Dense(TARGETS, activation=tf.nn.softmax, name='dense3', bias_initializer=bias_initializer)
    ])

    model.compile(
        optimizer=tf.keras.optimizers.legacy.Adam(),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[
            'accuracy',
            tf.keras.metrics.Recall(name="Recall"),
            tf.keras.metrics.Precision(name="Precision"),
        ]
    )
    return model

In [None]:

# Include the epoch in the file name (uses `str.format`)
checkpoint_path = "training/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights.
# NOTE(review): an integer `save_freq` counts batches, not epochs, so this
# saves every 5 batches. Use save_freq="epoch" (or 5 * batches_per_epoch) if
# "every 5 epochs" was intended.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq=5)

# TensorBoard logs go to a time-stamped folder below logs/fit.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Early stop: end training once val_accuracy fails to improve by at least
# 0.001 for one epoch (patience=1).
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", min_delta=0.001, patience=1)


In [None]:
class SimpleModel(tf.keras.Model):
    """Subclassed Keras classifier: BatchNorm -> 3 ReLU Dense layers -> softmax.

    The number of output units is taken from the class attribute ``TARGETS``.
    """

    # NOTE(review): INPUTS is not read anywhere in this class; the input width
    # is fixed by the shape passed to `build()` / the first call.
    INPUTS = 19
    # Number of softmax outputs; must match the number of label columns.
    TARGETS = 6

    def __init__(self, input_dim = 9 ,output_bias=None):
        """Create the layers.

        Args:
            input_dim: Currently unused; kept for interface compatibility.
            output_bias: Optional constant for the output layer's initial
                bias. When ``None`` the bias starts at zero.
        """
        super().__init__(name='SimpleModel')
        if output_bias is not None:
            bias_initializer = tf.keras.initializers.Constant(output_bias)
        else:
            # Passing None would override Dense's default 'zeros' initializer,
            # so the default is spelled out explicitly.
            bias_initializer = "zeros"

        self.batchNorm = tf.keras.layers.BatchNormalization(name='BatchNormalization')
        self.Layer1 = tf.keras.layers.Dense(20, activation=tf.nn.relu,name='dense1')
        self.Layer2 = tf.keras.layers.Dense(20, activation=tf.nn.relu, name='dense2')
        self.Layer3 = tf.keras.layers.Dense(10, activation=tf.nn.relu, name='dense3')
        # Layer name 'outout' is kept as-is so existing weight files still match.
        self.Output = tf.keras.layers.Dense(self.TARGETS, activation=tf.nn.softmax, name='outout',
                                            bias_initializer=bias_initializer)

    def call(self, inputs, training=None, mask=None):
        """Run the forward pass and return the class probabilities."""
        # BatchNormalization behaves differently in training and inference,
        # so the flag is forwarded explicitly.
        x = self.batchNorm(inputs, training=training)
        x = self.Layer1(x)
        x = self.Layer2(x)
        x = self.Layer3(x)
        x = self.Output(x)

        return x

In [None]:
# Alternative Sequential variant: model = genModel()
model = SimpleModel()

# Build with the feature count of the prepared data instead of a hard-coded
# width, so the weights always match `train_features`.
model.build((None, INPUTS))
model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[
        'accuracy',
        tf.keras.metrics.Recall(name="Recall"),
        tf.keras.metrics.Precision(name="Precision"),
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
model.summary()

In [None]:
# Evaluate the model on the training data in batches of 10000 rows.
results = model.evaluate(train_features, train_lables, batch_size=10000)
# The first entry of `results` is printed as the loss; the remaining entries
# presumably follow the order of the compiled metrics.
print("Loss: {:0.4f}".format(results[0]))

In [None]:
# Store the current weights as the "epoch 0" checkpoint.
model.save_weights(checkpoint_path.format(epoch=0))

#print(features.iloc[0],lables.iloc[0])

# Train for 10 epochs with validation after each epoch. Only the TensorBoard
# callback is passed here; cp_callback and early_stop_callback are not used,
# and class weights are disabled.
history = model.fit(
    x = train_features,
    y = train_lables,
    batch_size=20000,
    epochs= 10,
    shuffle=True,
    callbacks=[tensorboard_callback],
    #callbacks= [early_stop_callback],
    # class_weight=class_weights,
    validation_data=(val_features,val_lables),
)

In [None]:
# Left: validation metrics per epoch. Right: their epoch-to-epoch change.
fig, ax = plt.subplots(ncols=2, figsize=(15, 7))
ax[0].set_title('development plot')
ax[1].set_title('change plot')

for metric, values in history.history.items():
    # Only validation metrics other than the loss are plotted.
    if "loss" in metric or 'val' not in metric:
        continue
    ax[0].plot(values, label=metric)
    # np.diff gives values[1:] - values[:-1].
    ax[1].plot(np.diff(values), label=metric)

# plt.legend() would only label the last axes; add a legend to each panel.
for axis in ax:
    axis.set_xlabel('epoch')
    axis.legend()
plt.show()

In [None]:
# Pick `amount` evenly spaced validation rows and compare predictions with labels.
amount = 100
indize = np.linspace(0, len(val_features) - 1, num=amount, dtype=int)

evaluation_data = val_features.iloc[indize]
evaluation_label = val_lables.iloc[indize]

evaluation_predictions = model.predict(evaluation_data)

# Convert one-hot labels / probabilities to class indices with numpy so that
# plain arrays are handed to scikit-learn.
true_classes = np.argmax(evaluation_label.to_numpy(), axis=1)
predicted_classes = np.argmax(evaluation_predictions, axis=1)

# Passing `labels` keeps the matrix at (n_classes x n_classes) even when a
# class is missing from this small sample. Rows are true classes, columns
# are predicted classes.
cunf_matrix = confusion_matrix(
    true_classes,
    predicted_classes,
    labels=np.arange(evaluation_label.shape[1]),
)
print(cunf_matrix)

In [None]:
# Plot a short stretch of one vessel's track on a world map.
vesselForPlot = train_features[train_features["mmsi"]==215151145083937.0].sort_values(by=["timestamp"])
print(vesselForPlot.loc[vesselForPlot["distance_from_port"]==0.0])
startTime = 0   # first row (by time) to plot
endtime = 10    # last row (by time) to plot, inclusive
error = 4       # margin in degrees added around the track
vesselForPlot = vesselForPlot.iloc[startTime:endtime+1]

minLat, maxLat = vesselForPlot["lat"].min()-error,vesselForPlot["lat"].max()+error
minLon, maxLon = vesselForPlot["lon"].min()-error,vesselForPlot["lon"].max()+error
# After slicing, the window's start/end are the first/last rows of the frame.
# Indexing with startTime/endtime again would be wrong for startTime != 0 or
# for vessels with fewer rows than requested.
print(turnUnixTimeToDate(vesselForPlot.iloc[0]["timestamp"]))
print(turnUnixTimeToDate(vesselForPlot.iloc[-1]["timestamp"]))

fig, ax = plt.subplots(figsize=(7,7))
# NOTE(review): `gpd.datasets` is deprecated in recent geopandas releases and
# may be unavailable — confirm against the installed version.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

world.plot(ax=ax)
# Map coordinates are x = longitude, y = latitude.
ax.scatter(x=vesselForPlot["lon"], y=vesselForPlot["lat"], s=10, c="r")

ax.set_xlim([minLon, maxLon])
ax.set_ylim([minLat, maxLat])
ax.set_xlabel("lon")
ax.set_ylabel("lat")
plt.show()

In [None]:
# Start TensorBoard inside the notebook, reading the runs below logs/fit.
%tensorboard --logdir logs/fit

# Address printed for convenience. NOTE(review): presumably TensorBoard's
# default local address — verify if another port was chosen.
print("127.0.0.1:6006")