# Laboratory work #3

Import all significant libraries for this project.

In [25]:
# Import TensorFlow & Keras Libraries
import tensorflow as tf
from keras.layers import Dense, Flatten, Input
from keras.layers import Conv2D, RNN
from keras.losses import CategoricalCrossentropy
from keras.optimizers import Adam
from keras.models import Model

# Import scikit-learn libraries
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Import libraries for text cleaning
import re
import string
import nltk
from nltk.corpus import stopwords

# Import other libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Who\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Exercise 1

#### Downloading Dataset
Our dataset is Possible Asteroid Impacts with Earth (from [kaggle](https://www.kaggle.com/datasets/nasa/asteroid-impacts))

### Preprocessing Dataset

In [3]:
# Loading dataset to pandas DataFrame
asteroid_df = pd.read_csv("datasets\\asteroid_dataset\\asteroid_classification.csv")

In [4]:
# Drop unnecessary columns for analysis from the dataframe
asteroid_df.drop(["Object Name", "Epoch (TDB)", "Perihelion Argument (deg)", "Node Longitude (deg)",
                  "Mean Anomoly (deg)", "Perihelion Distance (AU)", "Aphelion Distance (AU)",
                  "Minimum Orbit Intersection Distance (AU)", "Orbital Reference"], axis=1, inplace=True)
print(asteroid_df.columns)

Index(['Object Classification', 'Orbit Axis (AU)', 'Orbit Eccentricity',
       'Orbit Inclination (deg)', 'Orbital Period (yr)', 'Asteroid Magnitude'],
      dtype='object')


In [5]:
# Drop null values of dataframe as we have only one null value
asteroid_df.dropna(inplace=True)
asteroid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15634 entries, 0 to 15634
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Object Classification    15634 non-null  object 
 1   Orbit Axis (AU)          15634 non-null  float64
 2   Orbit Eccentricity       15634 non-null  float64
 3   Orbit Inclination (deg)  15634 non-null  float64
 4   Orbital Period (yr)      15634 non-null  float64
 5   Asteroid Magnitude       15634 non-null  float64
dtypes: float64(5), object(1)
memory usage: 855.0+ KB


In [6]:
# Change class names
asteroid_df["Object Classification"].mask(asteroid_df["Object Classification"] == "Apollo Asteroid" ,
                                          "Apollo", inplace=True)
asteroid_df["Object Classification"].mask(asteroid_df["Object Classification"] == "Aten Asteroid",
                                          "Aten", inplace=True)
asteroid_df["Object Classification"].mask(asteroid_df["Object Classification"] == "Amor Asteroid",
                                          "Amor", inplace=True)
asteroid_df["Object Classification"].mask(asteroid_df["Object Classification"] == "Apohele Asteroid",
                                          "Apohele", inplace=True)
# Drop unnecessary class
necessary_class = ["Apollo", "Aten", "Amor"]
asteroid_df = asteroid_df[asteroid_df["Object Classification"].isin(necessary_class)]

In [7]:
def remove_outliers(dataframe, features):
    """Function to remove the outliers;
    :param dataframe: pandas DataFrame with data;
    :param features: list with features from dataframe.
    """
    # Copy dataframe to another variable
    dataframe_copy = dataframe.copy()

    # Iterate through features
    for feature in features:
        if dataframe[feature].dtype == object:
            continue
        # Calculate q1, q3 and iqr
        q3 = dataframe[feature].quantile(0.75)
        q1 = dataframe[feature].quantile(0.25)
        iqr = q3 - q1

        # Get local minimum and maximum
        local_min = q1 - (1.5 * iqr)
        local_max = q3 + (1.5 * iqr)

        # Remove the outliers
        dataframe_copy = dataframe_copy[(dataframe_copy[feature] >= local_min) &
                                        (dataframe_copy[feature] <= local_max)]

    return dataframe_copy

In [8]:
# Extract features from the df
asteroid_features = asteroid_df.columns.tolist()
print(asteroid_features)

['Object Classification', 'Orbit Axis (AU)', 'Orbit Eccentricity', 'Orbit Inclination (deg)', 'Orbital Period (yr)', 'Asteroid Magnitude']


In [9]:
# Remove outliers from the dataframe
asteroid_df = remove_outliers(asteroid_df, asteroid_features)

In [10]:
# View count of class names
print(asteroid_df.iloc[:, 0].value_counts())

Apollo    6651
Amor      5686
Aten       965
Name: Object Classification, dtype: int64


In [11]:
# Define number of classes
num_classes = len(necessary_class)

In [12]:
# Normalise dataset
norm_asteroid_df = asteroid_df.copy()
# apply normalization techniques
for column in norm_asteroid_df:
    if norm_asteroid_df[column].dtype == object:
        continue
    norm_asteroid_df[column] = norm_asteroid_df[column] / norm_asteroid_df[column].abs().max()
# View normalised dataset
print(norm_asteroid_df.head())

  Object Classification  Orbit Axis (AU)  Orbit Eccentricity  \
1                  Amor         0.807646            0.590282   
2                  Amor         0.758732            0.610967   
4                  Amor         0.587438            0.469295   
8                  Amor         0.570204            0.427279   
9                Apollo         0.418531            0.469619   

   Orbit Inclination (deg)  Orbital Period (yr)  Asteroid Magnitude  
1                 0.296677             0.727119            0.492063  
2                 0.239656             0.661017            0.425397  
4                 0.304293             0.450847            0.561905  
8                 0.216477             0.430508            0.419048  
9                 0.240286             0.271186            0.451746  


In [13]:
# Edit name of column "Object Classification" with _
# For using this name in next cell
number_asteroid_df = norm_asteroid_df.copy()
number_asteroid_df.rename(columns={"Object Classification": "Object_Classification"}, inplace=True)
# Replace string class to numbers
obj_class = {"Apollo": 0, "Aten": 1, "Amor": 2}
number_asteroid_df.Object_Classification = [obj_class[item] for item in number_asteroid_df.Object_Classification]
# View new dataset
print(number_asteroid_df.head())

   Object_Classification  Orbit Axis (AU)  Orbit Eccentricity  \
1                      2         0.807646            0.590282   
2                      2         0.758732            0.610967   
4                      2         0.587438            0.469295   
8                      2         0.570204            0.427279   
9                      0         0.418531            0.469619   

   Orbit Inclination (deg)  Orbital Period (yr)  Asteroid Magnitude  
1                 0.296677             0.727119            0.492063  
2                 0.239656             0.661017            0.425397  
4                 0.304293             0.450847            0.561905  
8                 0.216477             0.430508            0.419048  
9                 0.240286             0.271186            0.451746  


In [14]:
# One-hot Encoding the Object Classification Feature
one_hot = OneHotEncoder()
# Copy our dataset
onehot_asteroid_df = norm_asteroid_df.copy()
# Fitting one-hot encoder
encoded = one_hot.fit_transform(onehot_asteroid_df[["Object Classification"]])
onehot_asteroid_df[one_hot.categories_[0]] = encoded.toarray()
# Drop unnecessary "Object Classification" feature
onehot_asteroid_df.drop(["Object Classification"], axis=1, inplace=True)
print(onehot_asteroid_df.head())

   Orbit Axis (AU)  Orbit Eccentricity  Orbit Inclination (deg)  \
1         0.807646            0.590282                 0.296677   
2         0.758732            0.610967                 0.239656   
4         0.587438            0.469295                 0.304293   
8         0.570204            0.427279                 0.216477   
9         0.418531            0.469619                 0.240286   

   Orbital Period (yr)  Asteroid Magnitude  Amor  Apollo  Aten  
1             0.727119            0.492063   1.0     0.0   0.0  
2             0.661017            0.425397   1.0     0.0   0.0  
4             0.450847            0.561905   1.0     0.0   0.0  
8             0.430508            0.419048   1.0     0.0   0.0  
9             0.271186            0.451746   0.0     1.0   0.0  


In [15]:
# Change data type in one-hot encoded column
column_dtype_dict = {"Amor": int,
                     "Apollo": int,
                     "Aten": int}
norm_onehot_asteroid_df = onehot_asteroid_df.astype(column_dtype_dict)
print(norm_onehot_asteroid_df.dtypes)
print(norm_onehot_asteroid_df.head())

Orbit Axis (AU)            float64
Orbit Eccentricity         float64
Orbit Inclination (deg)    float64
Orbital Period (yr)        float64
Asteroid Magnitude         float64
Amor                         int32
Apollo                       int32
Aten                         int32
dtype: object
   Orbit Axis (AU)  Orbit Eccentricity  Orbit Inclination (deg)  \
1         0.807646            0.590282                 0.296677   
2         0.758732            0.610967                 0.239656   
4         0.587438            0.469295                 0.304293   
8         0.570204            0.427279                 0.216477   
9         0.418531            0.469619                 0.240286   

   Orbital Period (yr)  Asteroid Magnitude  Amor  Apollo  Aten  
1             0.727119            0.492063     1       0     0  
2             0.661017            0.425397     1       0     0  
4             0.450847            0.561905     1       0     0  
8             0.430508            0.419048 

### Split Dataset to Train & Test sets

In [20]:
# Split Categorical Dataset
x = norm_asteroid_df.drop(["Object Classification"], axis=1)
y = norm_asteroid_df["Object Classification"]
# Split to train test sets
catg_X_train, catg_X_test, catg_y_train, catg_y_test = train_test_split(x, y, test_size=0.20)

In [17]:
# Split Numeric Dataset
x = number_asteroid_df.drop(["Object_Classification"], axis=1)
y = number_asteroid_df["Object_Classification"]
# Split to train test sets
num_X_train, num_X_test, num_y_train, num_y_test = train_test_split(x, y, test_size=0.20)

In [41]:
# Split One-Hot Dataset
x = norm_onehot_asteroid_df.drop(["Apollo", "Aten", "Amor"], axis=1)
y = norm_onehot_asteroid_df[["Apollo", "Aten", "Amor"]]
# Split to train test sets
oneh_X_train, oneh_X_valid, oneh_X_test = np.split(x.sample(frac=1), [int(0.8*len(x)), int(0.9*len(x))])
oneh_y_train, oneh_y_valid, oneh_y_test = np.split(y.sample(frac=1), [int(0.8*len(y)), int(0.9*len(y))])

In [42]:
def xy_df_to_dataset(x, y, shuffle=True, batch_size=32):
    """
    Function for converting dataframe var to tf dataset.
    :param x:
    :param y:
    :param shuffle:
    :param batch_size:
    :return:
    """
    df = x.copy()
    labels = y.copy()
    df = {key: value[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [43]:
# Define batch size variable
batch_size = 32
# Get TensorFlow Dataset object
train_ds = xy_df_to_dataset(oneh_X_train, oneh_y_train, batch_size=batch_size)
valid_ds = xy_df_to_dataset(oneh_X_valid, oneh_y_valid, shuffle=False, batch_size=batch_size)
test_ds = xy_df_to_dataset(oneh_X_test, oneh_y_test, shuffle=False, batch_size=batch_size)

  df = {key: value[:, tf.newaxis] for key, value in df.items()}
  df = {key: value[:, tf.newaxis] for key, value in df.items()}
  df = {key: value[:, tf.newaxis] for key, value in df.items()}


### Build model

In [44]:
def model_compile(model, learn_rate=0.001):
    """
    Help function for compiling model;
    :param model: built model;
    :param learn_rate: learning rate for optimizer;
    """
    model.compile(loss=CategoricalCrossentropy,
                  optimizer=Adam(learning_rate=learn_rate),
                  metrics=["accuracy"])

In [45]:
# Set Input layer for input data
input_shape = len(asteroid_features)-1
inputs = Input(shape=(input_shape,))

In [46]:
# Set Dense (Fully Connected) layer with 100 hidden unit and "relu" activation function
x = Dense(100, activation="relu")(inputs)
# Set another Dense layer with 10 hidden unit and "relu" activation function
x = Dense(10, activation="relu")(x)
# Set output layer with num_classes hidden unit and "sigmoid" activation function
outputs = Dense(num_classes, activation="sigmoid")(x)

In [47]:
# Define our model
onehot_model = Model(inputs, outputs, name="onehot_model")

In [48]:
# View summary of the model
onehot_model.summary()

Model: "onehot_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 5)]               0         
                                                                 
 dense_3 (Dense)             (None, 100)               600       
                                                                 
 dense_4 (Dense)             (None, 10)                1010      
                                                                 
 dense_5 (Dense)             (None, 3)                 33        
                                                                 
Total params: 1,643
Trainable params: 1,643
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Compile the model
model_compile(onehot_model)

### Fit model

In [50]:
# Fitting the model
onehot_history = onehot_model.fit(train_ds,
                                  batch_size=batch_size,
                                  epochs=10,
                                  validation_data=valid_ds,
                                  use_multiprocessing=True)

Epoch 1/10


ValueError: in user code:

    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\input_spec.py", line 197, in assert_input_compatibility
        raise ValueError(

    ValueError: Missing data for input "input_4". You passed a data dictionary with keys ['Orbit Axis (AU)', 'Orbit Eccentricity', 'Orbit Inclination (deg)', 'Orbital Period (yr)', 'Asteroid Magnitude']. Expected the following keys: ['input_4']


### Evaluate model

In [37]:
loss, accuracy = onehot_model.evaluate(oneh_X_test, oneh_y_test)
print(f"Model loss on the test set: {loss}")
print(f"Model accuracy on the test set = {accuracy}")

TypeError: in user code:

    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 1727, in test_function  *
        return step_function(self, iterator)
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 1713, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 1701, in run_step  **
        outputs = model.test_step(data)
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 1667, in test_step
        self.compute_loss(x, y, y_pred, sample_weight)
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\losses.py", line 158, in __call__
        return losses_utils.compute_weighted_loss(
    File "D:\.main\.code\data_analysis_labs\venv\lib\site-packages\keras\utils\losses_utils.py", line 328, in compute_weighted_loss
        losses = tf.convert_to_tensor(losses)

    TypeError: Failed to convert elements of <keras.losses.CategoricalCrossentropy object at 0x0000026144355600> to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.


## Exercise 2

## Exercise 3

#### Downloading Dataset
Our dataset is Emotion Detection from Text (from [kaggle](https://www.kaggle.com/datasets/pashupatigupta/emotion-detection-from-text?resource=download))