In [76]:
import os
import math
from zipfile import ZipFile
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup
import pickle

In [77]:
df = pd.read_csv("nvdatabase_clean.csv", low_memory=False)
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df.sample(5)

Unnamed: 0,user_id,subject,elites,nvresponses,campaigner violence,repressive violence,rating,endyear,method,method_id,theme,classification,country
18860,1114,carleton university students win divestment fr...,no,no,no,no,10.0,1987,007 slogans caricatures and symbols,7,national-ethnic identity,change,canada
6808,499,international groups boycott nestle products t...,no,no,no,no,10.0,1984,050 teach-ins,50,human rights,change,united states
3786,270,greek citizens protest austerity package 2011,no,no,yes,yes,3.5,2011,048 protest meetings,48,economic justice,defense,greece
11612,729,milanese catholics and bishop ambrose defend t...,yes,no,no,no,10.0,300,001 public speeches,1,human rights,defense,italy
9456,640,mexicans in chihuahua protest electoral fraud ...,yes,no,no,yes,5.0,1988,172 nonviolent obstruction,172,human rights,change,mexico


In [78]:
users = df[["user_id", "subject", "country", "theme", "classification", "elites", "nvresponses", "campaigner violence", "repressive violence"]]
users = users.drop_duplicates(subset=['user_id'], keep='first')
users = users.rename(columns={'campaigner violence': 'campaigner_violence', 'repressive violence': 'repressive_violence'})
users = users.reset_index(drop=True)

ratings = df[["user_id", "method_id", "rating", "endyear"]]
ratings = ratings.drop_duplicates(keep='first')
ratings = ratings.reset_index(drop=True)

methods = pd.read_csv("nvdatabase_methods.csv", low_memory=False)
methods = methods[["method_id", "method", "genres"]]
#methods['genres'] = methods['genres'].fillna("unknown")
users.head(5)

Unnamed: 0,user_id,subject,country,theme,classification,elites,nvresponses,campaigner_violence,repressive_violence
0,0,atlanta unions campaign to unionize atlanta ol...,united states,democracy,change,no,no,no,no
1,1,chinese migrant workers protest for equal civi...,china,economic justice,change,no,no,yes,yes
2,2,university of missouri students protest agains...,united states,human rights,change,yes,no,no,yes
3,3,argentinian police force strike for better pay...,argentina,economic justice,change,no,no,no,no
4,4,torres strait soldiers stage stayathome strike...,australia,economic justice,change,no,no,no,no


In [79]:
users["user_id"] = users["user_id"].apply(lambda x: f"user_{x}")
#users["elites"] = users["elites"].apply(lambda x: f"elites_{x}")
#users["nvresponses"] = users["nvresponses"].apply(lambda x: f"nvresponses_{x}")
#users["campaigner violence"] = users["campaigner violence"].apply(lambda x: f"campaignerviolence_{x}")
#users["repressive violence"] = users["repressive violence"].apply(lambda x: f"repressiveviolence_{x}")
#users["subject"] = users["subject"].apply(lambda x: f"subject_{x}")
#users["clusters"] = users["clusters"].apply(lambda x: f"clusters_{x}")
#users["country"] = users["country"].apply(lambda x: f"country_{x}")
#users["themes"] = users["themes"].apply(lambda x: f"themes_{x}")
#users["classifications"] = users["classifications"].apply(lambda x: f"classifications_{x}")

methods["method_id"] = methods["method_id"].apply(lambda x: f"method_{x}")

ratings["method_id"] = ratings["method_id"].apply(lambda x: f"method_{x}")
ratings["user_id"] = ratings["user_id"].apply(lambda x: f"user_{x}")
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

ratings.sample(5)

Unnamed: 0,user_id,method_id,rating,endyear
4782,user_634,method_16,5.5,1970
5460,user_700,method_19,10.0,2005
648,user_82,method_120,5.5,1952
4714,user_623,method_37,10.0,2007
3528,user_474,method_3,4.0,2010


In [80]:
genres = [
    "methods of nonviolent protest and persuasion",
    "formal statements",
    "communications with a wider audience",
    "group representations",
    "symbolic public acts",
    "pressures on individuals",
    "drama and music",
    "processions",
    "honoring the dead",
    "public assemblies",
    "withdrawal and renunciation", 
    "methods of noncooperation",
    "social noncooperation",
    "ostracism of persons",
    "noncooperation with social events, customs and institutions",
    "withdrawal from the social system",
    "economic noncooperation boycotts",
    "action by consumers",
    "action by workers and producers",
    "action by middlemen",
    "action by owners and management",
    "action by holders of financial resources",
    "action by governments",
    "economic noncooperation strikes",
    "symbolic strikes",
    "agricultural strikes",
    "strikes by special groups",
    "ordinary industrial strikes",
    "restricted strikes",
    "multi-industry strikes",
    "combination of strikes and economic closures",
    "political noncooperation",
    "rejection of authority",
    "citizens’ noncooperation with government",
    "citizens’ alternatives to obedience",
    "action by government personnel",
    "domestic governmental action",
    "international governmental action",  
    "methods of nonviolent intervention",
    "psychological intervention",
    "physical intervention",
    "social intervention",
    "economic intervention",
    "political intervention", 
    "additional methods",]

for genre in genres:
    methods[genre] = methods["genres"].apply(lambda values: int(genre in values.split("|")))

methods.head(200)

Unnamed: 0,method_id,method,genres,methods of nonviolent protest and persuasion,formal statements,communications with a wider audience,group representations,symbolic public acts,pressures on individuals,drama and music,processions,honoring the dead,public assemblies,withdrawal and renunciation,methods of noncooperation,social noncooperation,ostracism of persons,"noncooperation with social events, customs and institutions",withdrawal from the social system,economic noncooperation boycotts,action by consumers,action by workers and producers,action by middlemen,action by owners and management,action by holders of financial resources,action by governments,economic noncooperation strikes,symbolic strikes,agricultural strikes,strikes by special groups,ordinary industrial strikes,restricted strikes,multi-industry strikes,combination of strikes and economic closures,political noncooperation,rejection of authority,citizens’ noncooperation with government,citizens’ alternatives to obedience,action by government personnel,domestic governmental action,international governmental action,methods of nonviolent intervention,psychological intervention,physical intervention,social intervention,economic intervention,political intervention,additional methods
0,method_1,001 public speeches,methods of nonviolent protest and persuasion|f...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,method_2,002 letters of opposition or support,methods of nonviolent protest and persuasion|f...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,method_3,003 declarations by organizations and institut...,methods of nonviolent protest and persuasion|f...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,method_4,004 signed public statements,methods of nonviolent protest and persuasion|f...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,method_5,005 declarations of indictment and intention,methods of nonviolent protest and persuasion|f...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,method_6,006 group or mass petitions,methods of nonviolent protest and persuasion|f...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,method_7,007 slogans caricatures and symbols,methods of nonviolent protest and persuasion|c...,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,method_8,008 banners posters and displayed communications,methods of nonviolent protest and persuasion|c...,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,method_9,009 leaflets pamphlets and books,methods of nonviolent protest and persuasion|c...,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,method_10,010 newspapers and journals,methods of nonviolent protest and persuasion|c...,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [81]:
ratings_group = ratings.sort_values(by=["endyear"]).groupby("user_id")

ratings_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        "method_ids": list(ratings_group.method_id.apply(list)),
        "ratings": list(ratings_group.rating.apply(list)),
        "endyear": list(ratings_group.endyear.apply(list)),})
ratings_data.head(10)

Unnamed: 0,user_id,method_ids,ratings,endyear
0,user_0,"[method_4, method_1, method_5, method_16, meth...","[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, ...","[1993, 1993, 1993, 1993, 1993, 1993, 1993, 199..."
1,user_1,"[method_39, method_38, method_1]","[2.0, 2.0, 2.0]","[2011, 2011, 2011]"
2,user_10,"[method_6, method_37, method_34, method_2, met...","[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, ...","[2013, 2013, 2013, 2013, 2013, 2013, 2013, 201..."
3,user_100,"[method_151, method_171, method_47, method_80,...","[4.5, 4.5, 4.5, 4.5, 4.5, 4.5]","[1955, 1955, 1955, 1955, 1955, 1955]"
4,user_1000,"[method_11, method_14, method_15, method_172, ...","[10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10....","[2008, 2008, 2008, 2008, 2008, 2008, 2008, 200..."
5,user_1001,"[method_159, method_15, method_89, method_47]","[7.0, 7.0, 7.0, 7.0]","[2003, 2003, 2003, 2003]"
6,user_1002,"[method_47, method_8, method_7]","[9.0, 9.0, 9.0]","[2010, 2010, 2010]"
7,user_1003,"[method_34, method_124, method_172, method_92,...","[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[2010, 2010, 2010, 2010, 2010, 2010, 2010, 201..."
8,user_1004,"[method_106, method_105]","[8.0, 8.0]","[2005, 2005]"
9,user_1005,"[method_159, method_2, method_13, method_34]","[9.0, 9.0, 9.0, 9.0]","[2011, 2011, 2011, 2011]"


In [82]:
sequence_length = 4
step_size = 2


def create_sequences(values, window_size, step_size):
    sequences = []
    start_index = 0
    while True:
        end_index = start_index + window_size
        seq = values[start_index:end_index]
        if len(seq) < window_size:
            seq = values[-window_size:]
            if len(seq) == window_size:
                sequences.append(seq)
            break
        sequences.append(seq)
        start_index += step_size
    return sequences


ratings_data.method_ids = ratings_data.method_ids.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

ratings_data.ratings = ratings_data.ratings.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

del ratings_data["endyear"]

In [83]:
ratings_data_methods = ratings_data[["user_id", "method_ids"]].explode(
    "method_ids", ignore_index=True
)
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)
ratings_data_transformed = pd.concat([ratings_data_methods, ratings_data_rating], axis=1)
ratings_data_transformed = ratings_data_transformed.join(
    users.set_index("user_id"), on="user_id"
)
ratings_data_transformed.sample(5)

Unnamed: 0,user_id,method_ids,ratings,subject,country,theme,classification,elites,nvresponses,campaigner_violence,repressive_violence
3329,user_722,"[method_38, method_39, method_23, method_159]","[10.0, 10.0, 10.0, 10.0]",us national womans party campaigns for suffrag...,united states,democracy,change,no,no,no,yes
1347,user_269,"[method_162, method_37, method_16, method_8]","[5.0, 5.0, 5.0, 5.0]",italian students protest austerity education r...,italy,economic justice,defense,yes,no,no,no
3762,user_8,"[method_47, method_38, method_8, method_1]","[7.0, 7.0, 7.0, 7.0]",south african students demand zero percent fee...,south africa,democracy,change,no,no,yes,yes
2459,user_537,"[method_1, method_5, method_82, method_200]","[10.0, 10.0, 10.0, 10.0]",south koreans stop plan for nuclear waste dump...,south korea,environment,defense,no,no,no,yes
4294,user_898,"[method_38, method_48, method_97, method_104]","[6.0, 6.0, 6.0, 6.0]",the force ouvrière labor union strikes for eco...,wallis and futuna,economic justice,change,no,yes,yes,no


In [84]:
ratings_data_transformed.dropna(subset = ["method_ids"], inplace=True)
ratings_data_transformed = ratings_data_transformed.reset_index(drop=True)
ratings_data_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4463 entries, 0 to 4462
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   user_id              4463 non-null   object
 1   method_ids           4463 non-null   object
 2   ratings              4463 non-null   object
 3   subject              4463 non-null   object
 4   country              4463 non-null   object
 5   theme                4463 non-null   object
 6   classification       4463 non-null   object
 7   elites               4463 non-null   object
 8   nvresponses          4463 non-null   object
 9   campaigner_violence  4463 non-null   object
 10  repressive_violence  4463 non-null   object
dtypes: object(11)
memory usage: 383.7+ KB


In [85]:
ratings_data_transformed.method_ids = ratings_data_transformed.method_ids.apply(
    lambda x: ",".join(x)
)
ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply(
    lambda x: ",".join([str(v) for v in x])
)

#del ratings_data_transformed["zip_code"]

ratings_data_transformed.rename(
    columns={"method_ids": "sequence_method_ids", "ratings": "sequence_ratings"},
    inplace=True,
)

In [86]:
random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]

train_data.to_csv("train_data.csv", index=False, sep="|", header=False)
test_data.to_csv("test_data.csv", index=False, sep="|", header=False)

## Define metadata

In [87]:
CSV_HEADER = list(ratings_data_transformed.columns)

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
"method_id": list(methods.method_id.unique()),
"user_id": list(users.user_id.unique()),
"subject": list(users.subject.unique()), 
"country": list(users.country.unique()), 
"theme": list(users.theme.unique()), 
"classification": list(users.classification.unique()), 
"elites": list(users.elites.unique()), 
"nvresponses": list(users.nvresponses.unique()), 
"campaigner_violence": list(users.campaigner_violence.unique()), 
"repressive_violence": list(users.repressive_violence.unique()),
}

USER_FEATURES = ["subject", "country", "theme", "classification", "elites", "nvresponses", "campaigner_violence", "repressive_violence"]

METHOD_FEATURES = ["genres"]

## Create `tf.data.Dataset` for training and evaluation

In [88]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    def process(features):
        method_ids_string = features["sequence_method_ids"]
        sequence_method_ids = tf.strings.split(method_ids_string, ",").to_tensor()

        # The last method id in the sequence is the target method.
        features["target_method_id"] = sequence_method_ids[:, -1]
        features["sequence_method_ids"] = sequence_method_ids[:, :-1]

        ratings_string = features["sequence_ratings"]
        sequence_ratings = tf.strings.to_number(
            tf.strings.split(ratings_string, ","), tf.dtypes.float32
        ).to_tensor()

        # The last rating in the sequence is the target for the model to predict.
        target = sequence_ratings[:, -1]
        features["sequence_ratings"] = sequence_ratings[:, :-1]

        return features, target

    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    ).map(process)

    return dataset


## Create model inputs

In [89]:

def create_model_inputs():
    return {
        "user_id": layers.Input(name="user_id", shape=(1,), dtype=tf.string),
        "sequence_method_ids": layers.Input(name="sequence_method_ids", shape=(sequence_length - 1,), dtype=tf.string),
        "target_method_id": layers.Input(name="target_method_id", shape=(1,), dtype=tf.string),
        "sequence_ratings": layers.Input(name="sequence_ratings", shape=(sequence_length - 1,), dtype=tf.float32),
        "subject": layers.Input(name="subject", shape=(1,), dtype=tf.string),
        "country": layers.Input(name="country", shape=(1,), dtype=tf.string),
        "theme": layers.Input(name="theme", shape=(1,), dtype=tf.string),
        "classification": layers.Input(name="classification", shape=(1,), dtype=tf.string),
        "elites": layers.Input(name="elites", shape=(1,), dtype=tf.string),
        "nvresponses": layers.Input(name="nvresponses", shape=(1,), dtype=tf.string),
        "campaigner_violence": layers.Input(name="campaigner_violence", shape=(1,), dtype=tf.string),
        "repressive_violence": layers.Input(name="repressive_violence", shape=(1,), dtype=tf.string),
    }

## Encode input features

The `encode_input_features` method works as follows:

1. Each categorical user feature is encoded using `layers.Embedding`, with embedding
dimension equals to the square root of the vocabulary size of the feature.
The embeddings of these features are concatenated to form a single input tensor.

2. Each movie in the movie sequence and the target movie is encoded `layers.Embedding`,
where the dimension size is the square root of the number of movies.

3. A multi-hot genres vector for each movie is concatenated with its embedding vector,
and processed using a non-linear `layers.Dense` to output a vector of the same movie
embedding dimensions.

4. A positional embedding is added to each movie embedding in the sequence, and then
multiplied by its rating from the ratings sequence.

5. The target movie embedding is concatenated to the sequence movie embeddings, producing
a tensor with the shape of `[batch size, sequence length, embedding size]`, as expected
by the attention layer for the transformer architecture.

6. The method returns a tuple of two elements:  `encoded_transformer_features` and
`encoded_other_features`.

In [90]:
def encode_input_features(
    inputs,
    include_user_id=True,
    include_user_features=True,
    include_method_features=True,
):

    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append("user_id")
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    ## Encode user features
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)(
            inputs[feature_name]
        )
        # Compute embedding dimensions
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # Create an embedding layer with the specified dimensions.
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    ## Create a single embedding vector for the user features
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Create a method embedding encoder
    method_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["method_id"]
    method_embedding_dims = int(math.sqrt(len(method_vocabulary)))
    # Create a lookup to convert string values to integer indices.
    method_index_lookup = StringLookup(
        vocabulary=method_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name="method_index_lookup",
    )
    # Create an embedding layer with the specified dimensions.
    method_embedding_encoder = layers.Embedding(
        input_dim=len(method_vocabulary),
        output_dim=method_embedding_dims,
        name=f"method_embedding",
    )
    # Create a vector lookup for method genres.
    genre_vectors = methods[genres].to_numpy()
    method_genres_lookup = layers.Embedding(
        input_dim=genre_vectors.shape[0],
        output_dim=genre_vectors.shape[1],
        embeddings_initializer=tf.keras.initializers.Constant(genre_vectors),
        trainable=False,
        name="genres_vector",
    )
    # Create a processing layer for genres.
    method_embedding_processor = layers.Dense(
        units=method_embedding_dims,
        activation="relu",
        name="process_method_embedding_with_genres",
    )

    ## Define a function to encode a given method id.
    def encode_method(method_id):
        # Convert the string input values into integer indices.
        method_idx = method_index_lookup(method_id)
        method_embedding = method_embedding_encoder(method_idx)
        encoded_method = method_embedding
        if include_method_features:
            method_genres_vector = method_genres_lookup(method_idx)
            encoded_method = method_embedding_processor(
                layers.concatenate([method_embedding, method_genres_vector])
            )
        return encoded_method

    ## Encoding target_method_id
    target_method_id = inputs["target_method_id"]
    encoded_target_method = encode_method(target_method_id)

    ## Encoding sequence method_ids.
    sequence_methods_ids = inputs["sequence_method_ids"]
    encoded_sequence_methods = encode_method(sequence_methods_ids)
    # Create positional embedding.
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=method_embedding_dims,
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encodded_positions = position_embedding_encoder(positions)
    # Retrieve sequence ratings to incorporate them into the encoding of the method.
    sequence_ratings = tf.expand_dims(inputs["sequence_ratings"], -1)
    # Add the positional encoding to the method encodings and multiply them by rating.
    encoded_sequence_methods_with_poistion_and_rating = layers.Multiply()(
        [(encoded_sequence_methods + encodded_positions), sequence_ratings]
    )

    # Construct the transformer inputs.
    for encoded_method in tf.unstack(
        encoded_sequence_methods_with_poistion_and_rating, axis=1
    ):
        encoded_transformer_features.append(tf.expand_dims(encoded_method, 1))
    encoded_transformer_features.append(encoded_target_method)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1
    )

    return encoded_transformer_features, encoded_other_features


## Create a BST model

In [91]:
include_user_id = False
include_user_features = False
include_method_features = False

hidden_units = [256, 128]
dropout_rate = 0.1
num_heads = 3


def create_model():
    inputs = create_model_inputs()
    transformer_features, other_features = encode_input_features(
        inputs, include_user_id, include_user_features, include_method_features
    )

    # Create a multi-headed attention layer.
    attention_output = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate
    )(transformer_features, transformer_features)

    # Transformer block.
    attention_output = layers.Dropout(dropout_rate)(attention_output)
    x1 = layers.Add()([transformer_features, attention_output])
    x1 = layers.LayerNormalization()(x1)
    x2 = layers.LeakyReLU()(x1)
    x2 = layers.Dense(units=x2.shape[-1])(x2)
    x2 = layers.Dropout(dropout_rate)(x2)
    transformer_features = layers.Add()([x1, x2])
    transformer_features = layers.LayerNormalization()(transformer_features)
    features = layers.Flatten()(transformer_features)

    # Included the other features.
    if other_features is not None:
        features = layers.concatenate(
            [features, layers.Reshape([other_features.shape[-1]])(other_features)]
        )

    # Fully-connected layers.
    for num_units in hidden_units:
        features = layers.Dense(num_units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.LeakyReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=1)(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


model = create_model()

  return bool(asarray(a1 == a2).all())


In [92]:
# Compile the model.
model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanAbsoluteError()],
)

# Read the training data.
train_dataset = get_dataset_from_csv("train_data.csv", shuffle=True, batch_size=265)

# Fit the model with the training data.
model.fit(train_dataset, epochs=5)

# Read the test data.
test_dataset = get_dataset_from_csv("test_data.csv", batch_size=265)
test_user = get_dataset_from_csv("test_user.csv", batch_size=265)

# Evaluate the model on the test data.
_, rmse = model.evaluate(test_dataset, verbose=0)
print(f"Test MAE: {round(rmse, 3)}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test MAE: 2.756


In [93]:
y_pred = model.predict(test_dataset)
print(y_pred)

[[ 6.1385484e+00]
 [ 5.6737719e+00]
 [ 7.1088386e+00]
 [ 9.0249515e-01]
 [ 4.2111135e+00]
 [ 4.4524488e+00]
 [ 7.6636081e+00]
 [ 1.9369163e-01]
 [ 1.6544901e-01]
 [ 7.4390340e+00]
 [ 6.0674796e+00]
 [ 5.8100338e+00]
 [ 6.0764737e+00]
 [ 6.6607285e+00]
 [ 6.5107646e+00]
 [ 8.0524263e+00]
 [ 4.2825103e+00]
 [ 8.0404215e+00]
 [ 7.0408292e+00]
 [ 3.7213247e+00]
 [ 3.7969105e+00]
 [ 3.6298106e+00]
 [ 9.9553311e-01]
 [ 7.1969810e+00]
 [ 4.2591399e-01]
 [ 5.4283252e+00]
 [ 5.3757977e+00]
 [ 4.6963590e-01]
 [ 5.7461982e+00]
 [ 7.5069699e+00]
 [ 6.9226012e+00]
 [ 6.4154043e+00]
 [ 5.4283252e+00]
 [ 5.4041209e+00]
 [ 8.0404215e+00]
 [ 7.6636081e+00]
 [ 4.3426585e+00]
 [ 5.8258481e+00]
 [ 3.9749594e+00]
 [ 5.2025394e+00]
 [ 3.7213247e+00]
 [ 5.8342977e+00]
 [ 7.1452088e+00]
 [ 5.8130021e+00]
 [ 7.6707727e-01]
 [ 1.0178797e-01]
 [ 1.3894079e+00]
 [ 8.6734629e-01]
 [ 1.6343377e+00]
 [ 2.5411551e+00]
 [ 4.1981759e+00]
 [ 6.0785551e+00]
 [ 5.2722125e+00]
 [ 4.0958562e+00]
 [ 4.8567805e+00]
 [ 2.69659

In [95]:
#user_id = test_dataset.user_id.sample(1).iloc[0]
#dataset = train_data.filter(filter_f).batch(1)
print(model.predict(test_user, ))

[[5.2196474]]


In [96]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 sequence_method_ids (InputLaye  [(None, 3)]         0           []                               
 r)                                                                                               
                                                                                                  
 method_index_lookup (StringLoo  multiple            0           ['target_method_id[0][0]',       
 kup)                                                             'sequence_method_ids[0][0]']    
                                                                                                  
 method_embedding (Embedding)   multiple             2800        ['method_index_lookup[0][0]',    
                                                                  'method_index_lookup[1][0]

 r)                                                                                               
                                                                                                  
 classification (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 country (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 elites (InputLayer)            [(None, 1)]          0           []                               
                                                                                                  
 nvresponses (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 repressiv

In [94]:
user_id = test_dataset.user_id.sample(1).iloc[0]
dataset = train_data.filter(filter_f).batch(1)
print(model.predict(user_id))

AttributeError: 'MapDataset' object has no attribute 'user_id'

In [None]:
print(model.predict([[
    "user_id": input("user_1"), 
    "sequence_method_ids": "method_39, method_38, method_1", 
    "target_method_id": "method_2",
    "sequence_ratings": "2.0, 2.0, 2.0",
    "subject": "Chinese Migrant Workers Protest for Equal Civil Rights 2011",
    "country": "china",
    "theme": "economic justice",
    "classification": "change"
    "elites": "no",
    "nvresponses": "no",
    "campaigner_violence": "yes",
    "repressive_violence": "yes",
]]))

In [None]:
print(model.predict([{
        "user_id": layers.Input(name="user_id", shape=(1,), dtype=tf.string),
        "sequence_method_ids": layers.Input(name="sequence_method_ids", shape=(sequence_length - 1,), dtype=tf.string),
        "target_method_id": layers.Input(name="target_method_id", shape=(1,), dtype=tf.string),
        "sequence_ratings": layers.Input(name="sequence_ratings", shape=(sequence_length - 1,), dtype=tf.float32),
        "subject": layers.Input(name="subject", shape=(1,), dtype=tf.string),
        "country": layers.Input(name="country", shape=(1,), dtype=tf.string),
        "theme": layers.Input(name="theme", shape=(1,), dtype=tf.string),
        "classification": layers.Input(name="classification", shape=(1,), dtype=tf.string),
        "elites": layers.Input(name="elites", shape=(1,), dtype=tf.string),
        "nvresponses": layers.Input(name="nvresponses", shape=(1,), dtype=tf.string),
        "campaigner_violence": layers.Input(name="campaigner_violence", shape=(1,), dtype=tf.string),
        "repressive_violence": layers.Input(name="repressive_violence", shape=(1,), dtype=tf.string),
    }]))

In [None]:
print(model.predict([{
    "user_id": np.full((1,), input("user_1")),
    "sequence_method_ids": np.full((1,), input("[method_39, method_38, method_1]")), 
    "target_method_id": np.full((1,), input("method_2")),
    "sequence_ratings": np.full((1,), input("[2.0, 2.0, 2.0]")),
    "subject": np.full((1,), input("Chinese Migrant Workers Protest for Equal Civil Rights 2011")),
    "country": np.full((1,), input("china")),
    "theme": np.full((1,), input("economic justice")),
    "classification": np.full((1,), input("change")),
    "elites": np.full((1,), input("no")),
    "nvresponses": np.full((1,), input("no")),
    "campaigner_violence": np.full((1,), input("yes")),
    "repressive_violence": np.full((1,), input("yes")),

}]))

In [None]:
print(model.predict([{
    "user_id": input("user_1"),
    "sequence_method_ids": input("[method_39, method_38, method_1]"), 
    "target_method_id": input("method_2"),
    "sequence_ratings": input("[2.0, 2.0, 2.0]"),
    "subject": input("Chinese Migrant Workers Protest for Equal Civil Rights 2011"),
    "country": input("china"),
    "theme": input("economic justice"),
    "classification": input("change"),
    "elites": input("no"),
    "nvresponses": input("no"),
    "campaigner_violence": input("yes"),
    "repressive_violence": input("yes"),

}]))