In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# KERAS LIBRARIES
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
from keras.optimizers import SGD

# Locations of the two input CSV files (claims and ground truth); both live in
# the same datasets/population directory.
path_to_claims, path_to_groundtruth = (
    os.path.join("datasets", "population", csv_name)
    for csv_name in ("Population_claims.csv", "Population_groundtruth.csv")
)

In [None]:
# Read every claim from the claims CSV into a DataFrame.
claims_input = pd.read_csv(filepath_or_buffer=path_to_claims)

In [5]:
# Peek at five randomly chosen claims.
# Columns: "ObjectID","PropertyID","PropertyValue","SourceID","TimeStamp"
claims_input.sample(n=5)

Unnamed: 0,ObjectID,PropertyID,PropertyValue,SourceID,TimeStamp
33631,bogotá,Population2005,6778691,955285: Mijotoba,
16775,"hutchinson township, minnesota",Population2000,1120,5512121: CapitalBot,
49804,gmina sokółka,Population2006,26406,5277842: Kotbot,
21872,"socorro, new mexico",Population2000,8877,5512121: CapitalBot,
14122,"concord township, michigan",Population2000,2692,5512121: CapitalBot,


In [6]:
# Total number of rows (claims) loaded from the claims file.
print("Number of claims:", claims_input.shape[0])

Number of claims: 49955


In [7]:
# Distinct SourceID values among all the claims.
print("Number of sources:", len({source_id for source_id in claims_input["SourceID"]}))

Number of sources: 4264


In [8]:
# Distinct ObjectID values (cities) among all the claims.
print("Number of cities:", len({object_id for object_id in claims_input["ObjectID"]}))

Number of cities: 41196


In [9]:
# Distinct PropertyID values among all the claims.
print("Number of properties:", len({property_id for property_id in claims_input["PropertyID"]}))

Number of properties: 91


In [10]:
# Read the ground-truth CSV into a DataFrame.
ground_truth = pd.read_csv(filepath_or_buffer=path_to_groundtruth)

In [11]:
# Peek at five randomly chosen ground-truth rows.
ground_truth.sample(n=5)

Unnamed: 0,ObjectID,PropertyID,PropertyValue
203,"perryton, texas",Population2000,7774
204,"milan, michigan",Population2000,4775
88,"summerville, south carolina",Population2000,27752
37,"utica, new york",Population2000,60651
250,"egg harbor, wisconsin",Population2000,250


In [12]:
# Number of rows in the ground-truth file.
print("Size of the test set: ", ground_truth.shape[0])

Size of the test set:  308


In [13]:
# Distinct ObjectID values (cities) covered by the ground truth.
print("Number of cities: ", len({object_id for object_id in ground_truth["ObjectID"]}))

Number of cities:  305


In [14]:
# Distinct PropertyID values covered by the ground truth.
print("Number of years: ", len({property_id for property_id in ground_truth["PropertyID"]}))

Number of years:  3


In [15]:
# To represent the claims we will use a triple (<PropertyID>, <ObjectID>, <PropertyValue>)
# In this way we will have a matrix like: 
#
# "Population2000"-"eagle point, oregon"-"2020302"
# "Population2005"-"eagle point, oregon"-"2022456" 
# "Population2000"-"modesto, california"-"9293219"
# ...
# A claim like this has the meaning of 
# "The source <SourceID> claims that <PropertyID> for <ObjectID> has the value of <PropertyValue>"
#
# In our case this can be translated as:
# "The source <SourceID> claims that the population in the year <PropertyID> for the city <ObjectID> was <PropertyValue>"
#
# The next phase would be the one in which we build our matrix.
# The claim matrix is the following:
#
#              <PID, OID, PV> <PID, OID, PV> <PID, OID, PV> ... <PID, OID, PV>
# <SourceID_1>       1              0              0        ...       0
# <SourceID_2>       1              0              0        ...       0
# <SourceID_3>       0              0              1        ...       0
# <SourceID_4>       0              1              0        ...       0
#     ...           ...            ...            ...       ...      ...
# <SourceID_N>       1              0              0        ...       0
#
#
# We assume that for each claim in the ground truth at least one source says something about it.
# There are only 15 claims, (PID, OID, PV), that are in the ground truth but not in the claims file.
#
# For this dataset we don't have a big number of claims, what can we do?
#  - Data Augmentation: try to produce more data from the claims
#      assuming that certain triples (not considered) are true... (how this would affect the accuracy?)
#  - Data mining: try to find the values for the years of interest online.
#
# How can we proceed if there is no ground truth for a pair (<PropertyID>, <ObjectID>)?
# We can consider the tuple PropertyID-ObjectID from the ground truth and reason in this way:
#  - The triple PropertyID-ObjectID-PropertyValue extracted from the ground truth will have an
#     associated label of 1
#  - Other claims with the same PropertyID-ObjectID can be considered and,
#     because they have a different value from the one in the ground truth
#     the label value will be 0.
# 
# Should we use a range or the exact value?
#   this would not affect the number of considered claims, since we're using the tuple PID, OID for building the sensing
#   matrix, so we can use the exact value.
#
# The input of the network is a vector [S1Ci, S2Ci, S3Ci, ... , SNCi] a column of the sensing matrix, 
#    before the fit of the model we need to reshape the matrix.
# 
# At this point we should build our sensing matrix header.
# The header is made of the triples (<PropertyID>, <ObjectID>, <PropertyValue>).

In [16]:
# Build the ground-truth label vector.
#
# tc_indexes : inverted index mapping a claim id "PropertyID|ObjectID|PropertyValue"
#              to the position of its truth value (1/0) in tc_labels.
# tc_labels  : 1 for a triple stated by the ground-truth file, 0 for a triple taken
#              from the claims file that gives a different value for the same
#              (PropertyID, ObjectID) pair.
# sources    : every source that claims something about a pair covered by the
#              ground truth.
tc_indexes = dict()
sources = set()

# Labels are accumulated in a plain list and converted to an array once at the
# end: np.append returns a fresh copy of the whole array on every call.
tc_label_list = []

for index, row in ground_truth.iterrows():
    true_value = str(row["PropertyValue"])
    tc_id = row["PropertyID"]+"|"+row["ObjectID"]+"|"+true_value
    if tc_id in tc_indexes:
        # Already registered, possibly with label 0 while handling another
        # ground-truth row for the same pair: a ground-truth triple is always true.
        tc_label_list[tc_indexes[tc_id]] = 1
    else:
        tc_indexes[tc_id] = len(tc_label_list)
        tc_label_list.append(1)

    # Every claim about the same (PropertyID, ObjectID) pair, whatever its value.
    same_pair = (claims_input["PropertyID"] == row["PropertyID"]) & (claims_input["ObjectID"] == row["ObjectID"])
    filtered_claims = claims_input[same_pair]

    for idx, claim in filtered_claims.iterrows():
        if str(claim["PropertyValue"]) != true_value:
            # Same pair, different value: register it as a false claim (once).
            tc_id = claim["PropertyID"]+"|"+claim["ObjectID"]+"|"+str(claim["PropertyValue"])
            if tc_id not in tc_indexes:
                tc_indexes[tc_id] = len(tc_label_list)
                tc_label_list.append(0)
        # The source is recorded whether its claim agrees with the ground truth or not.
        sources.add(claim["SourceID"])

tc_labels = np.array(tc_label_list, dtype=int)

print("Number of claims :", tc_labels.size)
print("Number of sources:", len(sources))

Number of claims : 700
Number of sources: 643


In [17]:
# Building the sensing matrix: one row per source, one column per claim in tc_indexes.
# A cell is 1 when the source states that claim, 0 otherwise.

claims_matrix = []

# `sources` is a set, and the iteration order of a set of strings can change from
# one interpreter session to the next (string hashing is randomised). Sorting pins
# the row order, and therefore the order of the network's input features once the
# matrix is transposed. key=str keeps the sort well-defined even if a source id
# is not a string.
for source_num, source in enumerate(sorted(sources, key=str)):  # for each source
    claim_vector = np.zeros(len(tc_indexes), dtype=int)       # one slot per known claim

    source_claims = claims_input[claims_input["SourceID"] == source]
    for index, row in source_claims.iterrows():
        row_claim = row["PropertyID"]+"|"+row["ObjectID"]+"|"+str(row["PropertyValue"])
        # Claims about pairs that the ground truth does not cover are ignored.
        if row_claim in tc_indexes:
            claim_vector[tc_indexes[row_claim]] = 1
    claims_matrix.append(claim_vector)
    if source_num % 200 == 0:
        print("Processed source %i " % source_num)
print("Done...")
# Our claim matrix is in the form specified above.

Processed source 0 
Processed source 200 
Processed source 400 
Processed source 600 
Done...


In [18]:
# Stack the per-source vectors into a single 2-D integer array
# (one row per source, one column per claim).
claims_matrix = np.array(claims_matrix, dtype=int)

print("Sensing Matrix shape: ", np.shape(claims_matrix))


Sensing Matrix shape:  (643, 700)


In [19]:
# Put the claims on the rows (one sample per claim, one feature per source) and
# attach the truth label as an extra last column - the "Ground Truth Source" - so
# that samples and labels stay together when the data is split into train set
# and test set.

sensing_matrix_with_truth = claims_matrix.T

# Labels as a single column with one row per claim.
tc_labels_shaped = tc_labels.reshape((sensing_matrix_with_truth.shape[0], 1))
claims_sources_with_truth = np.concatenate((sensing_matrix_with_truth, tc_labels_shaped), axis=1)

In [20]:
# Constants used by the split, the model and the training cells.

# Seed handed to the train/test split and to the Dropout layer.
random_seed = 7

# # INSTANCE'S CONSTANTS
# One input feature per source.
sources_num = len(sources)

# # MODEL'S CONSTANTS
# Fractions of the samples used for training and for testing.
train_set_size = 0.9
test_set_size = 1 - train_set_size

# Optimizer / training settings.
learning_rate, batch_size, epochs = 0.025, 20, 150

In [21]:
# # SPLIT IN TRAIN SET AND TEST SET.
# Each row (claim features + label in the last column) goes to exactly one of the two sets.
train_set, test_set = train_test_split(
    claims_sources_with_truth,
    train_size=train_set_size,
    test_size=test_set_size,
    random_state=random_seed,
)


In [22]:
# # EXTRACT LABELS (Yt) and SAMPLES (X) For TRAIN-SET
# Every column but the last is a feature; the last column is the label.
train_set_x = train_set[:, :-1]
train_set_y = train_set[:, -1]


In [23]:
# # EXTRACT LABELS (Yt) and SAMPLES (X) For TEST-SET
# Every column but the last is a feature; the last column is the label.
test_set_x = test_set[:, :-1]
test_set_y = test_set[:, -1]


In [24]:
# DEFINE THE MODEL
#
# Each sample is one claim described by one 0/1 feature per source, hence
# input_dim = sources_num. The first Dense layer keeps that width; the two
# following Dense layers use half of it (noted here as the default according
# to the paper). The sizes of those two layers are hyper-parameters: the best
# values still have to be found.

fhl_units = sources_num        # first hidden layer
shl_units = sources_num // 2   # second hidden layer
thl_units = shl_units          # third hidden layer

model = Sequential()
model.add(Dense(units=fhl_units, input_dim=sources_num, activation='relu'))  # receives the input vector
model.add(Dropout(0.2, noise_shape=None, seed=random_seed))  # dropout on the first layer's output
model.add(Dense(units=shl_units, activation='relu'))
model.add(Dense(units=thl_units, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))  # output layer, 1 neuron


In [25]:
# # COMPILE THE MODEL

# OPTIMIZER => SGD with a learning rate defined by the user.
# LOSS => binary cross-entropy
#
# The learning rate is passed positionally on purpose: the keyword was called
# `lr` in older Keras releases and `learning_rate` in newer ones, but it is the
# first positional parameter of SGD in both, so this form works with either.
model.compile(optimizer=SGD(learning_rate), loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# # FIT THE MODEL
# Train on the train set only; `history` keeps the object returned by fit.
history = model.fit(
    x=train_set_x,
    y=train_set_y,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
)

Epoch 1/150
 - 0s - loss: 0.6887 - acc: 0.5968
Epoch 2/150
 - 0s - loss: 0.6824 - acc: 0.5587
Epoch 3/150
 - 0s - loss: 0.6760 - acc: 0.5714
Epoch 4/150
 - 0s - loss: 0.6702 - acc: 0.5619
Epoch 5/150
 - 0s - loss: 0.6625 - acc: 0.6048
Epoch 6/150
 - 0s - loss: 0.6547 - acc: 0.6492
Epoch 7/150
 - 0s - loss: 0.6441 - acc: 0.7270
Epoch 8/150
 - 0s - loss: 0.6335 - acc: 0.7349
Epoch 9/150
 - 0s - loss: 0.6189 - acc: 0.7397
Epoch 10/150
 - 0s - loss: 0.6047 - acc: 0.7397
Epoch 11/150
 - 0s - loss: 0.5896 - acc: 0.7397
Epoch 12/150
 - 0s - loss: 0.5753 - acc: 0.7397
Epoch 13/150
 - 0s - loss: 0.5622 - acc: 0.7413
Epoch 14/150
 - 0s - loss: 0.5524 - acc: 0.7397
Epoch 15/150
 - 0s - loss: 0.5439 - acc: 0.7413
Epoch 16/150
 - 0s - loss: 0.5377 - acc: 0.7413
Epoch 17/150
 - 0s - loss: 0.5304 - acc: 0.7444
Epoch 18/150
 - 0s - loss: 0.5221 - acc: 0.7460
Epoch 19/150
 - 0s - loss: 0.5161 - acc: 0.7492
Epoch 20/150
 - 0s - loss: 0.5068 - acc: 0.7571
Epoch 21/150
 - 0s - loss: 0.5032 - acc: 0.7651
E

In [None]:
# # TESTING THE MODEL
# Evaluate on the held-out test set and print the first two reported values
# next to their names from model.metrics_names.

evaluation = model.evaluate(test_set_x, test_set_y, batch_size=batch_size, verbose=2)

# Both values are multiplied by 100 and shown with a '%' sign.
print("\n%s: %.2f%% - %s: %.2f%%" % (
    model.metrics_names[0],
    evaluation[0]*100,
    model.metrics_names[1],
    evaluation[1]*100,
))