In [42]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, auc, log_loss
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.random_projection import GaussianRandomProjection
from sklearn.decomposition import TruncatedSVD, PCA
#import scipy.sparse
from scipy import sparse as sp

# For baseline 
import keras # if you don't have it and you are using conda: type "conda install keras" in terminal
import tensorflow as tf
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import LeakyReLU

# Data understanding

## Collect initial data 

**Task: Collect initial data**\
Acquire within the project the data (or access to the data) listed in the
project resources. This initial collection includes data loading if necessary
for data understanding. 

**Output: Initial data collection report**\
List 
* the **dataset** (or datasets) acquired 
* together with their **locations** within the project
* the **methods** used to acquire them
* and any **problems** encountered. 

In [19]:
file = 'training_sample.tsv'

In [20]:
column_names = ["text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains",\
                "tweet_type", "language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count",\
               "engaged_with_user_following_count", "engaged_with_user_is_verified", "engaged_with_user_account_creation",\
               "engaging_user_id", "engaging_user_follower_count", "engaging_user_following_count", "engaging_user_is_verified",\
               "engaging_user_account_creation", "engaged_follows_engaging", "reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp"]

In [21]:
df = pd.read_csv(file, header=None, names=column_names, delimiter='\x01')

In [22]:
pd.set_option('display.max_columns', None)
print(df.shape)
display(df.head())

(80425, 24)


Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engaged_follows_engaging,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t56898\t137\t174\t63247\t10526\t131\t3197\...,,3C21DCFB8E3FEC1CB3D2BFB413A78220,Video,,,Retweet,76B8A9C3013AE6414A3E6012413CDC3B,1581467323,D1AA2C85FA644D64346EDD88470525F2,737,706,False,1403069820,000046C8606F1C3F5A7296222C88084B,131,2105,False,1573978269,False,,,,
1,101\t102463\t10230\t10105\t21040\t10169\t12811...,,3D87CC3655C276F1771752081423B405,,BB422AA00380E45F312FD2CAA75F4960,92D397F8E0F1E77B36B8C612C2C51E23,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1580975391,4DC65AC7BD963DE1F7617C047C33DE99,52366425,2383,True,1230139136,00006047187D0D18598EF12A650E1DAC,22,50,False,1340673962,False,,,,
2,101\t56898\t137\t11255\t22037\t10263\t168\t111...,DB32BD91C2F1B37BE700F374A07FBC61,3701848B96AA740528A2B0E247777D7D,,2423BA02A75DB2189335DDC3FB6B74A1,6D323BE93766E79BE423FAC5C28BE39B,Retweet,22C448FF81263D4BAF2A176145EE9EAD,1581257232,5C671539CB41B9807E209349B101E9FF,988,167,False,1530094483,0000648BAA193AE4C625DDF789B57172,251,719,False,1456473671,False,,,,
3,101\t13073\t28757\t106\t100\t14120\t131\t120\t...,,18176C6AD2871729384062F073CCE94D,Video,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581164292,70B900BE17416923D1E236A38798F202,1228134,5413,False,1378699943,000071667F50BAFEA722A8E8284581E5,18,58,False,1378427564,False,,,,1581305000.0
4,101\t3460\t1923\t6632\t2824\t30368\t2179\t1881...,,AF11AF01F842E7F120667B7B0B38676D,,,,Quote,22C448FF81263D4BAF2A176145EE9EAD,1581233650,E94C0E9E8494F3D603F9D1A5C5242E3D,73,299,False,1549054499,00007745A6EE969F1A0F44B10DC17671,268,526,False,1252294800,False,,,,


## Describe data 

**Task: Describe data**\
Examine the “gross” or “surface” properties of the acquired data and
report on the results.

**Output: Data description report**\
Describe the data wich has been acquired, including: 
* the **format** of the data
* the **quantity** of data, for example number of records and fields in each table,
* the **identities** of the fields and any other surface features of the data which have been discovered. 

Does the data acquired satisfy the relevant requirements?

| Feature category    | Feature name                 | Feature dtype | Feature description                                                                           |
|---------------------|------------------------------|---------------|-----------------------------------------------------------------------------------------------|
| User features       | userId                       | string        | User identifier                                                                               |
| User features       | follower count               | int           | Number of followers of the user                                                               |
| User features       | following count              | int           | Number of accounts this user is following                                                     |
| User features       | is verified                  | bool          | Is the account verified?                                                                      |
| User features       | account creation             | timestamp     | in ms int Unix timestamp (in seconds) of the creation time of the account                     |
| Tweet features      | tweetId                      | string        | Tweet identifier                                                                              |
| Tweet features      | presentMedia                 | list[string]  | Tab-separated list of media types;  media type can be in (Photo, Video, Gif)                  |
| Tweet features      | presentLinks                 | list[string]  | Tab-separated list of links included in the tweet                                             |
| Tweet features      | presentDomains               | list[string]  | Tab-separated list of domains (e.g. twitter.com) included in the tweet                        |
| Tweet features      | tweetType                    | string        | Tweet type, can be either Retweet, Quote, Reply, or Toplevel                                  |
| Tweet features      | language                     | string        | Identifier corresponding to inferred language of the tweet                                    |
| Tweet features      | tweet timestamp              | int           | Unix timestamp, in seconds of the creation time of the Tweet                                  |
| Tweet features      | tweet tokens                 | list[int]     | Ordered list of Bert ids corresponding to Bert tokenization of Tweet text                     |
| Tweet features      | tweet hashtags               | list[string]  | Tab-separated list of hashtags present in the tweet                                           |
| Engagement features | reply engagement timestamp   | int           | Unix timestamp, in seconds, of the Reply engagement if one exists.                            |
| Engagement features | retweet engagement timestamp | int           | Unix timestamp, in seconds, of the Retweet engagement if one exists.                          |
| Engagement features | quote engagement timestamp   | int           | Unix timestamp, in seconds, of the Quote engagement if one exists.                            |
| Engagement features | like engagement timestamp    | int           | Unix timestamp, in seconds, of the Like engagement if one exists.                             |
| Engagement features | engageeFollowsEngager        | bool          | Does the account of the engaged tweet author follow the account that has made the engagement? |

## Explore data 


**Task: Explore data**\
This task tackles the data mining questions, which can be addressed
using querying, visualization and reporting. These include: 
* **distribution of key attributes**, for example the target attribute of a prediction task
* **relations** between pairs or small numbers of attributes
* results of **simple aggregations**
* properties of **significant sub-populations**
* **simple statistical analyses**. These analyses may address directly the data mining goals. they may also contribute to or refine the data description, quality reports, feed into the transformation and other data preparation needed for further analysis.


**Output: Data exploration report**\
Describe results of this task including 
* **first findings** or **initial hypothesis** and their **impact** on the remainder of the project. 
* If appropriate, include **graphs and plots**, which indicate data characteristics or lead to interesting **data subsets** for further examination.

## Verify data quality

**Task: Verify data quality**\
Examine the quality of the data, addressing questions such as: 
* is the data **complete** (does it cover all the cases required)? 
* Is it correct or does it contain **errors** and if there are errors **how common** are they?
* Are there **missing values** in the data? If so how are they represented, where do they occur and how common are they?


**Output: Data quality report**\
List the results of the data quality verification; if quality problems
exist, list possible solutions. Solutions to data quality problems
generally depend heavily on both data and business knowledge.

In [23]:
display(df)

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engaged_follows_engaging,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t56898\t137\t174\t63247\t10526\t131\t3197\...,,3C21DCFB8E3FEC1CB3D2BFB413A78220,Video,,,Retweet,76B8A9C3013AE6414A3E6012413CDC3B,1581467323,D1AA2C85FA644D64346EDD88470525F2,737,706,False,1403069820,000046C8606F1C3F5A7296222C88084B,131,2105,False,1573978269,False,,,,
1,101\t102463\t10230\t10105\t21040\t10169\t12811...,,3D87CC3655C276F1771752081423B405,,BB422AA00380E45F312FD2CAA75F4960,92D397F8E0F1E77B36B8C612C2C51E23,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1580975391,4DC65AC7BD963DE1F7617C047C33DE99,52366425,2383,True,1230139136,00006047187D0D18598EF12A650E1DAC,22,50,False,1340673962,False,,,,
2,101\t56898\t137\t11255\t22037\t10263\t168\t111...,DB32BD91C2F1B37BE700F374A07FBC61,3701848B96AA740528A2B0E247777D7D,,2423BA02A75DB2189335DDC3FB6B74A1,6D323BE93766E79BE423FAC5C28BE39B,Retweet,22C448FF81263D4BAF2A176145EE9EAD,1581257232,5C671539CB41B9807E209349B101E9FF,988,167,False,1530094483,0000648BAA193AE4C625DDF789B57172,251,719,False,1456473671,False,,,,
3,101\t13073\t28757\t106\t100\t14120\t131\t120\t...,,18176C6AD2871729384062F073CCE94D,Video,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581164292,70B900BE17416923D1E236A38798F202,1228134,5413,False,1378699943,000071667F50BAFEA722A8E8284581E5,18,58,False,1378427564,False,,,,1.581305e+09
4,101\t3460\t1923\t6632\t2824\t30368\t2179\t1881...,,AF11AF01F842E7F120667B7B0B38676D,,,,Quote,22C448FF81263D4BAF2A176145EE9EAD,1581233650,E94C0E9E8494F3D603F9D1A5C5242E3D,73,299,False,1549054499,00007745A6EE969F1A0F44B10DC17671,268,526,False,1252294800,False,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80420,101\t56898\t137\t14796\t13711\t17617\t10161\t1...,FC7321735734C2FC8A3CAE30D266CD71,533F80610C8C2F4345517986B5BB58E5,,,,Retweet,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581212842,0BCA6643D664442CA7901690F5843C1A,1432450,1869,True,1220061361,06209A39A94A7AF33B253C1EFA2D52E5,32,443,False,1393279525,False,,,,
80421,101\t56898\t137\t11885\t11273\t40154\t10206\t1...,0FE0A5F06FA20E3C2CDE7F65ACA0046C,DAC3216BB2DC4747BB2CCBA6D253A308,,,,Retweet,06D61DCBBE938971E1EA0C38BD9B5446,1581533061,F44A5E2FD8B6A2ACF0A1B97D57ED3C92,29193,1434,False,1251330211,0620C4B9A7E8153DFD1ECEE5FE257F9C,54,755,False,1275826974,False,,,,
80422,101\t22800\t10531\t10124\t28780\t104939\t10230...,,5F9EFD38F96180EAB6BAA74481C0E6FE,,,,Quote,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581119848,95CD94FE6760E0A5C8A183D821B8460A,96,242,False,1559761254,062154E2ED505B1DA7A9883921E42838,31,65,False,1543720066,True,,,,1.581122e+09
80423,101\t18249\t112\t187\t169\t16745\t26133\t117\t...,,8CB10325EAAD5E121E686EF222B8598C,Photo,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581052325,9DF1155503CCA735A24A0B61E0445EF7,3134,5000,False,1452619872,0621E5EBF6FB229F57303B1FB6CF7B3A,7,100,False,1395188402,False,,,,


**Completeness**\
The dataset provided by twitter is sparse but the sparseness is justified and NaN values are not considered as missing due to technical errors but due to the nature of social networks where hashtags, media and links are not always part of a message (tweet).

**Possible errors**\
Why do some users have the exactly same timestamp for two types of engagements (e.g. row_id 80424)?\

**Missing values** (only unjustified ones):\
so far none


# Data Preparation

## Select data

**Task: Select data**\
Decide on the data to be used for analysis. Criteria include relevance
to the data mining goals, quality and technical constraints such as
limits on data volume or data types. Note that data selection covers
selection of attributes (columns) as well as selection of records (rows)
in a table.

**Output: Rationale for inclusion/exclusion**\
List the data to be included/excluded and the reasons for these decisions.

## Clean data

**Task: Clean data**\
Raise the data quality to the level required by the selected analysis
techniques. This may involve selection of clean subsets of the data, the
insertion of suitable defaults or more ambitious techniques such as the
estimation of missing data by modeling.


**Output: Data cleaning report**\
Describe what decisions and actions were taken to address the data
quality problems reported during the verify data quality task of the
data understanding phase. Transformations of the data for cleaning
purposes and the possible impact on the analysis results should be
considered.

## Construct data

**Task: Construct data**\
This task includes constructive data preparation operations such as the
production of derived attributes, entire new records or transformed
values for existing attributes.


**Outputs: Derived attributes**\
Derived attributes are new attributes that are constructed from one or more
existing attributes in the same record. Examples: area = length * width.
Generated records
Describe the creation of completely new records. Example: create
records for customers who made no purchase during the past year.
There was no reason to have such records in the raw data, but for
modeling purposes it might make sense to explicitly represent the fact
that certain customers made zero purchases.

### Enhance dataset with derived columns

In [24]:
display(df.head())

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engaged_follows_engaging,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t56898\t137\t174\t63247\t10526\t131\t3197\...,,3C21DCFB8E3FEC1CB3D2BFB413A78220,Video,,,Retweet,76B8A9C3013AE6414A3E6012413CDC3B,1581467323,D1AA2C85FA644D64346EDD88470525F2,737,706,False,1403069820,000046C8606F1C3F5A7296222C88084B,131,2105,False,1573978269,False,,,,
1,101\t102463\t10230\t10105\t21040\t10169\t12811...,,3D87CC3655C276F1771752081423B405,,BB422AA00380E45F312FD2CAA75F4960,92D397F8E0F1E77B36B8C612C2C51E23,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1580975391,4DC65AC7BD963DE1F7617C047C33DE99,52366425,2383,True,1230139136,00006047187D0D18598EF12A650E1DAC,22,50,False,1340673962,False,,,,
2,101\t56898\t137\t11255\t22037\t10263\t168\t111...,DB32BD91C2F1B37BE700F374A07FBC61,3701848B96AA740528A2B0E247777D7D,,2423BA02A75DB2189335DDC3FB6B74A1,6D323BE93766E79BE423FAC5C28BE39B,Retweet,22C448FF81263D4BAF2A176145EE9EAD,1581257232,5C671539CB41B9807E209349B101E9FF,988,167,False,1530094483,0000648BAA193AE4C625DDF789B57172,251,719,False,1456473671,False,,,,
3,101\t13073\t28757\t106\t100\t14120\t131\t120\t...,,18176C6AD2871729384062F073CCE94D,Video,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581164292,70B900BE17416923D1E236A38798F202,1228134,5413,False,1378699943,000071667F50BAFEA722A8E8284581E5,18,58,False,1378427564,False,,,,1581305000.0
4,101\t3460\t1923\t6632\t2824\t30368\t2179\t1881...,,AF11AF01F842E7F120667B7B0B38676D,,,,Quote,22C448FF81263D4BAF2A176145EE9EAD,1581233650,E94C0E9E8494F3D603F9D1A5C5242E3D,73,299,False,1549054499,00007745A6EE969F1A0F44B10DC17671,268,526,False,1252294800,False,,,,


### T4. Extract the Social Network
* Twitter Social Network is directional (follower – following) 
* parse the `engaged_follows_engaging` field: each example gives you an edge
* Create the adjacency matrix representation of the social graph
* If an edge exists between two users, 0 otherwise
* how can you use this information?

In [40]:
# FROM TUWEL
unique_user_ids = df['engaging_user_id'].append(df['engaged_with_user_id']).unique()

m = len(unique_user_ids)

userId_to_userIDX = dict(zip(unique_user_ids, range(m)))
userIDX_to_userId = dict(zip(range(m), unique_user_ids))

e_df = df[df['engaged_follows_engaging']][['engaging_user_id', 'engaged_with_user_id', 'engaged_follows_engaging']]

e_df['engaging_user_idx'] = e_df['engaging_user_id'].map(userId_to_userIDX)
e_df['engaged_with_user_idx'] = e_df['engaged_with_user_id'].map(userId_to_userIDX)

e_df.sort_values(by=['engaging_user_idx'], inplace = True)
e_df.drop_duplicates(inplace = True)
e_df.reset_index(drop = True, inplace = True)

display(e_df.head())



Unnamed: 0,engaging_user_id,engaged_with_user_id,engaged_follows_engaging,engaging_user_idx,engaged_with_user_idx
0,0,33,True,463,896
1,0,94,True,463,377
2,0,784,True,463,860
3,0,606,True,463,434
4,0,642,True,463,14


In [43]:
SN = sp.csr_matrix((e_df.engaged_follows_engaging, (e_df.engaging_user_idx, e_df.engaged_with_user_idx)), shape=(m, m))
SN.shape

TypeError: no supported conversion for types: (dtype('O'),)

## Format data

**Task: Format data**\
Formatting transformations refer to primarily syntactic modifications
made to the data that do not change its meaning, but might be required
by the modeling tool.


**Output: Reformatted data**\
Some tools have requirements on the order of the attributes, such as
the first field being a unique identifier for each record or the last field
being the outcome field the model is to predict.
It might be important to change the order of the records in the dataset.
Perhaps the modeling tool requires that the records be sorted according
to the value of the outcome attribute. A common situation is that the
records of the dataset are initially ordered in some way but the modeling
algorithm needs them to be in a fairly random order. For example, when
using neural networks it is generally best for the records to be presented
in a random order although some tools handle this automatically without explicit user intervention

In [26]:
# Parse attributes containing tab-separated lists into lists.
df['text_tokens'] = df['text_tokens'].str.split('\t')

def to_hex_list(x):
    output = str(x).split('\t')
#     output = [int(val, 16) for val in str(x).split('\t')] 
    return output

cols_to_process = ['hashtags', 'present_media', 'present_links', 'present_domains']

for col in cols_to_process:  
    df[col] = df[col].apply(lambda x: to_hex_list(x) if isinstance(x, str)  else x)


# Transform raw timestamps into human-readable timestamps.
cols_to_process = ['tweet_timestamp', 'engaging_user_account_creation', 'reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

for col in cols_to_process:  
    df[col] = df[col].apply(lambda x: pd.Timestamp(x, unit='s'))

# Modelling

## T1. Split into train, dev, test
* Sub-sample to create test, non-test datasets
* Optionally split non-test into train and dev
* e.g., to implement k-fold validation

In [27]:
# Train/Test split

from sklearn.model_selection import train_test_split
X_nontest, X_test = train_test_split(df, test_size = 0.1, random_state = 42)
X_train, X_dev = train_test_split(X_nontest, test_size = 0.1, random_state = 42)
print(X_train.shape, X_dev.shape, X_test.shape)

(65143, 24) (7239, 24) (8043, 24)


## T5. Implement a Baseline

Implement the neural network approach described in the challenge paper: https://arxiv.org/abs/2004.13715.
This Baseline will be used in the evaluation phase to compare our Naive Bayes and Multivariate Regression approaches to it.

In [28]:
numeric_cols = ['engaged_with_user_follower_count', 
                'engaged_with_user_following_count', 
                'engaged_with_user_account_creation',
                'engaging_user_follower_count', 
                'engaging_user_following_count',
                'engaging_user_account_creation',
                'tweet_timestamp',
               ]

categorical_cols = ['tweet_type', 'language', 
                    'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engaged_follows_engaging']

id_cols = ['tweet_id', 'engaged_with_user_id', 'engaging_user_id']

response_cols = ['reply_timestamp', 
                 'retweet_timestamp',
                 'retweet_with_comment_timestamp', 
                 'like_timestamp'
                ]

tweet_feature_cols = ['text_tokens', 'hashtags', 'present_media', 'present_links', 'present_domains']

In [29]:
df.columns

Index(['text_tokens', 'hashtags', 'tweet_id', 'present_media', 'present_links',
       'present_domains', 'tweet_type', 'language', 'tweet_timestamp',
       'engaged_with_user_id', 'engaged_with_user_follower_count',
       'engaged_with_user_following_count', 'engaged_with_user_is_verified',
       'engaged_with_user_account_creation', 'engaging_user_id',
       'engaging_user_follower_count', 'engaging_user_following_count',
       'engaging_user_is_verified', 'engaging_user_account_creation',
       'engaged_follows_engaging', 'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp'],
      dtype='object')

In [30]:
def numeric_cat(x, nq=49):
    '''
    Encodes a numeric column to categoric.

    Parameters:
        x (pd.Series): The pandas Series which shall be tranformed.
        nq (int): The number of buckets onto which the column should be distrubed.

    Returns:
        A pd.Series containing a categorical encoding of the given numeric column.
    '''
    # pd.cut and pd.qcut put NaN into bucket -1
    #might have to use cut for equal distance bins instead of equal size bins
    cat_type = pd.api.types.CategoricalDtype(categories=[*range(nq+1)], ordered=False)
    return (pd.qcut(x, nq).cat.codes + 1).astype(cat_type)

def categorical_cat(x):
    return x.astype('category')

def id_cat(x, buckets = 1000):
    return x.apply(lambda y : abs(hash(y)) % buckets).astype('category')

def encode_tweetFeature(x, replacement = ['Unknown']):
    return pd.Series([replacement if y is np.nan else y for y in x])

def encode_response(x):
    return x.notnull().astype('int8')

In [31]:
df[numeric_cols] = df[numeric_cols].apply(numeric_cat)
df[categorical_cols] = df[categorical_cols].apply(categorical_cat)
df[id_cols] = df[id_cols].apply(id_cat)
df[tweet_feature_cols] = df[tweet_feature_cols].apply(encode_tweetFeature)
df[response_cols] = df[response_cols].apply(encode_response)

df.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engaged_follows_engaging,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,"[101, 56898, 137, 174, 63247, 10526, 131, 3197...",[Unknown],722,[Video],[Unknown],[Unknown],Retweet,76B8A9C3013AE6414A3E6012413CDC3B,43,196,9,28,False,26,897,22,46,False,48,False,0,0,0,0
1,"[101, 102463, 10230, 10105, 21040, 10169, 1281...",[Unknown],964,[Unknown],[BB422AA00380E45F312FD2CAA75F4960],[92D397F8E0F1E77B36B8C612C2C51E23],TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,3,35,49,40,True,3,373,7,3,False,9,False,0,0,0,0
2,"[101, 56898, 137, 11255, 22037, 10263, 168, 11...",[DB32BD91C2F1B37BE700F374A07FBC61],656,[Unknown],[2423BA02A75DB2189335DDC3FB6B74A1],[6D323BE93766E79BE423FAC5C28BE39B],Retweet,22C448FF81263D4BAF2A176145EE9EAD,26,282,11,11,False,41,620,29,37,False,22,False,0,0,0,0
3,"[101, 13073, 28757, 106, 100, 14120, 131, 120,...",[Unknown],226,[Video],[Unknown],[Unknown],TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,18,720,44,45,False,24,516,6,4,False,13,False,0,0,0,1
4,"[101, 3460, 1923, 6632, 2824, 30368, 2179, 188...",[Unknown],428,[Unknown],[Unknown],[Unknown],Quote,22C448FF81263D4BAF2A176145EE9EAD,24,660,1,16,False,44,300,29,33,False,3,False,0,0,0,0


### Onehot encode tweet feature columns

In [32]:
mlb_text_tokens = MultiLabelBinarizer(sparse_output=True)
mlb_hashtags = MultiLabelBinarizer(sparse_output=True)
mlb_present_media = MultiLabelBinarizer(sparse_output=True)
mlb_present_links = MultiLabelBinarizer(sparse_output=True)
mlb_present_domains = MultiLabelBinarizer(sparse_output=True)

X_text_tokens = mlb_text_tokens.fit_transform(df.text_tokens)
X_hashtags = mlb_hashtags.fit_transform(df.hashtags)
X_present_media = mlb_present_links.fit_transform(df.present_media)
X_present_links = mlb_present_links.fit_transform(df.present_links)
X_present_domains = mlb_present_domains.fit_transform(df.present_domains)

X_tweet_features = scipy.sparse.hstack([X_text_tokens, X_hashtags, X_present_media,
                             X_present_links, X_present_domains])

X_tweet_features

<80425x92969 sparse matrix of type '<class 'numpy.longlong'>'
	with 3493421 stored elements in COOrdinate format>

### Onehot encode numeric, categorical and id colums

In [33]:
numeric_oh = OneHotEncoder()
categorical_oh = OneHotEncoder()
id_oh = OneHotEncoder()

X_numeric = numeric_oh.fit_transform(df[numeric_cols])
X_categorical = categorical_oh.fit_transform(df[categorical_cols])
X_id = id_oh.fit_transform(df[id_cols])

rng = np.random.RandomState(42)

# use truncated SVD for dim. reduction
numeric_trans = TruncatedSVD(n_components=16, random_state=rng)
categorical_trans = TruncatedSVD(n_components=16, random_state=rng)
id_trans = TruncatedSVD(n_components=16, random_state=rng)
tweet_features_trans = TruncatedSVD(n_components=16, random_state=rng)

X_numeric_trans = numeric_trans.fit_transform(X_numeric)
X_categorical_trans = categorical_trans.fit_transform(X_categorical)
X_id_trans = id_trans.fit_transform(X_id)
X_tweet_features_trans = tweet_features_trans.fit_transform(X_tweet_features)

X_trans = np.hstack([X_numeric_trans, X_categorical_trans, X_id_trans, X_tweet_features_trans])

y = df['like_timestamp']

X_trans.shape, y.shape

((80425, 64), (80425,))

### Build model

In [34]:
# define loss functions for tf
def pr_auc(y_true, y_pred):
    return tf.py_function(compute_prauc, (y_pred, y_true), tf.float64)

#huber = tf.keras.losses.Huber()
#huber = tf.losses.Huber()
# Building the model

def generate_model():
    model = Sequential()

    model.add(Dense(128, input_shape=(64,)))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.2))

    model.add(Dense(64))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.2))

    model.add(Dense(32))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.2))

    model.add(Dense(1))
    model.add(Activation('relu'))

    opt = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss = "huber_loss", optimizer=opt, metrics=[pr_auc])
    
    #model.summary()
    
    return model

In [35]:
model = generate_model()
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)               

## Naive Bayes 

## Multivariate Regression

# Evaluation

## Eevaluate as a binary classification task in two ways
* Area Under Precision-Recall Curve
  * generate precision-recall pairs for various probability thresholds
    * assumes anything above threshold is predicted as relevant
* Cross-Entropy Loss = Log-Loss (for binary classification)
  * measure how good the predicted probabilities are

### T2.1 Parse test to create the ground truth output file
engaging user id; tweet id; label


In [36]:
# from TUWEL: not tested yet
def has_timestamp(x):
  return int(not pd.isnull(x))

def create_labels(col):
  copy_test = X_test.copy()
  copy_test['label'] = copy_test.apply(lambda row: has_timestamp(row[col]), axis = 1)
  return copy_test[['engaging_user_id', 'tweet_id', 'label']]

def write_groundtruth(engagement):
  gt = create_labels(engagement + '_timestamp')
  gt.to_csv('gt_' + engagement + '.csv', index = False)
    
write_groundtruth('reply')
write_groundtruth('retweet')
write_groundtruth('retweet_with_comment')
write_groundtruth('like')

### T2.2: Implement the `read_predictions` function 
from https://recsys-twitter.com/code/snippets

In [44]:
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
  prec, recall, thresh = precision_recall_curve(gt, pred)
  prauc = auc(recall, prec)
  return prauc

def calculate_ctr(gt):
  positive = len([x for x in gt if x == 1])
  ctr = positive/float(len(gt))
  return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

# from TUWEL: needs to be checked
def read_predictions(file):
  pred_data = pd.read_csv(file, names = ['user_id', 'tweet_id', 'label'], header = 0)
  sorted_data = pred_data.sort_values(['user_id', 'tweet_id'])
  return sorted_data['label'].to_numpy()

  
ground_truth = read_predictions("gt_like.csv") # will return data in the form (tweet_id, user_id, labed (1 or 0))
#ground_truth = read_predictions("gt_like.csv")
#ground_truth = read_predictions("gt_like.csv")
#ground_truth = read_predictions("gt_like.csv")

#predictions = read_predictions("predictions.csv") # will return data in the form (tweet_id, user_id, prediction)

### Evaluate base line

In [38]:
from sklearn.model_selection import StratifiedKFold

num_folds = 10
epochs = 200
batch_size = 8000

prauc_per_fold = []
loss_per_fold = []

kfold = StratifiedKFold(n_splits=num_folds, shuffle=True)

fold_no = 1
for train, test in kfold.split(X_trans, y):
    model = generate_model()
    
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')
    
    model.fit(X_trans[train], y[train], batch_size=batch_size,
              epochs=epochs, verbose=0)
    
    scores = model.evaluate(X_trans[test], y[test], verbose=0)
    print(f'Score for fold {fold_no}: loss of {scores[0]}; prauc of {scores[1]}')
    
    loss_per_fold.append(scores[0])
    prauc_per_fold.append(scores[1])
    
    fold_no += 1
    
print('------------------------------------------------------------------------')
print('Score per fold:')
print('Fold |  Loss  |  PRAUC')
for i in range(num_folds):
    print('%4d | %.4f | %.4f' % (i+1, loss_per_fold[i], prauc_per_fold[i]))
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print('        mean  +- std')
print('PRAUC: %.4f +- %.4f' % (np.mean(prauc_per_fold), np.std(prauc_per_fold)))
print('Loss:  %.4f +- %.4f' % (np.mean(loss_per_fold), np.std(loss_per_fold)))
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Training for fold 1 ...
Score for fold 1: loss of 0.10375839534715739; prauc of 0.7555845379829407
------------------------------------------------------------------------
Training for fold 2 ...
Score for fold 2: loss of 0.10264259849790909; prauc of 0.7656937837600708
------------------------------------------------------------------------
Training for fold 3 ...
Score for fold 3: loss of 0.10296858814441341; prauc of 0.7625969052314758
------------------------------------------------------------------------
Training for fold 4 ...
Score for fold 4: loss of 0.10250346904270138; prauc of 0.7727589011192322
------------------------------------------------------------------------
Training for fold 5 ...
Score for fold 5: loss of 0.10422143597819535; prauc of 0.7592910528182983
------------------------------------------------------------------------
Training for fold 6 ...
Score for fold 6: loss of 0.103078873021571

### Evaluate Naive Bayes model

### Evaluate Regression model

### Compare naive bayes and regression models to baseline