In [104]:
import dataset as ds
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Select the cutoff (in minutes since the root tweet) used when fetching tree data

In [105]:
# Passed to ds.DatasetBuilder as `time_cutoff`; the loader log reports it in minutes
# after the root tweet.
CUTOFF = 10000

In [106]:
exp_name = "twitter15"

In [107]:
# Build the raw, non-standardized dataset for `exp_name`, limited by CUTOFF.
# The returned object is indexed by split name ('train' / 'val' / 'test') further down.
data_builder = ds.DatasetBuilder(exp_name, time_cutoff=CUTOFF)
dataset = data_builder.create_dataset(dataset_type="raw", standardize_features=False)

Considering 4 classes problem
We consider tweets emitted no later than 10000mins after the root tweet
Features that will be considered: user_only
Len train/val/test 1005 149 336
Oversampling...
Before oversampling: 1490 trees, 1005 train trees
After oversampling: 1490 trees, 1005 train trees
Dataset loaded in 41.678s


In [None]:
# Pull the three splits out of the loaded dataset.
raw_train, raw_validate, raw_test = (
    dataset[split_name] for split_name in ("train", "val", "test")
)

In [None]:
len(raw_train)

In [None]:
raw_train[0][0]

In [None]:
# Names for the positions of one raw edge record (get_df_from_raw reads position i
# as edge_feature_names[i]): seven identifier/timing fields first, then the
# user-profile features in alphabetical order.
edge_feature_names = [
    "label",
    "root_id",
    "in_tweet_idx",
    "out_tweet_idx",
    "latency",
    "in_uid",
    "out_uid",
] + sorted([
    "created_at",
    "favourites_count",
    "followers_count",
    "friends_count",
    "geo_enabled",
    "has_description",
    "len_name",
    "len_screen_name",
    "listed_count",
    "statuses_count",
    "verified",
])
n_cols = len(edge_feature_names)
n_cols

In [None]:
edge_feature_names

### Set the prediction latency of the model: a latency of X minutes means the model classifies using only the features available X minutes after the root tweet is emitted

In [None]:
# Threshold read by cut_by_latency: rows whose `latency` exceeds it are dropped.
PREDICTION_LATENCY = 120 # in minutes after the first tweet is emitted

In [None]:
def get_df_from_raw(raw_data, feature_names=None):
    """Flatten a split of raw trees into one DataFrame with one row per record.

    Parameters
    ----------
    raw_data : iterable of sequences
        One sequence per tree; each element of a sequence is an indexable record
        whose position ``i`` holds the value of ``feature_names[i]``. Positions
        beyond ``len(feature_names)`` are ignored.
    feature_names : sequence of str, optional
        Column names, in record order. Defaults to the notebook-level
        ``edge_feature_names`` (looked up at call time), which is what the
        original implementation always used.

    Returns
    -------
    pandas.DataFrame
        Columns follow ``feature_names`` order; rows follow the iteration order
        of ``raw_data`` (tree by tree, record by record).

    Raises
    ------
    IndexError
        If a record is shorter than ``feature_names``.
    """
    if feature_names is None:
        feature_names = edge_feature_names
    feature_names = list(feature_names)

    # Accumulate column-wise so pandas infers one dtype per column.
    columns = {name: [] for name in feature_names}
    for features_sequence in raw_data:
        for dp in features_sequence:
            for position, name in enumerate(feature_names):
                columns[name].append(dp[position])
    return pd.DataFrame(data=columns)

In [None]:
# One flat record-level DataFrame per split.
df_train, df_validate, df_test = (
    get_df_from_raw(raw_split) for raw_split in (raw_train, raw_validate, raw_test)
)

In [None]:
df_train.head()

In [None]:
# Distinct training labels in order of first appearance, and the label -> integer
# code mapping derived from that order.
class_labels = list(df_train.label.unique())
lookup_dict = dict(zip(class_labels, range(len(class_labels))))

In [None]:
class_labels

In [None]:
list(lookup_dict.items())

In [None]:
len(df_train.in_tweet_idx.unique())

In [None]:
df_train.label

In [None]:
# df_test = df_train.groupby(['out_uid']).nunique().loc[:, ['label', 'favourites_count']]
# df_test.favourites_count.unique()

### Applying log transforms where needed

In [None]:
# Apply log(1 + x) to the count features and integer-encode the labels, for every split.
# NOTE: this cell is not idempotent. Re-running it log-transforms the columns a second
# time, and the label lookup is then applied to already-encoded values. Re-run from the
# cell that builds the DataFrames instead.
to_log = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count']
for split_df in (df_train, df_validate, df_test):
    for colname in to_log:
        # Assign the whole column so it is replaced outright by the float result,
        # rather than written into the existing (possibly integer-typed) column in
        # place as `.loc[:, colname] = ...` attempts to do.
        split_df[colname] = np.log1p(split_df[colname].values)
    # Raises KeyError for a label that never appears in the training split.
    split_df['label'] = split_df['label'].apply(lambda x: lookup_dict[x])

In [None]:
df_train.dtypes

In [None]:
def cut_by_latency(df, max_latency=None):
    """Keep only the rows observed no later than a latency threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``latency`` column.
    max_latency : number, optional
        Inclusive upper bound on ``latency``. Defaults to the notebook-level
        ``PREDICTION_LATENCY`` (looked up at call time), which is what the
        original implementation always used.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``latency <= max_latency``; ``df`` itself is
        not modified.
    """
    if max_latency is None:
        max_latency = PREDICTION_LATENCY
    return df.loc[df["latency"] <= max_latency]

In [None]:
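# NOTE: the latency cut is currently disabled, so PREDICTION_LATENCY has no effect on
# the data used in the cells shown below. Uncomment these lines to restrict each split
# to the rows observed within that latency.
# df_train = cut_by_latency(df_train)
# df_validate = cut_by_latency(df_validate)
# df_test = cut_by_latency(df_test)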

In [None]:
len(df_train.root_id.unique())

First analysis: a simple aggregation of the features per tree (grouped by `root_id`)
- in_tweet_idx -> nunique
- out_tweet_idx -> nunique
- latency -> mean
- in_uid -> nunique
- out_uid -> nunique
- created_at -> mean
- followers_count -> mean
- favourites_count -> mean
- friends_count -> mean
- geo_enabled -> mean
- has_description -> mean
- statuses_count -> mean
- verified -> mean

In [None]:
df_train.columns

In [None]:
# Every column of df_train, in its current order, except the three features
# that are left out of this analysis.
cols_kept = list(filter(
    lambda colname: colname not in {'len_name', 'len_screen_name', 'listed_count'},
    df_train.columns,
))
cols_kept

In [None]:
count_cols = ['out_tweet_idx', 'out_uid', 'in_tweet_idx', 'in_uid']

In [None]:
# Columns handed to aggregation_v0 as `mean_cols`: everything in cols_kept that is not
# a count column. As long as df_train still has the columns built from
# edge_feature_names, this list also contains 'label' and 'root_id'.
mean_cols = [colname for colname in cols_kept if colname not in count_cols]

In [None]:
sum_cols = []
sum_cols

In [None]:
print(len(cols_kept), len(mean_cols), len(count_cols), len(sum_cols))

In [None]:
# Restrict every split to the retained columns.
df_train, df_validate, df_test = (
    split_df[cols_kept] for split_df in (df_train, df_validate, df_test)
)

In [None]:
df_train.head()

In [None]:
def aggregation_v0(df, sum_names=None, mean_names=None, count_names=None,
                   constant_names=('label', 'root_id')):
    """Collapse record-level rows into one row per tree (grouped by ``root_id``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``root_id`` plus every column named in the lists below.
    sum_names, mean_names, count_names : sequence of str, optional
        Columns aggregated with ``sum``, ``mean`` and ``nunique`` respectively.
        Each defaults to the notebook-level ``sum_cols`` / ``mean_cols`` /
        ``count_cols`` (looked up at call time), as in the original
        implementation.
    constant_names : collection of str, optional
        Columns of ``mean_names`` that are carried through with ``first``
        instead of ``mean``. Averaging ``label`` or ``root_id`` turns them into
        floats (and can round large integer ids); taking the first value keeps
        them exact. This assumes they are constant within a tree -- TODO confirm
        for oversampled data.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``root_id`` (sorted by it), with a fresh integer
        index. Column order is sum columns, then mean columns, then count
        columns. ``root_id`` survives as a regular column only if it is listed
        in one of the column lists; the group index itself is discarded.
    """
    sum_names = sum_cols if sum_names is None else sum_names
    mean_names = mean_cols if mean_names is None else mean_names
    count_names = count_cols if count_names is None else count_names

    aggregations_dict = {name: 'sum' for name in sum_names}
    aggregations_dict.update(
        {name: ('first' if name in constant_names else 'mean') for name in mean_names}
    )
    aggregations_dict.update({name: 'nunique' for name in count_names})

    aggregated_data = df.groupby('root_id').agg(aggregations_dict)
    # Drop the group index rather than re-inserting it: when 'root_id' is aggregated
    # it is already present as a column.
    return aggregated_data.reset_index(drop=True)

### We aggregate the features as planned, fit a gradient-boosted decision tree model (LightGBM), and evaluate its accuracy on the train/val/test splits

In [None]:
# Replace each record-level frame with its aggregated one-row-per-tree version.
# NOTE: the names are reused, so re-running this cell aggregates the already
# aggregated frames again. Re-run from the cell that builds the DataFrames instead.
df_train = aggregation_v0(df_train)
df_validate = aggregation_v0(df_validate)
df_test = aggregation_v0(df_test)

In [None]:
len(df_train)

In [None]:
df_train.head()

In [None]:
def binned_plot(data, feature_col, target_col, nquantiles=5):
    """Plot the per-bin mean of a target against the per-bin mean of a feature.

    The feature range is cut at ``nquantiles`` evenly spaced interior quantiles,
    which gives ``nquantiles + 1`` bins between its minimum and maximum. For each
    non-empty bin, the mean feature value (x) and mean target value (y) are
    drawn as one point of a black line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    feature_col, target_col : str
        Column names of the binned feature and of the averaged target.
    nquantiles : int, optional
        Number of interior quantile cut points.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    feature_vals = data.loc[:, feature_col].values
    target_vals = data.loc[:, target_col].values
    quantiles_to_compute = np.linspace(0, 1, num=nquantiles+2)
    bin_edges = np.quantile(feature_vals, q=quantiles_to_compute)
    nx, _ = np.histogram(feature_vals, bins=bin_edges)
    sum_x, _ = np.histogram(feature_vals, bins=bin_edges, weights=feature_vals)
    sum_y, _ = np.histogram(feature_vals, bins=bin_edges, weights=target_vals)
    # Repeated feature values can make neighbouring quantile edges coincide, which
    # leaves bins empty; skip those instead of dividing by zero.
    non_empty = nx > 0
    plt.plot(sum_x[non_empty] / nx[non_empty], sum_y[non_empty] / nx[non_empty], color='black')
    plt.xlabel('Feature {}'.format(feature_col))
    plt.ylabel('Target mean')
    plt.ylim(target_vals.min(), target_vals.max())
    plt.grid()
    plt.show()
    plt.close()

In [None]:
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Training set for LightGBM: every aggregated column except the target and the tree
# identifier. Columns are selected by name so the cell does not depend on 'label'
# being the first column.
# NOTE: this rebinds `dataset`, which earlier held the raw splits.
dataset = lgb.Dataset(df_train.drop(columns=['label', 'root_id']), label=df_train['label'])
dataset.construct()
params = {
    'objective':'multiclass',
    # Labels are encoded as 0..len(class_labels)-1 from the training split.
    'num_class':len(class_labels),
    # NOTE(review): per the LightGBM docs, bagging only takes effect when
    # bagging_fraction is also set below 1.0, so this is presumably inert -- confirm.
    'bagging_freq':5,
    'feature_fraction':1.0,
    'boosting_type':'gbdt',
    'max_depth':5,
    'learning_rate':0.005, # range tested is 0.001, 0.01, 0.005
    'n_estimators':2000,# range tested is 1000, 2000
    'verbosity':2
    }

In [None]:
gbm_model = lgb.train(params, train_set=dataset)

In [None]:
def return_accuracy(data):
    """Return the accuracy of the fitted ``gbm_model`` on one aggregated split.

    Parameters
    ----------
    data : pandas.DataFrame
        Aggregated frame with a ``label`` column, a ``root_id`` column and the
        feature columns the model was trained on.

    Returns
    -------
    float
        Fraction of rows whose highest-scoring class equals ``label``.
    """
    # Select by name rather than by position, so the function keeps working if the
    # column order of the aggregated frame changes.
    features = data.drop(columns=['label', 'root_id'])
    # One score per class and row; the prediction is the best-scoring class.
    preds = gbm_model.predict(features, raw_score=False).argmax(axis=1)
    return accuracy_score(data['label'].values, preds)

In [None]:
# Results row: model tag, experiment, learning rate, number of boosting rounds, then
# train / val / test accuracy. The hyper-parameters are read from `params` so the row
# cannot drift from the configuration that was actually fitted.
# NOTE(review): the tag is spelled "LighGBM"; kept as-is in case rows are collated by
# it -- confirm, then fix the spelling.
print(f"LighGBM, {exp_name}, {params['learning_rate']}, {params['n_estimators']}, {return_accuracy(df_train):.4f}, {return_accuracy(df_validate):.4f}, {return_accuracy(df_test):.4f}")

In [None]:
lgb.plot_importance(gbm_model)

### Text features from MLP

In [None]:
import torch
import torch.nn as nn

In [None]:
# Reload the same experiment, this time restricted to the text features.
# NOTE: this rebinds both `data_builder` and `dataset`.
data_builder = ds.DatasetBuilder(exp_name, time_cutoff=CUTOFF, features_to_consider='text_only')
dataset = data_builder.create_dataset(dataset_type="raw", standardize_features=False)

In [None]:
def to_dataset(raw_dt):
    """Build a 2-D array with one row per tree, taken from each tree's first record.

    Row layout: ``[record[1], lookup_dict[record[0]], *record[7:]]``, i.e. the
    second field, the integer code of the first field (the label), and every
    field from position 7 onwards.
    """
    rows = []
    for tree in raw_dt:
        first_record = tree[0]
        leading = [first_record[1], lookup_dict[first_record[0]]]
        rows.append(np.array(leading + first_record[7:]))
    return np.stack(rows)

In [None]:
# One array per split for the MLP.
train_mlp, val_mlp, test_mlp = (
    to_dataset(dataset[split_name]) for split_name in ("train", "val", "test")
)

In [None]:
np.unique(train_mlp[:, 1])

In [None]:
# Hyper-parameters of the text-feature MLP and of its training run.
# Number of hidden_size -> hidden_size Linear layers between input and output layers.
n_hidden_layers = 3
# Width of every hidden layer.
hidden_size = 24
# Batch size used by the train / val / test DataLoaders.
batch_size = 64
# Learning rate given to the Adam optimizer.
lr = 0.0005
# dropout_prob =0.5
# Number of passes over the training loader.
n_epochs = 100

In [None]:
device='cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Feed-forward classifier: input projection, `n_hidden_layers` hidden layers, output layer.
# A ReLU follows every layer except the last one: without a non-linearity in between,
# the stacked Linear layers compose into a single linear map of the input.
# 4 is the number of classes; 768 is presumably the width of the text feature vector.
# Both are hard-coded -- confirm them when switching datasets.
model_seq = []
model_seq.append(nn.Linear(in_features=768, out_features=hidden_size))
model_seq.append(nn.ReLU())
for _ in range(n_hidden_layers):
#     model_seq.append(nn.Dropout(dropout_prob))
    model_seq.append(nn.Linear(in_features=hidden_size, out_features=hidden_size))
    model_seq.append(nn.ReLU())
model_seq.append(nn.Linear(in_features=hidden_size, out_features=4))
mlp = nn.Sequential(*model_seq)
mlp = mlp.to(device)

In [None]:
optim = torch.optim.Adam(mlp.parameters(), lr=lr) #[param for name, param in mlp.named_parameters()]

In [None]:
# Column 0 of the *_mlp arrays is dropped, so each batch row is [label, features...].
# Only the training loader shuffles; accuracy on the evaluation loaders does not depend
# on batch order, so they are iterated in a fixed order.
train_loader = torch.utils.data.DataLoader(train_mlp[:, 1:], batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_mlp[:, 1:], batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_mlp[:, 1:], batch_size=batch_size, shuffle=False)

In [None]:
def eval_loader(model, loader):
    """Return the classification accuracy of ``model`` over every batch of ``loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Maps a float feature batch to per-class logits.
    loader : iterable of torch.Tensor
        Each batch is a 2-D tensor whose column 0 holds the integer class label
        and whose remaining columns hold the input features.

    Returns
    -------
    float
        Correct predictions divided by the number of rows seen.

    Raises
    ------
    ZeroDivisionError
        If ``loader`` yields no rows.

    Notes
    -----
    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` again.
    """
    model.eval()
    # Evaluate on the device the model itself lives on (CPU for a parameter-free
    # model) instead of relying on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    correct = 0
    total = 0
    with torch.no_grad():
        for dp in loader:
            y = dp[:, 0].to(model_device).long()
            x = dp[:, 1:].to(model_device).float()
            # Use the `model` argument; the previous version called the global `mlp`
            # here, so the argument was ignored for the forward pass.
            logits = model(x)
            preds = logits.argmax(dim=1)
            correct += int((preds == y).sum().item())
            total += y.size(0)
    return correct / total

In [None]:
loss_func = nn.CrossEntropyLoss(reduction='mean')
for epoch in range(n_epochs):
    # --- one optimisation pass over the training loader ---
    mlp.train()
    epoch_loss = 0.
    for dp in train_loader:
        # Column 0 is the class index; the remaining columns are the input features.
        y, x = dp[:, 0].to(device).long(), dp[:, 1:].to(device).float()

        optim.zero_grad()
        logits = mlp(x)
        loss = loss_func(logits, y)
        loss.backward()
        optim.step()

        epoch_loss += loss.item()

    # --- every 100th epoch: report the mean batch loss, then validation accuracy ---
    if (epoch+1) % 100 != 0:
        continue
    print(f"Epoch {epoch+1}: Mean Loss = {epoch_loss/len(train_loader):.3f}")
    print(f"Accuracy Epoch {epoch+1} on Val: {eval_loader(mlp, val_loader):.3f}")

In [None]:
# Results row: model tag, experiment, epochs, batch size, learning rate, hidden size,
# number of hidden layers, then train / val / test accuracy. The hyper-parameters are
# read from the configuration variables so the row cannot drift from the run that
# produced it. The learning rate is now printed in plain notation (0.0005, not 5e-4).
print(f"MLPtext, {exp_name}, {n_epochs}, {batch_size}, {lr:g}, {hidden_size}, {n_hidden_layers}, {eval_loader(mlp, train_loader):.4f}, {eval_loader(mlp, val_loader):.4f}, {eval_loader(mlp, test_loader):.4f}")

### SEIZ features