In [1]:
import pandas as pd
import numpy as np
import torch

from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data


In [2]:
project_path = "/Users/onature/Desktop/FALL-2024/MATH482/Projects/Twitch-GNN"
data_path  = "/data/twitch_gamers"
node_path = project_path + data_path + "/large_twitch_features.csv"
edge_path = project_path + data_path + "/large_twitch_edges.csv"


In [3]:
df = pd.read_csv(node_path)

df

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0
...,...,...,...,...,...,...,...,...,...
168109,4965,0,810,2016-07-20,2018-10-08,168109,0,EN,0
168110,4128,1,2080,2013-01-31,2018-10-12,168110,0,EN,0
168111,3545,0,1797,2013-11-08,2018-10-10,168111,0,EN,1
168112,892736,1,2135,2012-12-07,2018-10-12,168112,0,EN,0


In [4]:
df = df.drop("updated_at", axis=1)

df['created_at'] = pd.to_datetime(df['created_at'])

df


Unnamed: 0,views,mature,life_time,created_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,0,0,EN,1
1,500,0,2699,2011-05-19,1,0,EN,0
2,382502,1,3149,2010-02-27,2,0,EN,1
3,386,0,1344,2015-01-26,3,0,EN,0
4,2486,0,1784,2013-11-22,4,0,EN,0
...,...,...,...,...,...,...,...,...
168109,4965,0,810,2016-07-20,168109,0,EN,0
168110,4128,1,2080,2013-01-31,168110,0,EN,0
168111,3545,0,1797,2013-11-08,168111,0,EN,1
168112,892736,1,2135,2012-12-07,168112,0,EN,0


In [5]:

reference_date = pd.to_datetime('2018-12-31')  # example reference
df['time_since_creation'] = (reference_date - df['created_at']).dt.days

# 3) One-Hot Encode Language
df_lang = pd.get_dummies(df['language'], prefix='lang')


y_views = df['views'].values  # This is our regression target

# Next, remove "views" from the features so we don't feed the target as input
# Suppose your features are everything else: "mature", "life_time", "time_since_creation", etc.
df_features = pd.concat([
    df[['mature', 'life_time', 'dead_account', 'affiliate',
        'time_since_creation']],
    df_lang
], axis=1)

df_features


Unnamed: 0,mature,life_time,dead_account,affiliate,time_since_creation,lang_CS,lang_DA,lang_DE,lang_EN,lang_ES,...,lang_NL,lang_NO,lang_OTHER,lang_PL,lang_PT,lang_RU,lang_SV,lang_TH,lang_TR,lang_ZH
0,1,969,0,1,1049,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,0,2699,0,0,2783,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,1,3149,0,1,3229,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,0,1344,0,0,1435,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,0,1784,0,0,1865,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168109,0,810,0,0,894,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
168110,1,2080,0,0,2160,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
168111,0,1797,0,1,1879,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
168112,1,2135,0,0,2215,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# 5) Scale the Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(df_features.values)  # shape: [num_nodes, num_features]
x = torch.tensor(X, dtype=torch.float)

# 6) Convert the target to float
y_views = torch.tensor(np.log1p(y_views), dtype=torch.float)


y_views.shape

torch.Size([168114])

In [7]:
edges_df = pd.read_csv(edge_path)
edges_df.head()

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118


In [8]:
edge_index = torch.tensor(edges_df[['numeric_id_1', 'numeric_id_2']].values.T, dtype=torch.long)
edge_index.shape
print(edge_index)

tensor([[ 98343,  98343,  98343,  ..., 151702, 118034,  27819],
        [141493,  58736, 140703,  ..., 128281,  38021, 153993]])


In [9]:

data = Data(
    x=x,                # Node features
    edge_index=edge_index,  # Edge list
    y=y_views           # Target variable (views)
)

data

Data(x=[168114, 26], edge_index=[2, 6797557], y=[168114])

In [10]:

# -----------------------------------
# 6) Train/Validation/Test Split
# -----------------------------------
num_nodes = df_features.shape[0]

all_indices = np.arange(num_nodes)
np.random.shuffle(all_indices)

# Define split ratios
train_ratio, val_ratio = 0.7, 0.15
train_size = int(train_ratio * num_nodes)
val_size = int(val_ratio * num_nodes)

train_idx = torch.tensor(all_indices[:train_size], dtype=torch.long)
val_idx = torch.tensor(all_indices[train_size:train_size + val_size], dtype=torch.long)
test_idx = torch.tensor(all_indices[train_size + val_size:], dtype=torch.long)

# Create boolean masks for PyTorch Geometric
data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)
data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)

data.train_mask[train_idx] = True
data.val_mask[val_idx] = True
data.test_mask[test_idx] = True

# -----------------------------------
# 7) Print Final Data Object
# -----------------------------------
print(data)
# Data(x=[num_nodes, num_features], edge_index=[2, num_edges], y=[num_nodes],
#      train_mask=[num_nodes], val_mask=[num_nodes], test_mask=[num_nodes])




# Save the data object to a file
data_file = f"{project_path}/twitch_graph_data.pt"
torch.save(data, data_file)
print(f"Graph data saved to {data_file}")

Data(x=[168114, 26], edge_index=[2, 6797557], y=[168114], train_mask=[168114], val_mask=[168114], test_mask=[168114])
Graph data saved to /Users/onature/Desktop/FALL-2024/MATH482/Projects/Twitch-GNN/twitch_graph_data.pt
