In [None]:
# Mount Google Drive at /content/drive so the dataset files read below are reachable (Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from community import community_louvain
import math

In [None]:
# Load the edge list, keeping only the Source, Destination and Target columns.
edges = pd.read_csv("/content/drive/MyDrive/ES_Twitch_Dataset/Final_Edges.csv", usecols=['Source', 'Destination', 'Target'])

In [None]:
# Node features.
# They originally came as JSON; an online tool was used to convert them to CSV so they are easier to work with.
# The features have no names, they are only numbered.
features = (
    pd.read_csv("/content/drive/MyDrive/ES_Twitch_Dataset/ES_features.csv")
    .T                                   # transpose: the former column labels become the row index
    .reset_index()                       # ...and that index becomes a regular column named 'index'
    .rename(columns={'index': 'Nodo'})   # which holds the node id
)
features['Nodo'] = features['Nodo'].astype(int)
features.head()

Unnamed: 0,Nodo,0,1,2,3,4,5,6,7,8,...,67,68,69,70,71,72,73,74,75,76
0,0,515.0,1222.0,1466.0,1060.0,653.0,359.0,861.0,920.0,224.0,...,,,,,,,,,,
1,1,1951.0,2770.0,1523.0,861.0,1190.0,846.0,920.0,224.0,810.0,...,,,,,,,,,,
2,2,308.0,3152.0,2212.0,2323.0,2921.0,653.0,920.0,224.0,3097.0,...,,,,,,,,,,
3,3,194.0,433.0,2392.0,330.0,2928.0,473.0,48.0,920.0,224.0,...,,,,,,,,,,
4,4,1726.0,216.0,3059.0,653.0,1093.0,1174.0,2814.0,119.0,1644.0,...,,,,,,,,,,


Junto el dataframe de las aristas con el de features.

Cada fila del df va a tener los 2 nodos y los features correspondientes a cada uno.

In [None]:
# Attach the feature vector of both endpoints to every edge row: the first merge
# brings in the Source node's features, the second the Destination node's
# (pandas suffixes the overlapping columns with _x / _y respectively).
df = (
    edges
    .merge(features, how='inner', left_on='Source', right_on='Nodo')
    .merge(features, how='inner', left_on='Destination', right_on='Nodo')
    .drop(columns=['Nodo_x', 'Nodo_y'])
)
df.head()

Unnamed: 0,Source,Destination,Target,0_x,1_x,2_x,3_x,4_x,5_x,6_x,...,67_y,68_y,69_y,70_y,71_y,72_y,73_y,74_y,75_y,76_y
0,0,1819,1,515.0,1222.0,1466.0,1060.0,653.0,359.0,861.0,...,,,,,,,,,,
1,3,1819,1,194.0,433.0,2392.0,330.0,2928.0,473.0,48.0,...,,,,,,,,,,
2,10,1819,1,839.0,166.0,917.0,1083.0,653.0,210.0,2063.0,...,,,,,,,,,,
3,11,1819,1,3040.0,1079.0,2085.0,2042.0,653.0,856.0,653.0,...,,,,,,,,,,
4,18,1819,1,1199.0,509.0,496.0,861.0,621.0,846.0,920.0,...,,,,,,,,,,


Divido el dataframe en el set que usaré para entrenar el modelo y el que voy a usar para testear. Será un 80% para entrenar y un 20% para testear.

In [None]:
# 80/20 split of the edge rows with a fixed seed so the split is reproducible.
# Note: the whole frame is split, so both halves still carry the 'Target' column next to the features.
x_train, x_test = train_test_split(df , test_size = 0.2, random_state=123)

### Imputación de nulos

In [None]:
# Replace every NaN with the mean of its column.
# The means are computed on the training split only and reused for the test split,
# so no statistic of the test data leaks into the imputation and both splits are
# filled with the same values.
train_means = x_train.mean()
# astype(float) keeps the all-float dtypes that the previous transpose-based
# implementation produced (e.g. Source/Destination stored as floats).
x_train = x_train.fillna(train_means).astype(float)
x_test = x_test.fillna(train_means).astype(float)
x_train.head()

Unnamed: 0,Source,Destination,Target,0_x,1_x,2_x,3_x,4_x,5_x,6_x,...,67_y,68_y,69_y,70_y,71_y,72_y,73_y,74_y,75_y,76_y
9680,4557.0,4142.0,1.0,515.0,221.0,2927.0,653.0,210.0,1649.0,920.0,...,811.979021,1866.34965,2442.097902,770.195804,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0
118520,2854.0,451.0,0.0,1948.0,3152.0,1129.0,861.0,2124.0,48.0,920.0,...,811.979021,1866.34965,2442.097902,770.195804,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0
69296,2643.0,788.0,0.0,1133.0,216.0,2140.0,2928.0,1126.0,2003.0,2814.0,...,811.979021,1866.34965,2442.097902,770.195804,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0
47907,1348.0,2154.0,1.0,1234.0,3152.0,1129.0,2995.0,846.0,1687.0,2282.0,...,811.979021,1866.34965,2442.097902,770.195804,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0
101097,2837.0,1975.0,1.0,1199.0,509.0,2927.0,1769.0,861.0,1955.0,2928.0,...,811.979021,1866.34965,2442.097902,770.195804,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0


## Nuevos features

In [None]:
# Build one directed graph per split, with an edge Source -> Destination for each row of that split.
# NOTE(review): every row becomes an edge regardless of its Target value; if Target == 0 marks a
# non-existent link, those pairs are still added to the graph — confirm this is intended.
grafo_x = nx.from_pandas_edgelist(x_train[['Source','Destination']], source='Source', target='Destination', create_using = nx.DiGraph())
grafo_y = nx.from_pandas_edgelist(x_test[['Source','Destination']], source='Source', target='Destination', create_using = nx.DiGraph())

In [None]:
def aux_count(df, column1, column2):
    """Count the non-null values of `column2` inside each group of `column1`.

    Returns a DataFrame with `column1` as a regular column and the counts in a
    column named ``<column2>_count``.
    """
    per_group = df.groupby([column1])[column2].count()
    per_group = per_group.rename(column2 + '_count')
    return per_group.reset_index()

### Los seguidores/siguiendo (grados) de los nodos.

In [None]:
def _add_degree_features(frame):
    """Return a copy of `frame` with the four degree columns added.

    For every row the following counts are computed within `frame` itself:
      * Following_Src: rows in which the row's Source appears as Source.
      * Followers_Src: rows in which the row's Source appears as Destination.
      * Followers_Dst: rows in which the row's Destination appears as Destination.
      * Following_Dst: rows in which the row's Destination appears as Source.

    Looking the counts up with `map` keeps exactly one output row per input row,
    so no spurious rows have to be trimmed afterwards with a hard-coded length.
    Row order is the input order and the index is reset to 0..n-1.
    """
    times_as_source = frame['Source'].value_counts()
    times_as_destination = frame['Destination'].value_counts()

    out = frame.reset_index(drop=True)
    out['Following_Src'] = out['Source'].map(times_as_source)
    out['Followers_Src'] = out['Source'].map(times_as_destination)
    out['Followers_Dst'] = out['Destination'].map(times_as_destination)
    out['Following_Dst'] = out['Destination'].map(times_as_source)
    # A NaN here means the node has no followers / follows nobody in this split.
    return out.fillna(0)


x_train = _add_degree_features(x_train)
x_test = _add_degree_features(x_test)

x_train.head()

Unnamed: 0,Source,Destination,Target,0_x,1_x,2_x,3_x,4_x,5_x,6_x,...,71_y,72_y,73_y,74_y,75_y,76_y,Following_Src,Followers_Src,Followers_Dst,Following_Dst
0,4557.0,4142.0,1.0,515.0,221.0,2927.0,653.0,210.0,1649.0,920.0,...,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0,20.0,14.0,184.0,291
1,182.0,4142.0,1.0,2858.0,1618.0,2263.0,846.0,2744.0,510.0,2814.0,...,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0,256.0,80.0,184.0,291
2,112.0,4142.0,1.0,1550.0,2060.0,1081.0,861.0,2172.0,2928.0,920.0,...,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0,24.0,8.0,184.0,291
3,1476.0,4142.0,1.0,3040.0,2465.0,2927.0,653.0,1093.0,1174.0,2814.0,...,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0,32.0,18.0,184.0,291
4,1560.0,4142.0,1.0,89.0,1535.0,333.0,1423.0,653.0,1499.0,2919.0,...,2535.545455,436.0,1213.0,1598.0,1260.0,3057.0,56.0,28.0,184.0,291


### Devuelve el seguido

In [None]:
def get_follows_back(grafo, x, y):
    """Return 1 when `grafo` contains the reverse edge y -> x, and 0 otherwise."""
    if grafo.has_edge(y, x):
        return 1
    return 0

# Flag, for every row, whether the Destination also points back at the Source
# inside the directed graph of the same split.
for frame, split_graph in ((x_train, grafo_x), (x_test, grafo_y)):
    frame['Follow_Back'] = frame.apply(
        lambda row: get_follows_back(split_graph, row['Source'], row['Destination']),
        axis = 1,
    )

In [None]:
# Distribution of the Follow_Back flag in the training split.
x_train['Follow_Back'].value_counts()

0    94622
1      372
Name: Follow_Back, dtype: int64

### Comunidades

In [None]:
# Undirected versions of the per-split graphs (nx.Graph instead of nx.DiGraph).
grafo_x_2 = nx.from_pandas_edgelist(x_train[['Source','Destination']], source='Source', target='Destination', create_using = nx.Graph())
grafo_y_2 = nx.from_pandas_edgelist(x_test[['Source','Destination']], source='Source', target='Destination', create_using = nx.Graph())

In [None]:
# Louvain community detection is randomized; fixing random_state makes the
# resulting node -> community-id mapping reproducible across runs.
# NOTE(review): the two partitions are computed independently on different graphs,
# so a given community id in comms_x presumably does not denote the same community
# as that id in comms_y — confirm before comparing them across splits.
comms_x = community_louvain.best_partition(grafo_x_2, random_state=123)
comms_y = community_louvain.best_partition(grafo_y_2, random_state=123)

In [None]:
# Look up the community id of both endpoints of every row, using the partition
# computed for the same split.
for frame, partition in ((x_train, comms_x), (x_test, comms_y)):
    frame['Comm_Src'] = frame['Source'].map(partition)
    frame['Comm_Dst'] = frame['Destination'].map(partition)

In [None]:
# Number of training rows whose Source falls in each community.
x_train['Comm_Src'].value_counts()

4    25863
0    19904
2    14099
1    13062
5    11149
3     8028
6     2889
Name: Comm_Src, dtype: int64

### Camino más corto

Busco el camino más corto entre los nodos (sin contar la unión directa)

In [None]:
def get_shortest_path(grafo, x, y):
    """Length of the shortest path from `x` to `y`, ignoring the direct edge x -> y.

    If the direct edge exists it is removed for the duration of the search and
    always put back afterwards, even when no alternative path exists, so the
    graph is left unchanged for later computations.

    Returns:
        The path length, or -1 when `y` cannot be reached from `x` without the
        direct edge or when either node is not in the graph.
    """
    had_direct_edge = grafo.has_edge(x, y)
    if had_direct_edge:
        grafo.remove_edge(x, y)
    try:
        return nx.shortest_path_length(grafo, source=x, target=y)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return -1
    finally:
        # Restore the edge on every exit path; skipping this on failure would
        # permanently drop the edge from the graph.
        if had_direct_edge:
            grafo.add_edge(x, y)

In [None]:
# Shortest alternative path between the endpoints of every row, measured on the
# directed graph of the same split.
for frame, split_graph in ((x_train, grafo_x), (x_test, grafo_y)):
    frame['Shortest_Path'] = frame.apply(
        lambda row: get_shortest_path(split_graph, row['Source'], row['Destination']),
        axis = 1,
    )

In [None]:
# Distribution of path lengths in the training split (-1 means no alternative path was found).
x_train['Shortest_Path'].value_counts()

 3    47821
 2    33615
 4    13527
 5       30
-1        1
Name: Shortest_Path, dtype: int64

### Indice de Katz

In [None]:
# Katz centrality of every node, computed per split on the directed graphs.
# Uses the documented top-level nx.katz_centrality entry point rather than
# reaching through the nx.katz submodule attribute.
katz_x = nx.katz_centrality(grafo_x, alpha = 0.005, beta = 1)
katz_y = nx.katz_centrality(grafo_y, alpha = 0.005, beta = 1)

In [None]:
# Attach the Katz score of both endpoints to every row (None when the node has no score).
for frame, katz_scores in ((x_train, katz_x), (x_test, katz_y)):
    for endpoint, column in (('Source', 'Katz_Src'), ('Destination', 'Katz_Dst')):
        frame[column] = frame[endpoint].apply(lambda node: katz_scores.get(node))

### Hits

In [None]:
# HITS on the directed graph of each split; nx.hits returns the hub scores and the authority scores.
hubs_x, authorities_x = nx.hits(grafo_x)
hubs_y, authorities_y = nx.hits(grafo_y)

In [None]:
# Attach the hub and authority score of both endpoints to every row,
# using the HITS results of the same split.
for frame, hub_scores, authority_scores in (
    (x_train, hubs_x, authorities_x),
    (x_test, hubs_y, authorities_y),
):
    for endpoint, suffix in (('Source', 'Src'), ('Destination', 'Dst')):
        frame['Hubs_' + suffix] = frame[endpoint].apply(lambda node: hub_scores.get(node))
        frame['Authorities_' + suffix] = frame[endpoint].apply(lambda node: authority_scores.get(node))

### Page Rank

In [None]:
# PageRank of every node, computed per split on the directed graphs with networkx defaults.
page_rank_x = nx.pagerank(grafo_x)
page_rank_y = nx.pagerank(grafo_y)

In [None]:
# Attach the PageRank of both endpoints to every row (None when the node has no score).
for frame, rank_scores in ((x_train, page_rank_x), (x_test, page_rank_y)):
    for endpoint, column in (('Source', 'PageRank_Src'), ('Destination', 'PageRank_Dst')):
        frame[column] = frame[endpoint].apply(lambda node: rank_scores.get(node))

### Distancia Jaccard

In [None]:
# Para seguidos
def jaccard_for_followees(graph, a, b):
    """Jaccard similarity of the followee (successor) sets of `a` and `b`.

    Computes |S(a) & S(b)| / |S(a) | S(b)| on the directed `graph`.

    Returns:
        0 when either node is missing from the graph or follows nobody,
        otherwise a float in [0, 1].
    """
    if a not in graph or b not in graph:
        return 0
    followees_a = set(graph.successors(a))
    followees_b = set(graph.successors(b))
    # Logical `or`: the former `== 0 | ... == 0` was parsed as a chained
    # comparison around a bitwise OR and only fired when both sets were empty.
    if not followees_a or not followees_b:
        return 0
    return len(followees_a & followees_b) / len(followees_a | followees_b)

# Para seguidores
def jaccard_for_followers(graph, a, b):
    """Jaccard similarity of the follower (predecessor) sets of `a` and `b`.

    Computes |P(a) & P(b)| / |P(a) | P(b)| on the directed `graph`.

    Returns:
        0 when either node is missing from the graph or has no followers,
        otherwise a float in [0, 1].
    """
    if a not in graph or b not in graph:
        return 0
    followers_a = set(graph.predecessors(a))
    followers_b = set(graph.predecessors(b))
    # The previous version referenced an undefined name `g` here; the resulting
    # NameError was swallowed by a bare `except`, so it returned 0 for every pair.
    if not followers_a or not followers_b:
        return 0
    return len(followers_a & followers_b) / len(followers_a | followers_b)

In [None]:
# Jaccard scores of the follower and followee sets of both endpoints of every row,
# computed on the directed graph of the same split.
for frame, split_graph in ((x_train, grafo_x), (x_test, grafo_y)):
    frame['Jaccard_Followers'] = frame.apply(
        lambda row: jaccard_for_followers(split_graph, row['Source'], row['Destination']), axis=1)
    frame['Jaccard_Followees'] = frame.apply(
        lambda row: jaccard_for_followees(split_graph, row['Source'], row['Destination']), axis=1)

### Distancia Coseno

In [None]:
# Para seguidos
def cosine_for_followees(graph, a, b):
    """Cosine similarity of the followee (successor) sets of `a` and `b`.

    Computes |S(a) & S(b)| / sqrt(|S(a)| * |S(b)|) on the directed `graph`.

    Returns:
        0 when either node is missing from the graph or follows nobody,
        otherwise a float in [0, 1].
    """
    if a not in graph or b not in graph:
        return 0
    followees_a = set(graph.successors(a))
    followees_b = set(graph.successors(b))
    # Explicit empty-set guard instead of relying on a swallowed ZeroDivisionError.
    if not followees_a or not followees_b:
        return 0
    return len(followees_a & followees_b) / math.sqrt(len(followees_a) * len(followees_b))

# Para seguidores
def cosine_for_followers(graph, a, b):
    """Cosine similarity of the follower (predecessor) sets of `a` and `b`.

    Computes |P(a) & P(b)| / sqrt(|P(a)| * |P(b)|) on the directed `graph`.

    Returns:
        0 when either node is missing from the graph or has no followers,
        otherwise a float in [0, 1].
    """
    if a not in graph or b not in graph:
        return 0
    followers_a = set(graph.predecessors(a))
    followers_b = set(graph.predecessors(b))
    if not followers_a or not followers_b:
        return 0
    # The square root covers the whole product; a misplaced parenthesis used to
    # compute sqrt(|P(a)|) * |P(b)|, which deflated the score.
    return len(followers_a & followers_b) / math.sqrt(len(followers_a) * len(followers_b))

In [None]:
# Cosine scores of the follower and followee sets of both endpoints of every row,
# computed on the directed graph of the same split.
for frame, split_graph in ((x_train, grafo_x), (x_test, grafo_y)):
    frame['Cosine_Followers'] = frame.apply(
        lambda row: cosine_for_followers(split_graph, row['Source'], row['Destination']), axis=1)
    frame['Cosine_Followees'] = frame.apply(
        lambda row: cosine_for_followees(split_graph, row['Source'], row['Destination']), axis=1)

### Preferential Attachment

In [None]:
#Para seguidos
def preferential_followees(graph, a, b):
    """Preferential-attachment score over followees: |S(a)| * |S(b)|.

    S(n) is the set of successors of `n` in the directed `graph`.

    Returns:
        0 when either node is missing from the graph or follows nobody,
        otherwise the product of the two followee counts.
    """
    # Explicit membership check replaces the bare `except`, and the bitwise `|`
    # condition is gone: an empty set already makes the product 0.
    if a not in graph or b not in graph:
        return 0
    followees_a = set(graph.successors(a))
    followees_b = set(graph.successors(b))
    return len(followees_a) * len(followees_b)

#Para seguidores
def preferential_followers(graph, a, b):
    """Preferential-attachment score over followers: |P(a)| * |P(b)|.

    P(n) is the set of predecessors of `n` in the directed `graph`.

    Returns:
        0 when either node is missing from the graph or has no followers,
        otherwise the product of the two follower counts.
    """
    # Explicit membership check replaces the bare `except`, and the bitwise `|`
    # condition is gone: an empty set already makes the product 0.
    if a not in graph or b not in graph:
        return 0
    followers_a = set(graph.predecessors(a))
    followers_b = set(graph.predecessors(b))
    return len(followers_a) * len(followers_b)

In [None]:
# Preferential-attachment scores over followers and followees for every row,
# computed on the directed graph of the same split.
for frame, split_graph in ((x_train, grafo_x), (x_test, grafo_y)):
    frame['Preferential_Followers'] = frame.apply(
        lambda row: preferential_followers(split_graph, row['Source'], row['Destination']), axis=1)
    frame['Preferential_Followees'] = frame.apply(
        lambda row: preferential_followees(split_graph, row['Source'], row['Destination']), axis=1)

Con esto termino y estoy listo para entrenar los distintos modelos.

In [None]:
# Preview of the final training frame with all engineered features.
x_train.head()

Unnamed: 0,Source,Destination,Target,0_x,1_x,2_x,3_x,4_x,5_x,6_x,...,Hubs_Dst,Authorities_Dst,PageRank_Src,PageRank_Dst,Jaccard_Followers,Jaccard_Followees,Cosine_Followers,Cosine_Followees,Preferential_Followers,Preferential_Followees
0,4557.0,4142.0,1.0,515.0,221.0,2927.0,653.0,210.0,1649.0,920.0,...,0.002854,0.001849,0.00013,0.001153,0,0.023026,0.001453,0.091756,2576,5820
1,182.0,4142.0,1.0,2858.0,1618.0,2263.0,846.0,2744.0,510.0,2814.0,...,0.002854,0.001849,0.000449,0.001153,0,0.207506,0.017621,0.344399,14720,74496
2,112.0,4142.0,1.0,1550.0,2060.0,1081.0,861.0,2172.0,2928.0,920.0,...,0.002854,0.001849,0.0001,0.001153,0,0.016129,0.001921,0.05983,1472,6984
3,1476.0,4142.0,1.0,3040.0,2465.0,2927.0,653.0,1093.0,1174.0,2814.0,...,0.002854,0.001849,0.00013,0.001153,0,0.012539,0.005124,0.041451,3312,9312
4,1560.0,4142.0,1.0,89.0,1535.0,333.0,1423.0,653.0,1499.0,2919.0,...,0.002854,0.001849,0.000241,0.001153,0,0.054711,0.006162,0.141004,5152,16296


In [None]:
# Persist both splits to Drive. No `index` argument is passed, so pandas also writes
# the row index as an unnamed first column.
x_train.to_csv('/content/drive/MyDrive/ES_Twitch_Dataset/Train_Final.csv')
x_test.to_csv('/content/drive/MyDrive/ES_Twitch_Dataset/Test_Final.csv')