In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import scipy

# Import Data

In [2]:
df = pd.read_csv('D:/1 kuliah/0 SKRIPSI/data/data_agama.csv', low_memory=False)
df.shape

(506386, 12)

# Preprocessing

In [3]:
df.isna().sum()

tweetID           428
screen_name       498
nretweets      208952
nlikes         208952
nreplies       208952
tweet             105
lat            478200
lon            478200
location       465877
address        440903
isretweet       47542
created_at      56247
dtype: int64

In [4]:
df = df.dropna(subset=['screen_name', 'tweet'])
df.isna().sum()

tweetID             0
screen_name         0
nretweets      208450
nlikes         208450
nreplies       208450
tweet               0
lat            477698
lon            477698
location       465375
address        440401
isretweet       47040
created_at      55745
dtype: int64

In [5]:
df.dtypes

tweetID         object
screen_name     object
nretweets      float64
nlikes         float64
nreplies       float64
tweet           object
lat            float64
lon            float64
location        object
address         object
isretweet      float64
created_at      object
dtype: object

In [6]:
df['created_at'] = pd.to_datetime(df['created_at'])
df.dtypes

tweetID                object
screen_name            object
nretweets             float64
nlikes                float64
nreplies              float64
tweet                  object
lat                   float64
lon                   float64
location               object
address                object
isretweet             float64
created_at     datetime64[ns]
dtype: object

In [7]:
df.to_csv('data_agama_clean.csv')

# Load Data setelah Preprocessing

In [3]:
df = pd.read_csv('C:/Users/naila/0 SKRIPSI/data_agama_clean.csv', low_memory=False)
df.shape

(505884, 13)

In [4]:
df['created_at'] = pd.to_datetime(df['created_at'])
df.dtypes

Unnamed: 0              int64
tweetID                object
screen_name            object
nretweets             float64
nlikes                float64
nreplies              float64
tweet                  object
lat                   float64
lon                   float64
location               object
address                object
isretweet             float64
created_at     datetime64[ns]
dtype: object

# Slice setiap tahun

In [5]:
data_2009 = df[df['created_at'].dt.year == 2009]
data_2010 = df[df['created_at'].dt.year == 2010]
data_2011 = df[df['created_at'].dt.year == 2011]
data_2012 = df[df['created_at'].dt.year == 2012]
data_2013 = df[df['created_at'].dt.year == 2013]
data_2014 = df[df['created_at'].dt.year == 2014]
data_2015 = df[df['created_at'].dt.year == 2015]
data_2016 = df[df['created_at'].dt.year == 2016]
data_2017 = df[df['created_at'].dt.year == 2017]
data_2018 = df[df['created_at'].dt.year == 2018]
data_2019 = df[df['created_at'].dt.year == 2019]

In [6]:
print(data_2009.shape, data_2010.shape, data_2011.shape, data_2012.shape, data_2013.shape, data_2014.shape,
      data_2015.shape, data_2016.shape, data_2017.shape, data_2018.shape, data_2019.shape)

(4058, 13) (20380, 13) (49107, 13) (73436, 13) (78468, 13) (79930, 13) (55425, 13) (34091, 13) (32450, 13) (18777, 13) (4014, 13)


# Graf

In [7]:
import re

def extract_mentions(tweet_text):
    mentions = [re.sub(r'[@~`!#$%^&*-+=\\\|"/.,:;]', '', word) for word in tweet_text.split() if word.startswith('@')]
    return mentions

## Graf Setiap Tahun (Directed)

In [8]:
# Membuat 10 objek graf, masing-masing untuk tahun 2009-2019
graphs_by_year = {}
for year in range(2009, 2020):
    graphs_by_year[year] = nx.DiGraph()

# Menambahkan tautan (edge) berdasarkan mention user dalam tweet untuk setiap tahun
for _, row in df.iterrows():
    tweet_text = row['tweet']
    mentions = extract_mentions(tweet_text)
    
    # Cek apakah nilai tahun tidak kosong (NaN)
    if pd.notna(row['created_at']):
        year = row['created_at'].year
        # Menambahkan nodes
        for mention in mentions:
            if mention:  # Memastikan mention tidak kosong (hanya akan menambahkan node dengan label yang valid)
                graphs_by_year[year].add_node(row['screen_name'])
                graphs_by_year[year].add_node(mention)
                if mention != row['screen_name']:  # Memastikan tidak ada self-loop (misalnya @user menyebut dirinya sendiri)
                    if graphs_by_year[year].has_edge(row['screen_name'], mention):
                        # Jika edge sudah ada, tambahkan bobot (weight) sebesar 1
                        graphs_by_year[year][row['screen_name']][mention]['weight'] += 1
                    else:
                        # Jika edge belum ada, buat edge baru dengan bobot (weight) 1
                        graphs_by_year[year].add_edge(row['screen_name'], mention, weight=1)

In [9]:
for year in range(2009, 2020):
    graph = graphs_by_year[year]
    num_nodes = graph.number_of_nodes()
    num_edges = graph.number_of_edges()

    node_df = pd.DataFrame(list(graph.nodes()), columns=['user'])
    
    print(f"Year {year}: Nodes={num_nodes}, Edges={num_edges}")

Year 2009: Nodes=2476, Edges=1591
Year 2010: Nodes=14407, Edges=10620
Year 2011: Nodes=26516, Edges=20006
Year 2012: Nodes=25850, Edges=19426
Year 2013: Nodes=20773, Edges=16402
Year 2014: Nodes=17908, Edges=15380
Year 2015: Nodes=12124, Edges=11456
Year 2016: Nodes=6410, Edges=6348
Year 2017: Nodes=5186, Edges=5671
Year 2018: Nodes=3914, Edges=4791
Year 2019: Nodes=1032, Edges=1030


# Katz Centrality (Directed)

In [10]:
# Kamus untuk menyimpan hasil sentralitas Katz Centrality untuk setiap tahun
katz_centrality_by_year = {}

# Loop untuk menghitung sentralitas Katz Centrality untuk setiap tahun dari 2009 hingga 2019
for year, graph in graphs_by_year.items():
    katz_centrality = nx.katz_centrality(graph, alpha=0.1, beta=1.0, max_iter=5000)
    katz_centrality_by_year[year] = katz_centrality

In [11]:
top_10_katz_centrality_by_year = pd.DataFrame()
for year, katz_centrality in katz_centrality_by_year.items():
    # Sort katz_centrality menjadi descending order dan ambil 10 user pertama
    yearly = pd.DataFrame(list(katz_centrality_by_year[year].items()), columns=['Node', 'Katz Centrality'])
    top_10 = yearly.sort_values(by='Katz Centrality', ascending=False).head(10).reset_index(drop=True)
    top_10['Year'] = year
    # Simpan hasilnya ke dalam list top_10_users_by_year
    top_10_katz_centrality_by_year = pd.concat([top_10_katz_centrality_by_year, top_10])

In [12]:
top_30_katz_centrality_by_year = pd.DataFrame()
for year, katz_centrality in katz_centrality_by_year.items():
    # Sort katz_centrality menjadi descending order dan ambil 10 user pertama
    yearly = pd.DataFrame(list(katz_centrality_by_year[year].items()), columns=['Node', 'Katz Centrality'])
    top_30 = yearly.sort_values(by='Katz Centrality', ascending=False).head(30).reset_index(drop=True)
    top_30['Year'] = year
    # Simpan hasilnya ke dalam list top_10_users_by_year
    top_30_katz_centrality_by_year = pd.concat([top_30_katz_centrality_by_year, top_30])

In [13]:
top_30_katz_centrality_by_year

Unnamed: 0,Node,Katz Centrality,Year
0,Metro_TV,0.037665,2009
1,tifsembiring,0.034087,2009
2,vincentrompies,0.032618,2009
3,addthis,0.030132,2009
4,radityadika,0.028625,2009
...,...,...,...
25,KoRnHurry,0.038483,2019
26,mohmahfudmd,0.038423,2019
27,PDemokrat,0.038183,2019
28,basuki_btp,0.038125,2019


In [36]:
node_frequencies = top_30_katz_centrality_by_year['Node'].value_counts()
top_nodes = node_frequencies.index[:5].tolist()

In [37]:
full_graph_katz = nx.DiGraph()
for graph in graphs_by_year.values():
    full_graph_katz = nx.compose(full_graph_katz, graph)

In [38]:
top_nodes

['YouTube', 'Metro_TV', 'ulil', 'mohmahfudmd', 'KPK_RI']

In [39]:
subgraph_katz = full_graph_katz.subgraph(top_nodes)

In [40]:
subgraph_edges = list(subgraph_katz.edges())

edges_data = []
for edge in subgraph_edges:
    source, target = edge
    # Ganti bagian ini dengan cara Anda mendapatkan informasi tentang tepi (edge) dari subgraf
    edges_data.append({'Node': source, 'Target': target})

edges_df = pd.DataFrame(edges_data)

In [41]:
edges_df

Unnamed: 0,Node,Target
0,ulil,Metro_TV
1,Metro_TV,KPK_RI
2,mohmahfudmd,KPK_RI
3,mohmahfudmd,Metro_TV


In [27]:
top_30_katz_centrality_by_year.to_csv('Top 30 Users with Katz Centrality.csv')

# Laplacian Centrality

In [20]:
import networkit as nk

In [21]:
# Kamus untuk menyimpan hasil sentralitas Katz Centrality untuk setiap tahun
nx_to_nk = {}
laplacian_centrality_by_year = {}

# Loop untuk menghitung sentralitas Laplacian Centrality untuk setiap tahun dari 2009 hingga 2019
for year, graph in graphs_by_year.items():
    nx_to_nk[year] = nk.nxadapter.nx2nk(graphs_by_year[year])
    laplacian_centrality = nk.centrality.LaplacianCentrality(nx_to_nk[year]).run()
    laplacian_centrality_by_year[year] = laplacian_centrality

In [22]:
# Membuat DataFrame untuk hasil akhir
top_30_laplacian_by_year = pd.DataFrame()

# Loop untuk setiap tahun dari 2009 hingga 2019
for year in range(2009, 2020):
    centrality_year = pd.DataFrame(laplacian_centrality_by_year[year].ranking(), columns=['ID', 'Laplacian Centrality'])
    index = pd.DataFrame(range(0, 2476), columns=['ID'])
    nodes_year = pd.DataFrame(graphs_by_year[year].nodes, columns=['Username'])
    nama = pd.concat([index, nodes_year], axis=1)
    merged_table = nama.merge(centrality_year, on='ID', how='left')
    top_30_laplacian = merged_table.sort_values(by='Laplacian Centrality', ascending=False).head(30)

    # Menambahkan kolom tahun ke hasil DataFrame
    top_30_laplacian['Year'] = year

    # Menggabungkan hasil untuk setiap tahun
    top_30_laplacian_by_year = pd.concat([top_30_laplacian_by_year, top_30_laplacian], ignore_index=True)

# Menampilkan hasil akhir
top_30_laplacian_by_year

Unnamed: 0,ID,Username,Laplacian Centrality,Year
0,1420.0,FasJuw,56.0,2009
1,448.0,ribosa,52.0,2009
2,161.0,chibialfa,48.0,2009
3,887.0,vinskatan,44.0,2009
4,135.0,raynardrheda,42.0,2009
...,...,...,...,...
325,193.0,tandanVradio,68.0,2019
326,495.0,oaseminang,62.0,2019
327,812.0,katakitatweet,58.0,2019
328,570.0,nfia82,56.0,2019


In [42]:
node_frequenciess = top_30_laplacian_by_year['Username'].value_counts()
top_nodess = node_frequenciess.index[:5].tolist()

In [43]:
top_nodess

['detikcom', 'fahiraidris', 'sudjiwotedjo', 'fadjroeL', 'M4ngU5il']

In [44]:
full_graph_laplacian = nx.DiGraph()
for graph in graphs_by_year.values():
    full_graph_laplacian = nx.compose(full_graph_laplacian, graph)

In [45]:
subgraph_laplacian = full_graph_laplacian.subgraph(top_nodess)
subgraph_edgess = list(subgraph_laplacian.edges())

edges_datas = []
for edge in subgraph_edgess:
    source, target = edge
    # Ganti bagian ini dengan cara Anda mendapatkan informasi tentang tepi (edge) dari subgraf
    edges_datas.append({'Node': source, 'Target': target})

edges_dfs = pd.DataFrame(edges_datas)

In [46]:
edges_dfs

Unnamed: 0,Node,Target
0,fadjroeL,sudjiwotedjo
1,fadjroeL,detikcom
2,fahiraidris,sudjiwotedjo
3,fahiraidris,fadjroeL
4,fahiraidris,M4ngU5il
5,sudjiwotedjo,M4ngU5il
6,sudjiwotedjo,detikcom


In [37]:
top_30_laplacian_by_year.to_csv('Top 30 Users with Laplacian Centrality.csv')

## Graf Undirected

In [7]:
# Membuat 10 objek graf, masing-masing untuk tahun 2009-2019
graphs_by_years = {}
for year in range(2009, 2020):
    graphs_by_years[year] = nx.Graph()

# Menambahkan tautan (edge) berdasarkan mention user dalam tweet untuk setiap tahun
for _, row in df.iterrows():
    tweet_text = row['tweet']
    mentions = extract_mentions(tweet_text)
    
    # Cek apakah nilai tahun tidak kosong (NaN)
    if pd.notna(row['created_at']):
        year = row['created_at'].year
        # Menambahkan nodes
        for mention in mentions:
            if mention:  # Memastikan mention tidak kosong (hanya akan menambahkan node dengan label yang valid)
                graphs_by_years[year].add_node(row['screen_name'])
                graphs_by_years[year].add_node(mention)
                if mention != row['screen_name']:  # Memastikan tidak ada self-loop (misalnya @user menyebut dirinya sendiri)
                    if graphs_by_years[year].has_edge(row['screen_name'], mention):
                        # Jika edge sudah ada, tambahkan bobot (weight) sebesar 1
                        graphs_by_years[year][row['screen_name']][mention]['weight'] += 1
                    else:
                        # Jika edge belum ada, buat edge baru dengan bobot (weight) 1
                        graphs_by_years[year].add_edge(row['screen_name'], mention, weight=1)

In [7]:
np.round(max(nx.adjacency_spectrum(graphs_by_years[2009])), 3)

(5.216+0j)

In [None]:
np.round(max(nx.adjacency_spectrum(graphs_by_years[2010])), 3)

In [42]:
katz_centrality = nx.katz_centrality(graphs_by_years[2009], alpha=0.1, beta=1.0, max_iter=5000)

In [43]:
yearly = pd.DataFrame(list(katz_centrality.items()), columns=['Node', 'Katz Centrality'])
top_10 = yearly.sort_values(by='Katz Centrality', ascending=False).head(20).reset_index(drop=True)
top_10

Unnamed: 0,Node,Katz Centrality
0,Metro_TV,0.041886
1,tifsembiring,0.037902
2,ShafiqPontoh,0.033742
3,vincentrompies,0.0335
4,ribosa,0.033195
5,si_ali,0.031834
6,FasJuw,0.031742
7,chibialfa,0.030504
8,FerdyLou,0.030501
9,vinskatan,0.030251


In [45]:
katz_centrality2 = nx.katz_centrality(graphs_by_years[2010], alpha=0.01, beta=1.0, max_iter=5000)

In [46]:
yearly = pd.DataFrame(list(katz_centrality2.items()), columns=['Node', 'Katz Centrality'])
top_10 = yearly.sort_values(by='Katz Centrality', ascending=False).head(20).reset_index(drop=True)
top_10

Unnamed: 0,Node,Katz Centrality
0,ulil,0.023919
1,fiksimini,0.01824
2,salimafillah,0.016576
3,ayatquran,0.012661
4,SoalCINTA,0.012487
5,gm_gm,0.01242
6,Metro_TV,0.012369
7,detikcom,0.012047
8,syukronamin,0.011187
9,fahiraidris,0.011128


In [48]:
katz_centrality3 = nx.katz_centrality(graphs_by_years[2011], alpha=0.01, beta=1.0, max_iter=5000)

In [49]:
yearly = pd.DataFrame(list(katz_centrality3.items()), columns=['Node', 'Katz Centrality'])
top_10 = yearly.sort_values(by='Katz Centrality', ascending=False).head(20).reset_index(drop=True)
top_10

Unnamed: 0,Node,Katz Centrality
0,salimafillah,0.018547
1,detikcom,0.01408
2,ipphoright,0.011821
3,ulil,0.011806
4,sudjiwotedjo,0.011542
5,fiksimini,0.011485
6,benny_israel,0.010953
7,syukronamin,0.010731
8,TerimakasihIBU,0.00989
9,GunRomli,0.009845


# Katz Centrality (Undirected)

In [30]:
# Kamus untuk menyimpan hasil sentralitas Katz Centrality untuk setiap tahun
katz_centrality_by_years = {}

# Loop untuk menghitung sentralitas Katz Centrality untuk setiap tahun dari 2009 hingga 2019
for year, graph in graphs_by_years.items():
    katz_centrality = nx.katz_centrality(graph, alpha=0.01, beta=1.0, max_iter=5000)
    katz_centrality_by_years[year] = katz_centrality

In [31]:
top_10_katz_centrality_by_years = pd.DataFrame()
for year, katz_centrality in katz_centrality_by_years.items():
    # Sort katz_centrality menjadi descending order dan ambil 10 user pertama
    yearly = pd.DataFrame(list(katz_centrality_by_years[year].items()), columns=['Node', 'Katz Centrality'])
    top_10 = yearly.sort_values(by='Katz Centrality', ascending=False).head(10).reset_index(drop=True)
    top_10['Year'] = year
    # Simpan hasilnya ke dalam list top_10_users_by_year
    top_10_katz_centrality_by_years = pd.concat([top_10_katz_centrality_by_years, top_10])

In [32]:
top_10_katz_centrality_by_years

Unnamed: 0,Node,Katz Centrality,Year
0,Metro_TV,0.022054,2009
1,tifsembiring,0.021474,2009
2,ShafiqPontoh,0.021259,2009
3,vincentrompies,0.021257,2009
4,FasJuw,0.021243,2009
...,...,...,...
5,MahesaTiwi,0.037730,2019
6,sandiuno,0.036233,2019
7,GetrichEnts,0.035982,2019
8,fadlizon,0.035910,2019


# Laplacian Centrality (Undirected)

In [34]:
import networkit as nk

In [35]:
# Kamus untuk menyimpan hasil sentralitas Katz Centrality untuk setiap tahun
nx_to_nks = {}
laplacian_centrality_by_years = {}

# Loop untuk menghitung sentralitas Laplacian Centrality untuk setiap tahun dari 2009 hingga 2019
for year, graph in graphs_by_years.items():
    nx_to_nks[year] = nk.nxadapter.nx2nk(graphs_by_years[year])
    laplacian_centrality = nk.centrality.LaplacianCentrality(nx_to_nks[year]).run()
    laplacian_centrality_by_years[year] = laplacian_centrality

In [37]:
# Membuat DataFrame untuk hasil akhir
top_10_laplacian_by_years = pd.DataFrame()

# Loop untuk setiap tahun dari 2009 hingga 2019
for year in range(2009, 2020):
    centrality_year = pd.DataFrame(laplacian_centrality_by_years[year].ranking(), columns=['ID', 'Laplacian Centrality'])
    index = pd.DataFrame(range(0, 2476), columns=['ID'])
    nodes_year = pd.DataFrame(graphs_by_years[year].nodes, columns=['Username'])
    nama = pd.concat([index, nodes_year], axis=1)
    merged_table = nama.merge(centrality_year, on='ID', how='left')
    top_10_laplacian = merged_table.sort_values(by='Laplacian Centrality', ascending=False).head(10)

    # Menambahkan kolom tahun ke hasil DataFrame
    top_10_laplacian['Year'] = year

    # Menggabungkan hasil untuk setiap tahun
    top_10_laplacian_by_years = pd.concat([top_10_laplacian_by_years, top_10_laplacian], ignore_index=True)

# Menampilkan hasil akhir
top_10_laplacian_by_years

Unnamed: 0,ID,Username,Laplacian Centrality,Year
0,2027.0,Metro_TV,162.0,2009
1,405.0,tifsembiring,118.0,2009
2,277.0,ShafiqPontoh,86.0,2009
3,448.0,ribosa,84.0,2009
4,523.0,vincentrompies,84.0,2009
...,...,...,...,...
105,105.0,AB_1_X_R,722.0,2019
106,18.0,sandiuno,526.0,2019
107,20.0,fadlizon,482.0,2019
108,511.0,SRufamin,398.0,2019
