# Agreement Mapping

In [1]:
from multiprocessing.pool import ThreadPool
import numpy as np
import pandas as pd
import torch

# Load the RoBERTa-large checkpoint fine-tuned on MNLI through torch.hub.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
# Put the model in evaluation mode: a freshly loaded module is in training
# mode, where dropout makes the predictions non-deterministic.
roberta.eval()

In [6]:
def parallelizza_agreement(lista_convos):
    """Classify the relation between the two strings of a pair with RoBERTa MNLI.

    Parameters:
        lista_convos: two-element sequence of strings. Element 0 is passed to
            ``roberta.encode`` as the first sentence, element 1 as the second.

    Returns:
        A one-element list holding the argmax of the MNLI prediction
        (0: contradiction, 1: neutral, 2: entailment), or the string
        ``'Error'`` when encoding or prediction fails.

    As a lightweight progress indicator, the first string of the pair is
    printed with a probability of 1%.
    """
    original = lista_convos[0]
    reply = lista_convos[1]

    try:
        # Encoding is inside the try as well, so that a pair that cannot be
        # tokenised is marked 'Error' instead of aborting the whole map.
        tokens = roberta.encode(original, reply)
        label = roberta.predict('mnli', tokens).argmax()  # 0: contradiction
    except Exception:
        # Deliberate best effort: a failing pair must not stop the batch.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        label = 'Error'

    choice = np.random.choice(['y', 'n'], p=[1/100, 99/100])
    if choice == 'y':
        print(original)

    return [label]


def estrai_tensore(tensore):
    """Return the Python scalar held by a one-element tensor-like object.

    Anything that has no usable ``.item()`` (for instance the ``'Error'``
    marker string) or that cannot be reduced to a single scalar is returned
    unchanged.
    """
    try:
        return tensore.item()
    except (AttributeError, TypeError, ValueError, RuntimeError):
        # AttributeError/TypeError: no callable .item();
        # ValueError/RuntimeError: more than one element.
        return tensore
    

def remap_agreement(x):
    """Remap a RoBERTa Large MNLI label to a signed agreement value.

    Roberta Large MNLI maps agreements the following way:
        0: contradiction
        1: neutral
        2: entailment.
    For the sake of our analysis we will remap them as follows:
        0: -1
        1: +1
        2: +1

    The string 'Error' (the marker used when the prediction failed) is passed
    through unchanged so that those rows can be filtered out afterwards
    instead of raising a KeyError here. Any other unknown label still raises
    KeyError.
    """
    if isinstance(x, str) and x == 'Error':
        return x

    agreement_mapper = {0: -1, 1: 1, 2: 1}
    return agreement_mapper[x]

In [3]:
# Load the tweets dataset from the processed-data folder.
df = pd.read_json(r'./data/dataset_original_processed/tweets_dataviz_2.json')

In [3]:
# Keep only the two text columns and de-duplicate, so that every distinct
# (text, originalTweetContent) pair is scored once.
df_for_agreement = df[['text', 'originalTweetContent'
                      ]].drop_duplicates().reset_index(drop=True)

In [4]:
# Build the list of (text, originalTweetContent) pairs to score.
# NOTE(review): parallelizza_agreement names element 0 `original` and element 1
# `reply`, while here element 0 is 'text' and element 1 is
# 'originalTweetContent' -- confirm the intended premise/hypothesis order.
lista_convos = list(zip(df_for_agreement['text'], df_for_agreement['originalTweetContent']))

In [None]:
# Score every pair with a pool of worker threads. The context manager shuts
# the pool down once the (blocking) map call has returned, so the threads are
# not left running for the rest of the session.
with ThreadPool() as pool:
    results = pool.map(parallelizza_agreement, lista_convos)

In [14]:
# Rebuild a DataFrame from the scored pairs; row order matches `results`.
checkpoint = pd.DataFrame(lista_convos, columns=['text', 'originalTweetContent'])

In [16]:
# Each result is a one-element list: unwrap it, turn tensors into plain
# scalars, then remap the MNLI label to the signed agreement value.
checkpoint['agreement'] = results
for step in (lambda x: x[0], estrai_tensore, remap_agreement):
    checkpoint['agreement'] = checkpoint['agreement'].apply(step)

In [None]:
# "Error" was our catch-all response in case roberta agreement detection failed
# so now we drop every row containing it
valid_rows = checkpoint['agreement'] != 'Error'
checkpoint = checkpoint[valid_rows].reset_index(drop=True)

In [18]:
# Persist the scored pairs; force_ascii=False keeps non-ASCII characters
# unescaped in the output file.
checkpoint.to_json(r'./data/dataset_original_processed/tweets_dataviz_agreements.json',
               force_ascii=False)

# Community Detection

In [2]:
import pandas as pd
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
import os
from itertools import chain
import matplotlib.pyplot as plt

In [14]:
def plot_top10_communities_distribution(G, df):
    """Plot the size of the 10 largest communities for a range of resolutions.

    For every resolution from 2.0 to 5.8 in steps of 0.2, the greedy
    modularity communities of G are computed and each row of df is labelled
    with the community of its 'username' (SourceModularity) and of its
    'originalUsernamePost' (TargetModularity). The size of a community is the
    number of rows in which it appears as source plus the number in which it
    appears as target; the 10 largest are drawn as a bar chart.

    Parameters:
        G: networkx graph whose nodes are the user names.
        df: DataFrame with 'username' and 'originalUsernamePost' columns.

    Side effects:
        df is modified in place: 'SourceModularity' and 'TargetModularity'
        are overwritten at every resolution. One figure is shown per
        resolution.
    """
    for i in range(20, 60, 2):
        resolution = i / 10
        communities = greedy_modularity_communities(G, resolution=resolution)

        # Map every node to the (string) index of the community it belongs to.
        node_to_community = {
            node: str(index)
            for index, community in enumerate(communities)
            for node in community
        }

        df['SourceModularity'] = df["username"].map(node_to_community)
        df['TargetModularity'] = df["originalUsernamePost"].map(node_to_community)

        # Sum the complete counts before taking the top 10: adding two
        # already-truncated top-10 series yields NaN for every community that
        # is missing from either of them.
        sizes = df['TargetModularity'].value_counts().add(
            df['SourceModularity'].value_counts(), fill_value=0)
        sizes.sort_values(ascending=False).head(10).plot(kind='bar')
        plt.title(f'resolution {resolution}')
        plt.show()
            
            
def pick_best_resolution(G,df, resolution):
    """Label the rows of df with the communities found at one resolution.

    The greedy modularity communities of G are computed at the given
    resolution; every node is mapped to the index of its community (as a
    string). The mapping is applied to 'username' to fill 'SourceModularity'
    and to 'originalUsernamePost' to fill 'TargetModularity'.

    df is modified in place and also returned.
    """
    communities = greedy_modularity_communities(G, resolution=resolution)

    node_to_label = {}
    for label, members in enumerate(communities):
        for node in members:
            node_to_label[node] = str(label)

    column_pairs = (('SourceModularity', "username"),
                    ('TargetModularity', "originalUsernamePost"))
    for target_column, user_column in column_pairs:
        df[target_column] = df[user_column].map(node_to_label)

    return df

In [3]:
# Reload the dataset written at the end of the agreement-mapping step.
df = pd.read_json(r'./data/dataset_original_processed/tweets_dataviz_agreements.json')

In [4]:
# Directed interaction graph: one edge from 'username' to
# 'originalUsernamePost' for each row of df.
G = nx.from_pandas_edgelist(df, "username", 
                            "originalUsernamePost", 
                            create_using=nx.DiGraph)

In [None]:
# Visual inspection step: one bar chart per resolution, used to choose the
# resolution passed to pick_best_resolution below.
plot_top10_communities_distribution(G, df)

In [18]:
# Assign the community labels using the chosen resolution (2.6).
df = pick_best_resolution(G, df, 2.6)

In [19]:
# Persist the dataset enriched with the community labels.
df.to_json(r'./data/dataset_original_processed/tweets_dataviz_agreements_comms.json',
           force_ascii=False)

# Keywords mapping

In [1]:
import pandas as pd
import networkx as nx
import os
import matplotlib.pyplot as plt
import yake

In [None]:
def extract_kw_yake(x):
    """Return the first keyword extracted from x by the module-level extractor.

    Parameters:
        x: the text to analyse.

    Returns:
        The first element of the first entry returned by
        ``custom_kw_extractor.extract_keywords(x)``, or the marker string
        'KEYWORD_NOT_FOUND' when extraction fails or yields nothing.

    Raises:
        NameError: if ``custom_kw_extractor`` has not been created yet.
    """
    # Looked up outside the try on purpose: a missing extractor is a set-up
    # mistake and must not be reported as 'KEYWORD_NOT_FOUND' for every row.
    extractor = custom_kw_extractor

    try:
        kw_and_confidence = extractor.extract_keywords(x)
        return kw_and_confidence[0][0]
    except Exception:
        # Best effort: empty result (IndexError) or a text the extractor
        # cannot handle. KeyboardInterrupt is no longer swallowed.
        return 'KEYWORD_NOT_FOUND'
    
    
def rimuovi_prima_parola(df):
    """Strip the original poster's username from the start of a row's keyword.

    A further text processing step is removing all the keywords that are
    nothing but the username of the user that posted the original tweet.

    Parameters:
        df: a single row (anything indexable by column name) providing
            'originalUsernamePost' and 'TopKeyword'.

    Returns:
        The keyword with its words re-joined by single spaces, without the
        first word when that word equals the username. An empty or
        whitespace-only keyword yields ''.
    """
    username_to_remove = df['originalUsernamePost']
    words = df['TopKeyword'].split()

    # `words` may be empty; guard before looking at the first element.
    if words and words[0] == username_to_remove:
        words = words[1:]
    return " ".join(words)

In [2]:
# Reload the dataset written at the end of the community-detection step.
df = pd.read_json(r'./data/dataset_original_processed/tweets_dataviz_agreements_comms.json')

In [3]:
# Keyword extractor configuration: English, n-grams of at most 2 words,
# a single keyword returned per text.
language = 'en'
max_ngram_size = 2
numOfKeywords = 1
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size,
                                            top=numOfKeywords, features=None)

In [None]:
# Extract the keyword once per distinct text rather than once per row.
kw_of_unique_texts = pd.Series(df['text'].unique()).apply(extract_kw_yake)

In [15]:
# Lookup table text -> keyword; both sequences derive from df['text'].unique()
# and are therefore in the same order.
text_kw_mapper = dict(zip(df['text'].unique().tolist(), 
         kw_of_unique_texts.tolist()))

In [18]:
# Attach the keyword of its text to every row.
df['TopKeyword'] = df['text'].map(text_kw_mapper)

In [24]:
# Discard the rows for which keyword extraction returned the failure marker.
df = df[df['TopKeyword']!='KEYWORD_NOT_FOUND'].reset_index(drop=True)

In [37]:
# Row-wise: remove the original poster's username when it opens the keyword.
df['TopKeyword'] = df.apply(rimuovi_prima_parola,axis=1) 

In [49]:
# Rows whose keyword consisted only of the username are now left with an
# empty string: keep just the rows that still have a keyword.
has_keyword = df['TopKeyword'] != ''
df = df[has_keyword].reset_index(drop=True)

In [52]:
# Persist the dataset enriched with the top keyword of every tweet.
df.to_json(r'./data/dataset_original_processed/tweets_dataviz_agreements_comms_kw.json',
           force_ascii=False)

# Edge Betweenness

In [1]:
import networkx as nx
import pandas as pd

In [3]:
def calcola_edge_betweenness(df):
    """Add the edge betweenness of each community-to-community link to df.

    An undirected graph is built with one edge per distinct
    ('SourceModularity', 'TargetModularity') pair of df, its edge betweenness
    centrality is computed, and the value is merged back on those two
    columns as 'edge_bet'.

    Parameters:
        df: DataFrame with 'SourceModularity' and 'TargetModularity' columns.

    Returns:
        A new DataFrame: df plus the 'edge_bet' column.
    """
    key_columns = ['SourceModularity', 'TargetModularity']

    G = nx.from_pandas_edgelist(df, 'SourceModularity','TargetModularity')
    bet = nx.edge_betweenness_centrality(G)

    # The graph is undirected, so every edge is keyed once, as (u, v) in a
    # single orientation. Register the reversed pair too; otherwise the
    # merge below silently drops every row stored as (v, u).
    records = []
    for (u, v), value in bet.items():
        records.append((u, v, value))
        records.append((v, u, value))
    dfbet = pd.DataFrame(records, columns=key_columns + ['edge_bet'])
    # Self-loops (and NaN pairs) were appended twice: keep one copy so the
    # merge cannot duplicate rows.
    dfbet = dfbet.drop_duplicates(subset=key_columns)

    return pd.merge(df, dfbet, on=key_columns)

In [2]:
# Reload the dataset written at the end of the keyword-mapping step.
df = pd.read_json(r'./data/dataset_original_processed/tweets_dataviz_agreements_comms_kw.json')

In [4]:
# Attach the community-level edge betweenness ('edge_bet') to every row.
df = calcola_edge_betweenness(df)

In [9]:
# Persist the dataset enriched with the edge betweenness.
df.to_json(r'./data/dataset_original_processed/tweets_dataviz_agreements_comms_kw_bet.json',
           force_ascii=False)

# Dataset splitting in subsets by research keywords

In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
from itertools import chain

In [9]:
def plot_top10_communities_distribution(G, df):
    """Plot the size of the 10 largest communities for a range of resolutions.

    For every resolution from 1.2 to 4.0 in steps of 0.2, the greedy
    modularity communities of G are computed and each row of df is labelled
    with the community of its 'username' (SourceModularity) and of its
    'originalUsernamePost' (TargetModularity). The size of a community is the
    number of rows in which it appears as source plus the number in which it
    appears as target; the 10 largest are drawn as a bar chart.

    Side effects:
        df is modified in place: 'SourceModularity' and 'TargetModularity'
        are overwritten at every resolution. One figure is shown per
        resolution.
    """
    for i in range(12, 42, 2):
        resolution = i / 10
        communities = greedy_modularity_communities(G, resolution=resolution)

        # Map every node to the (string) index of the community it belongs to.
        node_to_community = {
            node: str(index)
            for index, community in enumerate(communities)
            for node in community
        }

        df['SourceModularity'] = df["username"].map(node_to_community)
        df['TargetModularity'] = df["originalUsernamePost"].map(node_to_community)

        # Sum the complete counts before taking the top 10: adding two
        # already-truncated top-10 series yields NaN for every community that
        # is missing from either of them.
        sizes = df['TargetModularity'].value_counts().add(
            df['SourceModularity'].value_counts(), fill_value=0)
        sizes.sort_values(ascending=False).head(10).plot(kind='bar')
        plt.title(f'resolution {resolution}')
        plt.show()

In [8]:
def pick_best_resolution(G,df, resolution):
    """Label the rows of df with the communities found at one resolution.

    The greedy modularity communities of G are computed at the given
    resolution; every node is mapped to the index of its community (as a
    string). The mapping fills 'SourceModularity' from 'username' and
    'TargetModularity' from 'originalUsernamePost'.

    df is modified in place and also returned.
    """
    communities = greedy_modularity_communities(G, resolution=resolution)

    node_to_label = {}
    for label, members in enumerate(communities):
        for node in members:
            node_to_label[node] = str(label)

    column_pairs = (('SourceModularity', "username"),
                    ('TargetModularity', "originalUsernamePost"))
    for target_column, user_column in column_pairs:
        df[target_column] = df[user_column].map(node_to_label)

    return df

In [4]:
def calcola_edge_betweenness(df):
    """Add the edge betweenness of each community-to-community link to df.

    An undirected graph is built with one edge per distinct
    ('SourceModularity', 'TargetModularity') pair of df, its edge betweenness
    centrality is computed, and the value is merged back on those two
    columns as 'edge_bet'.

    Returns:
        A new DataFrame: df plus the 'edge_bet' column.
    """
    key_columns = ['SourceModularity', 'TargetModularity']

    G = nx.from_pandas_edgelist(df, 'SourceModularity','TargetModularity')
    bet = nx.edge_betweenness_centrality(G)

    # The graph is undirected, so every edge is keyed once, as (u, v) in a
    # single orientation. Register the reversed pair too; otherwise the
    # merge below silently drops every row stored as (v, u).
    records = []
    for (u, v), value in bet.items():
        records.append((u, v, value))
        records.append((v, u, value))
    dfbet = pd.DataFrame(records, columns=key_columns + ['edge_bet'])
    # Self-loops (and NaN pairs) were appended twice: keep one copy so the
    # merge cannot duplicate rows.
    dfbet = dfbet.drop_duplicates(subset=key_columns)

    return pd.merge(df, dfbet, on=key_columns)

In [2]:
# Reload the dataset written at the end of the edge-betweenness step.
df = pd.read_json(
    r'./data/dataset_original_processed/tweets_dataviz_agreements_comms_kw_bet.json'
)

In [8]:
# Research keywords: one subset (and one output file) is built per entry.
# They are matched against the lower-cased tweet text, hence the lower case.
chosen_keywords_for_research = [
    'elections', 
    'fraud', 
    'democrats',
    'riggedelection', 
    'sharpiegate',
    'capitol hill', 
    'biden', 
    'trump', 
    'georgia']

In [None]:
# For every research keyword: select the tweets mentioning it, let the user
# choose a community resolution from the plots, recompute communities and
# edge betweenness on the subset, and save the subset to its own file.
for kw in chosen_keywords_for_research:
    # Literal (non-regex), case-insensitive substring match computed once;
    # rows with a missing text never match.
    matches = df['text'].str.lower().str.contains(kw, regex=False, na=False)
    print(f"SUBSET {kw}: {df[matches].shape}")
    if not matches.any():
        # No tweet mentions this keyword: there is no graph to analyse.
        continue

    # The whole-dataset 'edge_bet' is dropped because it is recomputed on the
    # subset below. The explicit copy lets the helpers add columns without
    # writing into a slice of df.
    df_subset = df.loc[matches].drop('edge_bet', axis=1).copy()

    G = nx.from_pandas_edgelist(df_subset, "username",
                                "originalUsernamePost",
                                create_using=nx.DiGraph)

    plot_top10_communities_distribution(G, df_subset)
    best_resol = float(input(f'Pick a resolution for {kw}: '))
    df_subset = pick_best_resolution(G, df_subset, best_resol)

    df_subset = calcola_edge_betweenness(df_subset)

    df_subset.to_json(rf'./data/datasets_splitted_by_research_kw/{kw}.json',
               force_ascii=False)

# Statistical Analyses on subsets

In [1]:
import pandas as pd
import os
from scipy import stats

In [2]:
class StatisticalAnalyzer:
    """Statistical tests relating communities, keywords, edge betweenness and agreement.

    The DataFrame given at construction is copied. The methods read the
    columns 'SourceModularity', 'TargetModularity', 'agreement' (1 / -1),
    'TopKeyword' and 'edge_bet'.

    Attributes:
        df: private copy of the input DataFrame.
        top10comms: labels of the (at most) 10 largest communities, largest
            first. The size of a community is the number of rows where it is
            the source plus the number of rows where it is the target.
    """

    def __init__(self, df):
        self.df = df.copy()
        # Sum the complete counts and only then keep the 10 largest. Adding
        # two already-truncated top-10 series produces NaN for communities
        # missing from either one and can leave up to 20 labels in the list.
        community_sizes = df['TargetModularity'].value_counts().add(
            df['SourceModularity'].value_counts(), fill_value=0)
        self.top10comms = (community_sizes.sort_values(ascending=False)
                           .head(10).index.tolist())

    @staticmethod
    def compute_chi2(column_x, column_y):
        """Computes the chi-square test statistic, p-value, degrees of freedom, and expected frequencies
        using the observed frequencies of occurrence of the events described by two columns of a DataFrame.
        Parameters:
            column_x (pandas.Series): A categorical pandas Series, which will be used as rows in the contingency table.
            column_y (pandas.Series): A categorical pandas Series, which will be used as columns in the contingency table."""
        return stats.chi2_contingency(pd.crosstab(column_x, column_y))

    def a1(self):
        """Returns a copy of the data with a new column, SameCommunitySourceTarget, which is 1 when
        'SourceModularity' and 'TargetModularity' are the same and 0 otherwise."""
        df = self.df.copy()
        df['SameCommunitySourceTarget'] = (df['SourceModularity'] == df['TargetModularity']).astype('int')
        return df

    def a2(self):
        """Returns a copy of the data in which, for the rows whose 'SourceModularity' and
        'TargetModularity' are the same, every community outside the top 10 is regrouped under the
        category 11 (in both columns). No row is removed.

        NOTE(review): 11 can also be the label of a real community -- confirm that it cannot clash."""
        df = self.df.copy()
        top10comms = self.top10comms

        # The mask is computed once, before either column is modified: after
        # the source has been set to 11 the two columns no longer compare
        # equal, so a recomputed mask would never regroup the target.
        same_community = df['SourceModularity'] == df['TargetModularity']
        other_source = same_community & ~df['SourceModularity'].isin(top10comms)
        other_target = same_community & ~df['TargetModularity'].isin(top10comms)

        df.loc[other_source, 'SourceModularity'] = 11
        df.loc[other_target, 'TargetModularity'] = 11
        return df

    def a3(self):
        """Works on the rows whose 'SourceModularity' and 'TargetModularity' differ: communities
        outside the top 10 are regrouped under the category 11, the rows whose two communities are
        still different are kept, and a new column 'InterCommunities' ("source-target") describes the
        interaction between the two communities."""
        df = self.df.copy()
        top10comms = self.top10comms

        different = df['SourceModularity'] != df['TargetModularity']
        other_source = different & ~df['SourceModularity'].isin(top10comms)
        other_target = different & ~df['TargetModularity'].isin(top10comms)

        df.loc[other_source, 'SourceModularity'] = 11
        df.loc[other_target, 'TargetModularity'] = 11

        # Rows in which both sides were regrouped under 11 disappear here too.
        df = df.loc[df['SourceModularity'] != df['TargetModularity']].reset_index(drop=True)
        df['InterCommunities'] = df['SourceModularity'].astype('str') + '-' + df['TargetModularity'].astype('str')
        return df

    def a4(self):
        """Selects the rows whose 'SourceModularity' is in the top 10 communities: first those whose
        'TargetModularity' is the same community, then those whose 'TargetModularity' differs. Only
        'SourceModularity', 'TargetModularity' and 'agreement' are kept, and a new column
        'InterCommunities' ("source-target") is added.

        NOTE(review): 'TargetModularity' is not checked against the top 10 -- confirm this is intended."""
        df = self.df.copy()
        top10comms = self.top10comms
        columns = ['SourceModularity', 'TargetModularity', 'agreement']

        source_in_top10 = df['SourceModularity'].isin(top10comms)
        same_community = df['SourceModularity'] == df['TargetModularity']

        df = pd.concat(
            [
                df.loc[same_community & source_in_top10][columns],
                df.loc[source_in_top10 & (df['SourceModularity'] != df['TargetModularity'])][columns],
            ]
        )
        df['InterCommunities'] = df['SourceModularity'].astype('str') + '-' + df['TargetModularity'].astype('str')
        return df

    def a5(self):
        """Computes the chi-square test between the most frequent 100 keywords and
        agreement/disagreement in the dataset.
        The rows are counted per ('TopKeyword', 'agreement'), the 100 keywords with the highest total
        count are kept and the chi-square test is run on their contingency table.
        The p-value of the test is returned."""
        df = self.df.copy()
        df = df.groupby(['TopKeyword', 'agreement']).size().unstack(fill_value=0).reset_index()
        df['sum'] = df[[-1, 1]].sum(axis=1)
        df = df.sort_values('sum', ascending=False).reset_index(drop=True).drop(['sum'], axis=1)
        df = df.head(100)
        df = df.set_index('TopKeyword')
        return stats.chi2_contingency(df)[1]

    def a6(self):
        """Computes the chi-square test between keywords and agreement/disagreement within each of
        the top 10 communities.
        For each community, only the rows where both 'SourceModularity' and 'TargetModularity' match
        the community are considered; they are counted per ('TopKeyword', 'agreement') and the test
        is run on the 10 most frequent keywords.

        Returns:
            dict mapping "Comunità i-i" to the p-value of community i, or to None when the test
            cannot be computed for that community."""
        df = self.df.copy()
        top10comms = self.top10comms

        # One dictionary for all the communities, returned to the caller.
        vals = {}
        for i in top10comms:
            kw = df.loc[(df['SourceModularity'] == i) &
                        (df['TargetModularity'] == i) &
                        (df['agreement'] != 'Error')].groupby(['TopKeyword', 'agreement']).size().unstack(fill_value=0)
            try:
                kw['sum'] = kw[[-1, 1]].sum(axis=1)
                kw = kw.sort_values('sum', ascending=False).head(10)
                kw = kw[[-1, 1]]
                vals[f"Comunità {i}-{i}"] = stats.chi2_contingency(kw)[1]
            except (KeyError, ValueError):
                # KeyError: one of the two agreement values never occurs;
                # ValueError: the contingency table cannot be tested.
                vals[f"Comunità {i}-{i}"] = None
        return vals

    def a7(self):
        """Computes the chi-square test between keywords connecting different communities and
        agreement/disagreement across the entire dataset.
        Only the rows where 'SourceModularity' and 'TargetModularity' are different are considered.
        A new column 'SKT' combines 'SourceModularity', 'TopKeyword' and 'TargetModularity'; the test
        is run on the cross-tabulation of 'SKT' and 'agreement'.
        The p-value of the test is returned, or None when the test cannot be computed."""
        df = self.df.copy()

        df = df.loc[(df['SourceModularity'] != df['TargetModularity']) & (df['agreement'] != 'Error')] \
        .assign(SKT=df['SourceModularity'].astype('str') + '-' + df['TopKeyword'] + '-' + df['TargetModularity'].astype('str'))

        kw = pd.crosstab(df['SKT'], df['agreement'])
        try:
            kw['somma'] = kw[[-1, 1]].sum(axis=1)
            kw = kw.sort_values('somma', ascending=False)[[-1, 1]]

            return stats.chi2_contingency(kw)[1]
        except (KeyError, ValueError):
            return None

    def a8(self):
        """Keeps the rows where 'SourceModularity' and 'TargetModularity' are different and both are
        in the top 10 communities, with a new column 'SKT' that combines 'SourceModularity',
        'TopKeyword' and 'TargetModularity'.
        The resulting DataFrame is returned."""
        df = self.df.copy()
        top10comms = self.top10comms

        df = df.loc[(df['SourceModularity'] != df['TargetModularity']) & (df['agreement'] != 'Error')] \
        .assign(SKT=df['SourceModularity'].astype('str') + '-' + df['TopKeyword'] + '-' + df['TargetModularity'].astype('str'))

        df = df.loc[(df['SourceModularity'].isin(top10comms)) &
                    (df['TargetModularity'].isin(top10comms)) &
                    (df['SourceModularity'] != df['TargetModularity'])]
        return df

    def a9(self):
        """Same selection as a8, restricted to the columns 'SKT' and 'agreement'.
        For each top 10 community, the first 10 rows whose 'SKT' starts with the community's label
        are taken, and the results are concatenated."""
        df = self.df.copy()
        top10comms = self.top10comms

        df = df.loc[(df['SourceModularity'] != df['TargetModularity']) & (df['agreement'] != 'Error')] \
        .assign(SKT=df['SourceModularity'].astype('str') + '-' + df['TopKeyword'] + '-' + df['TargetModularity'].astype('str'))

        df = df.loc[(df['SourceModularity'].isin(top10comms)) &
                    (df['TargetModularity'].isin(top10comms)) &
                    (df['SourceModularity'] != df['TargetModularity'])][['SKT', 'agreement']]

        dfs = []
        for comm in top10comms:
            dfs.append(df[df.SKT.str.startswith(f"{comm}-")].head(10))
        dfs = pd.concat(dfs)

        return dfs

    def a10(self):
        """Considers the rows where 'SourceModularity' is in the top 10 communities and
        'TargetModularity' is not, with a column 'SKT' that combines 'SourceModularity',
        'TopKeyword' and 'TargetModularity'.
        Returns the crosstab of 'SKT' and 'agreement', sorted by the total of agreement and
        disagreement counts and without all-zero rows, or None when one of the two agreement values
        never occurs in the selection."""
        df = self.df.copy()
        top10comms = self.top10comms

        df['SKT'] = df['SourceModularity'].astype('str') + '-' + df['TopKeyword'] + '-' + df['TargetModularity'].astype('str')

        selected = df.loc[(df['SourceModularity'].isin(top10comms)) & (~df['TargetModularity'].isin(top10comms))]
        df = pd.crosstab(selected['SKT'], selected['agreement'])

        try:
            df['somma'] = df[[-1, 1]].sum(axis=1)
            df = df.loc[(df != 0).any(axis=1)].sort_values('somma', ascending=False).drop('somma', axis=1)
            return df
        except KeyError:
            return None

    def a11(self):
        """For each community in the top 10 communities, selects the rows where both
        'SourceModularity' and 'TargetModularity' are equal to the current community, separates the
        'edge_bet' values of agreement (1) and disagreement (-1) rows, runs the Kruskal-Wallis H-test
        on the two groups and prints the community label along with the test result."""
        df = self.df.copy()
        top10comms = self.top10comms
        for i in top10comms:
            temp = df.loc[(df['SourceModularity'] == i) & (df['TargetModularity'] == i)]
            edge_bet_agreement = temp.loc[temp['agreement'] == 1]['edge_bet']
            edge_bet_disagreement = temp.loc[temp['agreement'] == -1]['edge_bet']
            print(i, stats.kruskal(edge_bet_agreement.values, edge_bet_disagreement.values))

    def a12(self):
        """Selects the rows where 'SourceModularity' is in the top 10 communities and
        'TargetModularity' is not, separates the 'edge_bet' values of agreement (1) and
        disagreement (-1) rows, and returns the result of the Kruskal-Wallis H-test on the two
        groups."""
        df = self.df.copy()
        top10comms = self.top10comms
        df = df.loc[(df['SourceModularity'].isin(top10comms)) & (~df['TargetModularity'].isin(top10comms))]
        edge_bet_agreement = df.loc[df['agreement'] == 1]['edge_bet']
        edge_bet_disagreement = df.loc[df['agreement'] == -1]['edge_bet']
        # Both groups are compared as they are: casting one of them to int
        # would truncate the betweenness values before the test.
        return stats.kruskal(edge_bet_agreement.values, edge_bet_disagreement.values)

    def a13(self):
        """Sets 'SourceModularity' and 'TargetModularity' to 999 wherever the value is not in the
        top 10 communities, separates the 'edge_bet' values of agreement (1) and disagreement (-1)
        rows, and returns the result of the Kruskal-Wallis H-test on the two groups.

        The relabelling does not influence the test, which only reads 'agreement' and 'edge_bet' of
        all the rows."""
        df = self.df.copy()
        top10comms = self.top10comms
        df.loc[~df['SourceModularity'].isin(top10comms), 'SourceModularity'] = 999
        df.loc[~df['TargetModularity'].isin(top10comms), 'TargetModularity'] = 999
        edge_bet_agreement = df.loc[df['agreement'] == 1]['edge_bet']
        edge_bet_disagreement = df.loc[df['agreement'] == -1]['edge_bet']
        # Both groups are compared as they are: casting one of them to int
        # would truncate the betweenness values before the test.
        return stats.kruskal(edge_bet_agreement.values, edge_bet_disagreement.values)

    def a14(self):
        """Plots the quantile function of 'edge_bet' values for agreement and disagreement cases.
        The 'edge_bet' values are separated according to 'agreement' (1 or -1), 500 evenly spaced
        percentiles between 0 and 100 are computed for both groups, and the last 100 of them
        (roughly the 80th to 100th percentiles) are plotted on a logarithmic scale."""
        # Imported here so that the method does not depend on the notebook
        # session having imported them already.
        import matplotlib.pyplot as plt
        import numpy as np

        df = self.df.copy()

        edges_agreement = df.loc[df.agreement == 1].edge_bet.values
        edges_disagreement = df.loc[df.agreement == -1].edge_bet.values

        percs = np.linspace(0, 100, 500)

        qn_edges_agreement = np.percentile(edges_agreement, percs)
        qn_edges_disagreement = np.percentile(edges_disagreement, percs)

        plt.plot(percs[400:], qn_edges_agreement[400:])
        plt.plot(percs[400:], qn_edges_disagreement[400:])
        plt.yscale('log')

In [3]:
# Run every analysis on each keyword subset and collect one summary row of
# p-values per file.
dfs = []

PATH = './data/datasets_splitted_by_research_kw'
# Sorted so that the summary rows come out in a reproducible order.
for file in sorted(os.listdir(PATH)):
    if not file.endswith('.json'):
        continue

    print(f"{PATH}/{file}")
    df_subset = pd.read_json(f"{PATH}/{file}")
    sa = StatisticalAnalyzer(df_subset)

    # Every aN() call copies and re-filters the whole subset, so each result
    # is computed once and reused.
    a1, a2, a3, a4 = sa.a1(), sa.a2(), sa.a3(), sa.a4()
    _, pvalue1, _, _ = sa.compute_chi2(a1['SameCommunitySourceTarget'], a1['agreement'])
    _, pvalue2, _, _ = sa.compute_chi2(a2['SourceModularity'], a2['agreement'])
    _, pvalue3, _, _ = sa.compute_chi2(a3['InterCommunities'], a3['agreement'])
    _, pvalue4, _, _ = sa.compute_chi2(a4['InterCommunities'], a4['agreement'])

    # A4.1: rows inside a single community; A4.2: rows across communities.
    same_community = a4['SourceModularity'] == a4['TargetModularity']
    _, pvalue4_1, _, _ = sa.compute_chi2(
        a4.loc[same_community, 'SourceModularity'],
        a4.loc[same_community, 'agreement']
    )
    _, pvalue4_2, _, _ = sa.compute_chi2(
        a4.loc[~same_community, 'SourceModularity'],
        a4.loc[~same_community, 'agreement']
    )

    # A4.3: for each top community, inside-vs-outside interaction against
    # agreement.
    pvalues4_3 = []
    for comm in sa.top10comms:
        # Copy so that the new column is not written into a slice of df_subset.
        temp = df_subset.loc[df_subset['SourceModularity'] == comm,
                             ['SourceModularity', 'TargetModularity', 'agreement']].copy()

        temp['SameCommunitySourceTarget'] = (temp['SourceModularity'] == temp['TargetModularity']).astype('int')

        crosstab = pd.crosstab(temp['SameCommunitySourceTarget'], temp['agreement'])

        _, pvalue4_3, _, _ = stats.chi2_contingency(crosstab)
        pvalues4_3.append(pvalue4_3)

    pvalue5 = sa.a5()
    pvalue6 = sa.a6()
    pvalue7 = sa.a7()

    a8 = sa.a8()
    _, pvalue8, _, _ = sa.compute_chi2(a8['SKT'], a8['agreement'])
    a9 = sa.a9()
    _, pvalue9, _, _ = sa.compute_chi2(a9['SKT'], a9['agreement'])

    # a10() returns None when its table cannot be built; report a missing
    # p-value instead of failing inside chi2_contingency.
    a10 = sa.a10()
    pvalue10 = stats.chi2_contingency(a10)[1] if a10 is not None else None

    pvalue12 = sa.a12()
    pvalue13 = sa.a13()

    page = pd.DataFrame({'A1': pvalue1,
                 'A2':pvalue2,
                 'A3': pvalue3,
                 'A4': pvalue4,
                 'A4.1':pvalue4_1,
                 'A4.2': pvalue4_2,
                 'A4.3': [pvalues4_3],
                 'A5': pvalue5,
                 # Wrapped in a list so that a non-scalar result occupies a
                 # single cell instead of being aligned on the row index.
                 'A6': [pvalue6],
                 'A7': pvalue7,
                 'A8': pvalue8,
                 'A9': pvalue9,
                 'A10': pvalue10,
                 'A12': pvalue12.pvalue,
                 'A13': pvalue13.pvalue},
                       index=[file])
    dfs.append(page)

./data/datasets_splitted_by_research_kw/biden.json
./data/datasets_splitted_by_research_kw/capitol hill.json
./data/datasets_splitted_by_research_kw/democrats.json
./data/datasets_splitted_by_research_kw/elections.json
./data/datasets_splitted_by_research_kw/fraud.json
./data/datasets_splitted_by_research_kw/georgia.json
./data/datasets_splitted_by_research_kw/trump.json


In [5]:
# Stack the per-file summary rows and export them to a single Excel sheet.
pd.concat(dfs).to_excel('summary_statistical_analyses.xlsx')