This code gets a list of users in categories and checks for joint accounts based on PMI calculations

Expected Data Format:

| user1 | screen_name | user2 | category |

In [1]:
import pandas as pd
import os
import ast
import swifter
import math

# Load files with the list of accounts

In [2]:
path = "/Users/nlotan/Dropbox/לימודים/University/Research Data Backup/For Sagi"
original_users = pd.DataFrame()

for root, dirs, files  in os.walk(path):
    for name in files:
        if name.endswith(".csv"):
            st_name = os.path.join(root, name)
            filepath = st_name
            category =  (st_name.split("/")[-2])
            new_df = pd.read_csv(filepath)
            new_df['category'] = category
            original_users = pd.concat([original_users,new_df], ignore_index=True)
            
            
original_users.drop_duplicates(subset=['user1'], keep='first', inplace=True)
original_users['user2'] = original_users['user2'].swifter.apply (lambda x: ast.literal_eval(x))

Pandas Apply: 100%|██████████| 30244/30244 [01:35<00:00, 315.67it/s]
Backing off send_request(...) for 0.7s (requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='api.segment.io', port=443): Max retries exceeded with url: /v1/batch (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x12685a8b0>, 'Connection to api.segment.io timed out. (connect timeout=15)')))


# Downsample the random category

In [3]:
def downsample_category(df, category_index, category_name, proportion):
    df_non_category = df[df[category_index] != category_name]
    
    under_sample_criteria = df[category_index]==category_name
    downsampled_df = df.sample(frac=proportion, 
                               axis=0, 
                               weights=under_sample_criteria, 
                               random_state=42)
    
    downsampled_df = pd.concat([downsampled_df,df_non_category],sort=False)
    return downsampled_df

In [4]:
original_users = original_users.dropna()
original_users = downsample_category(original_users,"category","random_users",0.25)

In [5]:
original_users.category.value_counts()

random_users       7561
uncivil_users      7163
political_users    3200
Name: category, dtype: int64

# Choose subclasses now


In [6]:
original_users.category.unique()

array(['random_users', 'uncivil_users', 'political_users'], dtype=object)

In [12]:
categories_for_now = ['uncivil_users', 'political_users']

In [13]:
original_users = original_users[original_users['category'].isin(categories_for_now)]

 # Explode the dataframe into a list of rows, one per friend

In [15]:
#create one line per user1 + user2 combination for easy counting/grouping
all_users = original_users.explode("user2")

# Load poplar users and details

In [16]:
popular_df = pd.read_pickle("/Users/nlotan/code/university/SocialVec/auxiliary/users_with_over_200_DETAILS.pkl")
popular_users_list = popular_df.dropna().user_id.astype(int).tolist()
all_users_popular = all_users[all_users['user2'].isin(popular_users_list)]

In [17]:
## Count the number of occurrences of user2 in each category
user2_count_by_category = all_users_popular.groupby(['category','user2']).count().reset_index()
user2_count_by_category['user2']=user2_count_by_category['user2'].astype(int)
user2_count_by_category.rename(columns={0:'count'}, inplace=True)
user2_count_all = user2_count_by_category.groupby(['user2']).sum().reset_index()

In [18]:
# Mutual information Matrix
#ref: https://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html

# c - class
# a - account 

# for each class:
#          | c=1   | c=0     |
# --------+--------+---------+
#  a=1    |  n11  |    n10   |
# --------+-------+----------+
#  a=0    |  n01  |  n00     |
# --------+-------+----------+

In [19]:
#number of users in this report
all_user1_count = original_users['user1'].nunique()
list_of_all_followees = user2_count_all['user2'].unique()
all_categories = user2_count_by_category['category'].unique()

In [20]:
all_user2_count = user2_count_all['user2'].nunique()

To compute the PMI between a class (uncivil) and each account followed (e.g., Obama):

Pr(Obama, uncivil) = % of users that are uncivil who follow Obama
Pr(uncivil) = % of users in the dataset that are uncivil
Pr(Obama) = % of users in the dataset (regardless of their label) who follow Obama

PMI(Obama,uncivil) = log_2 ( Pr(O,U) / (Pr(O)*Pr(U)) )

This should be computed for each of the accounts followed in the dataset X class. Then, we would like to see the accounts with the highest PMI scores per class.


In [None]:
import math
from tqdm import tqdm
tqdm.pandas()

# Optimizations - pre calculate all you need


In [23]:

pr_class_dict = {}

for current_category in all_categories:
    this_class_count        = original_users[original_users['category']==current_category]['user1'].nunique()
    pr_class_dict[current_category] = this_class_count        / all_user1_count
    
class_count_dict = {}

for current_category in all_categories:
    class_count_dict[current_category]        = original_users[original_users['category']==current_category]['user1'].nunique()

In [24]:

user2_count_all['pr_account'] = user2_count_all['user2'].swifter.apply(lambda x: user2_count_all[user2_count_all['user2']==x]['user1'].values[0] / all_user1_count)


Dask Apply: 100%|██████████| 32/32 [01:43<00:00,  3.22s/it]


In [25]:
# optimized pmi_calc

PMI_res = pd.DataFrame()

def calc_pmi( curr_row):
    
    global PMI_res
    global global_class
    
    user2_ = curr_row.user2
    current_category = global_class
        
    # total number of users2 in this category
    this_class_count        = class_count_dict[current_category]
    
    # Pr(Obama) = % of users in the dataset (regardless of their label) who follow Obama
    pr_account = curr_row.pr_account #user2_count_all[user2_count_all[user2_count_all['user2']==user2_]]['pr_account'].value

    #Pr(uncivil) = % of users in the dataset that are uncivil
    pr_class         = pr_class_dict[current_category]

    #Pr(Obama, uncivil) = % of users that are uncivil who follow Obama
    #Pr(Obama, uncivil) = % of users that are uncivil who follow Obama
    this_user_in_class_count = user2_count_by_category[user2_count_by_category['user2']==user2_]['user1'].values[0]
    pr_account_class =  this_user_in_class_count / this_class_count
    
    # PMI(Obama,uncivil) = log_2 ( Pr(O,U) / (Pr(O)*Pr(U)) )
    pmi_score = math.log2( pr_account_class / (pr_account * pr_class ) )

    PMI_res = PMI_res.append({'user2':int(user2_), 'category':current_category, 'pmi':pmi_score, 'count_in_class':this_user_in_class_count}, ignore_index=True)


for class_ in all_users_popular.category.unique():
    global_class = class_
    user2_count_all.swifter.apply(calc_pmi, axis=1)

Dask Apply: 100%|██████████| 32/32 [06:33<00:00, 12.30s/it]  
Dask Apply: 100%|██████████| 32/32 [07:02<00:00, 13.20s/it]  


In [26]:
PMI_res.to_csv('pmi_political_vs_uncivil.csv')

In [None]:
PMI_res.sort_values(by=['pmi'], ascending=False)

In [None]:
# works

In [None]:
PMI_res = pd.DataFrame()

def calc_pmi(user2_):
    global PMI_res
    for current_category in all_categories:
        # total number of users2 in this category
        this_class_count        = original_users[original_users['category']==current_category]['user1'].nunique()
        
        # Pr(Obama) = % of users in the dataset (regardless of their label) who follow Obama
        pr_account = user2_count_all[user2_count_all['user2']==user2_]['user1'].values[0] / all_user1_count

        #Pr(uncivil) = % of users in the dataset that are uncivil
        pr_class         = this_class_count        / all_user1_count

        #Pr(Obama, uncivil) = % of users that are uncivil who follow Obama
        pr_account_class = user2_count_by_category[user2_count_by_category['user2']==user2_]['user1'].values[0] / this_class_count

        # PMI(Obama,uncivil) = log_2 ( Pr(O,U) / (Pr(O)*Pr(U)) )
        pmi_score = math.log2( pr_account_class / (pr_account * pr_class ) )

        PMI_res = PMI_res.append({'user2':int(user2_), 'category':current_category, 'pmi':pmi_score}, ignore_index=True)

user2_count_all['user2'].progress_apply(calc_pmi)

In [None]:
PMI_res.user2 = PMI_res.user2.astype(int)
mdf = pd.merge( PMI_res, popular_df, left_on="user2", right_on="user_id", how="left")

In [None]:
popular_df.dropna(inplace=True)
popular_df.user_id = popular_df.user_id.astype(int)

In [None]:
#mdf[(~mdf['user_id'].isna())&(mdf['category']=='political_users')]
mdf[mdf['count_in_class']>20].sort_values(by=['pmi'], ascending=False)

In [None]:
original_users['user1'] = original_users['user1'].astype(int)

In [None]:
original_users[['user1','screen_name']].sort_values(by='user1')

In [None]:
    import pandas as pd
    
    popular_df = pd.read_pickle(
        "/Users/nlotan/code/university/SocialVec/auxiliary/users_with_over_200_DETAILS.pkl")

In [None]:
def make_clickable(username):
    # target _blank to open new window
    # extract clickable text to display for your link
    
    link = f"https://twitter.com/{username}"
    return f'<a target="_blank" href="{link}">Twitter link</a>'


In [None]:
popular_df['link'] = popular_df['screen_name'].apply(make_clickable)

In [None]:
popular_df