In [1]:
import numpy as np
import pandas as pd
import csv

from matplotlib import pyplot as plt
import matplotlib.lines as lines
%matplotlib inline

In [2]:
'''
Useful Functions
'''
def convert_to_numeric(var):
    multiplier_dict = {'k': 10**3, 'm': 10**6, 'b': 10**9}
    if isinstance(var, (int, float)):
        # var is already a number
        return var
    else:
        # try to convert var to a number
        try:
            # Check if var can be directly converted to a float
            return float(var)
        except ValueError:
            # var can't be directly converted to a float, so it might be in the format '123k', '22.5M', etc.
            # Check if the last character of var is in the multiplier_dict
            if var[-1].lower() in multiplier_dict:
                try:
                    # Try to convert the part of var before the last character to a float
                    number = float(var[:-1])
                    # Multiply the number by the appropriate multiplier
                    return number * multiplier_dict[var[-1].lower()]
                except ValueError:
                    # The part of var before the last character can't be converted to a float
                    return 0
            else:
                # The last character of var is not in the multiplier_dict
                return 0
        


In [3]:
# Read the file of researchers
df_r = pd.read_csv('fairweb1_r.csv')
df_r.head()

Unnamed: 0,_id,topicid,docid,usrid,comments,noentity,document_id,researcher_name1,bio_url1,gender1,...,bio_url2,gender2,google_scholar_url2,h_index2,researcher_name3,bio_url3,gender3,google_scholar_url3,h_index3,error
0,647d8838a82464738222f3e8,sort_R002,1,user01,F. R. Robertson,NO.REL.ENTITY,dba92aea-b2b0-4bf4-8ffa-ac610cf37b2e,,,,...,,,,,,,,,,
1,647d887da82464738222f3ee,sort_R002,2,user01,A. W. Robertson,NO.REL.ENTITY,0bd9fb3d-795d-48e6-8bbd-717a64de695d,,,,...,,,,,,,,,,
2,647d8894a82464738222f3f2,sort_R002,3,user01,A. W. Robertson,NO.REL.ENTITY,a64a6642-f733-421f-887d-d6f6239bf94a,,,,...,,,,,,,,,,
3,647d88a3a82464738222f3f6,sort_R002,4,user01,A. W. Robertson,NO.REL.ENTITY,b29bebed-ba72-4bcd-9e6e-6669712c827f,,,,...,,,,,,,,,,
4,647d88b3a82464738222f3fa,sort_R002,5,user01,Dave Robertson,NO.REL.ENTITY,0772aa1a-f6cc-4752-8867-e9b9b0b44f6f,,,,...,,,,,,,,,,


In [4]:
# removed
#df_t = pd.read_csv('fw1pilot_t.csv')
#df_t.head()

In [5]:
# Read the file of youtube videos
df_y = pd.read_csv('fairweb1_y.csv')
df_y.head()

Unnamed: 0,_id,topicid,docid,usrid,comments,noentity,document_id,youtube_url1,content_title1,num_subscribers1,youtube_url2,content_title2,num_subscribers2,youtube_url3,content_title3,num_subscribers3,error
0,647ea5cfa82464738222f4af,sort_Y004,2,user01,"""collection of links""",NO.REL.ENTITY,949129b0-74f8-4b95-a883-b4a019f9009a,,,,,,,,,,
1,647ea62ba82464738222f4b3,sort_Y004,3,user01,,,ad9febc5-d71a-4c1c-9ed2-b96351acc1a6,https://youtu.be/4Mrfh6dZOFc,THE MEG International Trailer 2 (2018) Jason S...,2650000.0,,,,,,,
2,647ea659a82464738222f4b7,sort_Y004,4,user01,"""shark night 3d"" in similar posts",NO.REL.ENTITY,be2aa2fa-352f-4e5f-9957-64750480e274,,,,,,,,,,
3,647ea671a82464738222f4bb,sort_Y004,5,user01,"""shark night 3d"" in similar posts",NO.REL.ENTITY,a57c36f4-dfca-4cf4-9a69-d5efd447ceb2,,,,,,,,,,
4,647ea748a82464738222f4bf,sort_Y004,6,user01,NOT a shark movie,NO.REL.ENTITY,4992645f-af66-4217-85a4-7f32ca62a008,,,,,,,,,,


In [6]:
# Read the file of movies
df_m = pd.read_csv('fairweb1_m.csv')
df_m.head()

Unnamed: 0,_id,topicid,docid,usrid,comments,noentity,document_id,error,movie_name1,imdb_url1,num_ratings1,country1,movie_name2,imdb_url2,num_ratings2,country2,movie_name3,imdb_url3,num_ratings3,country3
0,647ee2fda82464738222f5ed,sort_M004,3,user01,"""looks like spam""",NO.REL.ENTITY,740d978c-5b22-4320-b835-2c122cd3ad34,,,,,,,,,,,,,
1,647ee349a82464738222f609,sort_M004,6,user01,"""looks like spam""",NO.REL.ENTITY,ca5a2c2a-afaf-4b0d-b998-f8ca42aafc23,,,,,,,,,,,,,
2,647ee363a82464738222f60d,sort_M004,7,user01,,,1c3b0c88-3c5d-426b-a2c1-363abd4f1524,NOT.SHOWN.PROPERLY,,,,,,,,,,,,
3,647ee3b2a82464738222f615,sort_M004,9,user01,"""looks like spam""",NO.REL.ENTITY,d9ee0c3a-93bf-4c5a-b2f5-dfbe34bc85ae,,,,,,,,,,,,,
4,647ee519a82464738222f64c,sort_M004,19,user01,,,753c41ed-aa34-4cae-8e6f-1bfbaebec6d1,NOT.SHOWN.PROPERLY,,,,,,,,,,,,


In [7]:
doc_rel_map = {
    'topicid':[],
    'document_id':[],
    'relevance':[]
}

In [8]:
def get_doc_rel(df):
    entity_map = {}
    topic_id = df['topicid'].values[0]
    doc_id = df['document_id'].values[0]
    for idx, rec in df.iterrows():
        #print(rec['noentity'])
        if not pd.isna(rec['noentity']):
            #print(idx)
            continue
            
        user = rec['usrid']
        if 'google_scholar_url1' in df.columns:
            e1 = rec['google_scholar_url1']
            e2 = rec['google_scholar_url2']
            e3 = rec['google_scholar_url3']
        #elif 'twitter_url1' in df.columns:
        #    e1 = rec['twitter_url1']
        #    e2 = rec['twitter_url2']
        #    e3 = rec['twitter_url3']
        elif 'youtube_url1' in df.columns:
            e1 = rec['youtube_url1']
            e2 = rec['youtube_url2']
            e3 = rec['youtube_url3']
        elif 'imdb_url1' in df.columns:
            e1 = rec['imdb_url1']
            e2 = rec['imdb_url2']
            e3 = rec['imdb_url3']
        
        if not pd.isna(e1):
            if e1 not in entity_map:
                entity_map[e1] = [user]
            else:
                entity_map[e1].append(user)
        if not pd.isna(e2):
            if e2 not in entity_map:
                entity_map[e2] = [user]
            else:
                entity_map[e2].append(user)
        if not pd.isna(e3):
            if e3 not in entity_map:
                entity_map[e3] = [user]
            else:
                entity_map[e3].append(user)
    rel = 0
    #print(scholar_map)
    for entity, usr_list in entity_map.items():
        if len(usr_list) > rel:
            rel = len(usr_list)
    doc_rel_map['topicid'].append(topic_id)
    doc_rel_map['document_id'].append(doc_id)
    doc_rel_map['relevance'].append('L'+str(rel))
    return df

In [9]:
df_r.groupby(['topicid','document_id'],as_index=False).apply(get_doc_rel)
df_m.groupby(['topicid','document_id'],as_index=False).apply(get_doc_rel)
# removed
#df_t.groupby(['topicid','document_id']).apply(get_doc_rel)
df_y.groupby(['topicid','document_id'],as_index=False).apply(get_doc_rel)

Unnamed: 0,_id,topicid,docid,usrid,comments,noentity,document_id,youtube_url1,content_title1,num_subscribers1,youtube_url2,content_title2,num_subscribers2,youtube_url3,content_title3,num_subscribers3,error
0,647ea5cfa82464738222f4af,sort_Y004,2,user01,"""collection of links""",NO.REL.ENTITY,949129b0-74f8-4b95-a883-b4a019f9009a,,,,,,,,,,
1,647ea62ba82464738222f4b3,sort_Y004,3,user01,,,ad9febc5-d71a-4c1c-9ed2-b96351acc1a6,https://youtu.be/4Mrfh6dZOFc,THE MEG International Trailer 2 (2018) Jason S...,2650000,,,,,,,
2,647ea659a82464738222f4b7,sort_Y004,4,user01,"""shark night 3d"" in similar posts",NO.REL.ENTITY,be2aa2fa-352f-4e5f-9957-64750480e274,,,,,,,,,,
3,647ea671a82464738222f4bb,sort_Y004,5,user01,"""shark night 3d"" in similar posts",NO.REL.ENTITY,a57c36f4-dfca-4cf4-9a69-d5efd447ceb2,,,,,,,,,,
4,647ea748a82464738222f4bf,sort_Y004,6,user01,NOT a shark movie,NO.REL.ENTITY,4992645f-af66-4217-85a4-7f32ca62a008,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3344,649ac223a824647382235e8f,sort_Y015,229,user05,,NO.REL.ENTITY,67a6e823-bdf6-4e74-b6b5-4ced3bb47f78,,,,,,,,,,
3345,649ac228a824647382235e93,sort_Y015,230,user05,,NO.REL.ENTITY,62a69e65-42f6-4aaa-aa46-00fa7ba00ec2,,,,,,,,,,
3346,649ac230a824647382235e97,sort_Y015,231,user05,,NO.REL.ENTITY,486106e4-6968-4d8b-8ece-119167a92ea6,,,,,,,,,,
3347,649ac236a824647382235e9b,sort_Y015,232,user05,,NO.REL.ENTITY,44b62424-8c3a-44c5-bf11-c3368e1650e9,,,,,,,,,,


In [10]:
res= pd.DataFrame(doc_rel_map) 
res

Unnamed: 0,topicid,document_id,relevance
0,sort_R001,00b0d6a8-a3e0-4415-80bc-94fdfe810814,L0
1,sort_R001,00ccbba5-3125-486f-bc7a-16a4598c7630,L0
2,sort_R001,00d890f1-5130-441a-97c9-7bc8da785e77,L0
3,sort_R001,0195dd59-1c8c-47ae-ba51-ea1bc512817a,L0
4,sort_R001,0224f27d-9eae-4a6b-9916-a1a30ea7a92a,L0
...,...,...,...
9209,sort_Y015,fc05569a-6afc-4263-89c8-93430603d98c,L0
9210,sort_Y015,fc36eef2-a2d2-4953-8d84-6726684bc94c,L0
9211,sort_Y015,fc9b38c7-83dd-4c1e-93d5-61197cb79cda,L0
9212,sort_Y015,fcde87ba-4170-42e6-8f60-8ea0d62b2ba1,L0


In [11]:
res.to_csv('fairweb1.qrels', sep=' ', index=False)

In [12]:
prob_gender_map = {
    'topicid':[],
    'document_id':[],
    'prob_he':[],
    'prob_she':[],
    'prob_oth':[]
}

prob_h_map = {
    'topicid':[],
    'document_id':[],
    'prob_g1':[],
    'prob_g2':[],
    'prob_g3':[],
    'prob_g4':[],
}

def get_prob_r(df):
    #print(df.columns)
    if 'topicid' not in df.columns or 'document_id' not in df.columns:
        print(df["_id"].values[0])
        return
    entity_map = {}
    topic_id = df['topicid'].values[0]
    doc_id = df['document_id'].values[0]
    for idx, rec in df.iterrows():
        if not pd.isna(rec['noentity']):
            continue

        user = rec['usrid']
        e1 = rec['google_scholar_url1']
        e2 = rec['google_scholar_url2']
        e3 = rec['google_scholar_url3']
        if not pd.isna(e1):
            gender_e1 = rec['gender1']
            h_e1 = rec['h_index1']
            if e1 not in entity_map:
                entity_map[e1] = {
                    'gen': gender_e1,
                    'h': h_e1
                }
            #else:
            #    pass
        if not pd.isna(e2):
            gender_e2 = rec['gender2']
            h_e2 = rec['h_index2']
            if e2 not in entity_map:
                entity_map[e2] = {
                    'gen': gender_e2,
                    'h': h_e2
                }
        if not pd.isna(e3):
            gender_e3 = rec['gender3']
            h_e3 = rec['h_index3']
            if e3 not in entity_map:
                entity_map[e3] = {
                    'gen': gender_e3,
                    'h': h_e3
                }

    if not entity_map:
        # If there is no entity,
        # output a uniform distribution
        #print(doc_id)
        prob_he = 1/3
        prob_she = 1/3
        prob_oth = 1/3
        prob_h1 = 1/4
        prob_h2 = 1/4
        prob_h3 = 1/4
        prob_h4 = 1/4
    else:
        prob_he = 0
        prob_she = 0
        prob_oth = 0
        prob_h1 = 0
        prob_h2 = 0
        prob_h3 = 0
        prob_h4 = 0
        cnt = 0
        for url, info in entity_map.items():
            if info['gen'] == 'he':
                prob_he += 1
            elif info['gen'] == 'she':
                prob_she += 1
            else:
                prob_oth += 1
            h_idx = convert_to_numeric(info['h'])
            if h_idx < 10:
                prob_h1 += 1
            elif h_idx < 30:
                prob_h2 += 1
            elif h_idx < 50:
                prob_h3 += 1
            else:
                prob_h4 += 1
            cnt += 1
            
        prob_he /= cnt
        prob_she /= cnt
        prob_oth /= cnt
        prob_h1 /= cnt
        prob_h2 /= cnt
        prob_h3 /= cnt
        prob_h4 /= cnt
            
    prob_gender_map['topicid'].append(topic_id)
    prob_gender_map['document_id'].append(doc_id)
    prob_gender_map['prob_he'].append(prob_he)
    prob_gender_map['prob_she'].append(prob_she)
    prob_gender_map['prob_oth'].append(prob_oth)

    prob_h_map['topicid'].append(topic_id)
    prob_h_map['document_id'].append(doc_id)
    prob_h_map['prob_g1'].append(prob_h1)
    prob_h_map['prob_g2'].append(prob_h2)
    prob_h_map['prob_g3'].append(prob_h3)
    prob_h_map['prob_g4'].append(prob_h4)
    return df

In [13]:
df_r.groupby(['topicid','document_id'],as_index=False).apply(get_prob_r)

res= pd.DataFrame(prob_gender_map) 
res.to_csv('fairweb1.R-GENDER.gmemb', sep=' ', index=False,)
res= pd.DataFrame(prob_h_map) 
res.to_csv('fairweb1.R-HINDEX.gmemb', sep=' ', index=False,)

In [14]:
CONST_10K = convert_to_numeric('10k')
CONST_1M = convert_to_numeric('1m')

def get_prob_rvw(df):
    entity_map = {}
    topic_id = df['topicid'].values[0]
    doc_id = df['document_id'].values[0]
    for idx, rec in df.iterrows():
        if not pd.isna(rec['noentity']):
            continue

        user = rec['usrid']
        if 'twitter_url1' in df.columns:
            e1 = rec['twitter_url1']
            e2 = rec['twitter_url2']
            e3 = rec['twitter_url3']
        elif 'youtube_url1' in df.columns:
            e1 = rec['youtube_url1']
            e2 = rec['youtube_url2']
            e3 = rec['youtube_url3']
        elif 'imdb_url1' in df.columns:
            e1 = rec['imdb_url1']
            e2 = rec['imdb_url2']
            e3 = rec['imdb_url3']
            
        if not pd.isna(e1):
            if 'num_ratings1' in df.columns:
                rvw_e1 = convert_to_numeric(rec['num_ratings1'])
            elif 'num_subscribers1' in df.columns:
                rvw_e1 = convert_to_numeric(rec['num_subscribers1'])
            elif 'num_followers1' in df.columns:
                rvw_e1 = convert_to_numeric(rec['num_followers1'])
            
            if e1 not in entity_map:
                entity_map[e1] = {
                    'rvw': rvw_e1
                }

        if not pd.isna(e2):
            if 'num_ratings2' in df.columns:
                rvw_e2 = convert_to_numeric(rec['num_ratings2'])
            elif 'num_subscribers2' in df.columns:
                rvw_e2 = convert_to_numeric(rec['num_subscribers2'])
            elif 'num_followers2' in df.columns:
                rvw_e2 = convert_to_numeric(rec['num_followers2'])
            
            if e2 not in entity_map:
                entity_map[e2] = {
                    'rvw': rvw_e2
                }
                
        if not pd.isna(e3):
            if 'num_ratings3' in df.columns:
                rvw_e3 = convert_to_numeric(rec['num_ratings3'])
            elif 'num_subscribers3' in df.columns:
                rvw_e3 = convert_to_numeric(rec['num_subscribers3'])
            elif 'num_followers3' in df.columns:
                rvw_e3 = convert_to_numeric(rec['num_followers3'])
            
            if e3 not in entity_map:
                entity_map[e3] = {
                    'rvw': rvw_e3
                }
                
    if not entity_map:
        #print(doc_id)
        prob_g1 = 1/4
        prob_g2 = 1/4
        prob_g3 = 1/4
        prob_g4 = 1/4
    else:
        prob_g1 = 0
        prob_g2 = 0
        prob_g3 = 0
        prob_g4 = 0
        cnt = 0
        for url, info in entity_map.items():
            if info['rvw'] < 100:
                prob_g1 += 1
            elif info['rvw'] < CONST_10K:
                prob_g2 += 1
            elif info['rvw'] < CONST_1M:
                prob_g3 += 1
            else:
                prob_g4 += 1
            cnt += 1
        prob_g1 /= cnt
        prob_g2 /= cnt
        prob_g3 /= cnt
        prob_g4 /= cnt
        
    prob_review_map['topicid'].append(topic_id)
    prob_review_map['document_id'].append(doc_id)
    prob_review_map['prob_g1'].append(prob_g1)
    prob_review_map['prob_g2'].append(prob_g2)
    prob_review_map['prob_g3'].append(prob_g3)
    prob_review_map['prob_g4'].append(prob_g4)
    return df

In [22]:
prob_review_map = {
    'topicid':[],
    'document_id':[],
    'prob_g1':[],
    'prob_g2':[],
    'prob_g3':[],
    'prob_g4':[],
}

df_m.groupby(['topicid','document_id'],as_index=False).apply(get_prob_rvw)
res= pd.DataFrame(prob_review_map) 
res.to_csv('fairweb1.M-REVIEWS.gmemb', sep=' ', index=False,)

'''
prob_review_map = {
    'topicid':[],
    'document_id':[],
    'prob_g1':[],
    'prob_g2':[],
    'prob_g3':[],
    'prob_g4':[],
}

df_t.groupby(['topicid','document_id']).apply(get_prob_rvw)
res= pd.DataFrame(prob_review_map) 
res.to_csv('fairweb1.T-FOLLOWERS.gmemb', sep=' ', index=False,)
'''

prob_review_map = {
    'topicid':[],
    'document_id':[],
    'prob_g1':[],
    'prob_g2':[],
    'prob_g3':[],
    'prob_g4':[],
}

df_y.groupby(['topicid','document_id'],as_index=False).apply(get_prob_rvw)
res= pd.DataFrame(prob_review_map) 
res.to_csv('fairweb1.Y-SUBSCS.gmemb', sep=' ', index=False,)

In [23]:
df_c2r = pd.read_csv('country2region.csv', header = None)
df_c2r.head()

# Country to region
c2r_map = {
    
}

for idx, rec in df_c2r.iterrows():
    c2r_map[rec[0]] = []
    if not pd.isna(rec[1]):
        c2r_map[rec[0]].append(rec[1])
    if not pd.isna(rec[2]):
        c2r_map[rec[0]].append(rec[2])
        
print(c2r_map)

{'French Southern Territories': ['Antarctica'], 'Wallis and Futuna Islands': ['Oceania'], 'Vanuatu': ['Oceania'], 'Tuvalu': ['Oceania'], 'Tonga': ['Oceania'], 'Tokelau': ['Oceania'], 'Solomon Islands': ['Oceania'], 'Samoa': ['Oceania'], 'Pitcairn Island': ['Oceania'], 'Palau': ['Oceania'], 'Northern Mariana Islands': ['Oceania'], 'Niue': ['Oceania'], 'New Zealand': ['Oceania'], 'New Caledonia': ['Oceania'], 'Nauru': ['Oceania'], 'Micronesia, Federal States of': ['Oceania'], 'Marshall Islands': ['Oceania'], 'Kiribati': ['Oceania'], 'Guam': ['Oceania'], 'French Polynesia': ['Oceania'], 'Fiji': ['Oceania'], 'Cook Islands': ['Oceania'], 'Australia': ['Oceania'], 'American Samoa': ['Oceania'], 'Yemen': ['Middle East'], 'United Arab Emirates': ['Middle East'], 'Saudi Arabia': ['Middle East'], 'Qatar': ['Middle East'], 'Oman': ['Middle East'], 'Bahrain': ['Middle East'], 'Vatican City State (Holy See)': ['Europe'], 'United Kingdom': ['Europe'], 'Ukraine': ['Europe'], 'Switzerland': ['Europe']

In [24]:
entry_term_country = {
    'Taiwan': 'Taiwan (Republic of China)',
    'Russia': 'Russian Federation',
    'German': 'Germany',
    'West Germany': 'Germany',
    'nited States': 'United States',
    'South Korea': 'Korea, Republic of (South Korea)'
    
}

In [32]:
def get_region(cl):
    res = []
    for country in cl:
        country = country.strip()
        if country not in c2r_map:
            # Special cases that we can manualy handle
            if country in entry_term_country:
                country = entry_term_country[country]
            else:
                #if country == 'none':
                #    print('aaa')
                country = '_unknown'
        try:
            if country != '_unknown':
                rl = c2r_map[country]
                for region in rl:
                    res.append(region)
        except:
            print(country)
    return res

In [33]:
prob_region_map = {
    'topicid':[],
    'document_id':[],
    'prob_g1':[],
    'prob_g2':[],
    'prob_g3':[],
    'prob_g4':[],
    'prob_g5':[],
    'prob_g6':[],
    'prob_g7':[],
    'prob_g8':[],
}

def get_prob_region(df):
    entity_map = {}
    topic_id = df['topicid'].values[0]
    doc_id = df['document_id'].values[0]
    for idx, rec in df.iterrows():
        if not pd.isna(rec['noentity']):
            continue

        user = rec['usrid']

        e1 = rec['imdb_url1']
        e2 = rec['imdb_url2']
        e3 = rec['imdb_url3']
            
        if not pd.isna(e1):
            country_list = rec['country1'].split(',')
            region_list = get_region(country_list)
            if e1 not in entity_map:
                entity_map[e1] = {
                    'region': set(region_list)
                }
            
        if not pd.isna(e2):
            country_list = rec['country2'].split(',')
            region_list = get_region(country_list)
            if e2 not in entity_map:
                entity_map[e2] = {
                    'region': set(region_list)
                }
        if not pd.isna(e3):
            country_list = rec['country3'].split(',')
            region_list = get_region(country_list)
            if e3 not in entity_map:
                entity_map[e3] = {
                    'region': set(region_list)
                }
                
    if not entity_map:
        #print(doc_id)
        prob_g1 = 1/8
        prob_g2 = 1/8
        prob_g3 = 1/8
        prob_g4 = 1/8
        prob_g5 = 1/8
        prob_g6 = 1/8
        prob_g7 = 1/8
        prob_g8 = 1/8
    else:
        prob_g1 = 0
        prob_g2 = 0
        prob_g3 = 0
        prob_g4 = 0
        prob_g5 = 0
        prob_g6 = 0
        prob_g7 = 0
        prob_g8 = 0
        cnt = 0
        flag_all_unknown = True
        for url, info in entity_map.items():
            for region in info['region']:
                if region == 'Africa':
                    prob_g1 += 1/len(info['region'])
                    flag_all_unknown = False
                elif region == 'America':
                    prob_g2 += 1/len(info['region'])
                    flag_all_unknown = False
                elif region == 'Antarctica':
                    prob_g3 += 1/len(info['region'])
                    flag_all_unknown = False
                elif region == 'Asia':
                    prob_g4 += 1/len(info['region'])
                    flag_all_unknown = False
                elif region == 'Caribbean':
                    prob_g5 += 1/len(info['region'])
                    flag_all_unknown = False
                elif region == 'Europe':
                    prob_g6 += 1/len(info['region'])
                    flag_all_unknown = False
                elif region == 'Middle East':
                    prob_g7 += 1/len(info['region'])
                    flag_all_unknown = False
                elif region == 'Oceania':
                    prob_g8 += 1/len(info['region'])
                    flag_all_unknown = False
            if len(info['region']):
                cnt += 1
        if flag_all_unknown:
            print("ALL UNKNOWN")
            prob_g1 = 1/8
            prob_g2 = 1/8
            prob_g3 = 1/8
            prob_g4 = 1/8
            prob_g5 = 1/8
            prob_g6 = 1/8
            prob_g7 = 1/8
            prob_g8 = 1/8
        else:
            prob_g1 /= cnt
            prob_g2 /= cnt
            prob_g3 /= cnt
            prob_g4 /= cnt
            prob_g5 /= cnt
            prob_g6 /= cnt
            prob_g7 /= cnt
            prob_g8 /= cnt
        
    prob_region_map['topicid'].append(topic_id)
    prob_region_map['document_id'].append(doc_id)
    prob_region_map['prob_g1'].append(prob_g1)
    prob_region_map['prob_g2'].append(prob_g2)
    prob_region_map['prob_g3'].append(prob_g3)
    prob_region_map['prob_g4'].append(prob_g4)
    prob_region_map['prob_g5'].append(prob_g5)
    prob_region_map['prob_g6'].append(prob_g6)
    prob_region_map['prob_g7'].append(prob_g7)
    prob_region_map['prob_g8'].append(prob_g8)
    return df

In [34]:
df_m.groupby(['topicid','document_id']).apply(get_prob_region)
res= pd.DataFrame(prob_region_map) 
res.to_csv('fairweb1.M-ORIGIN.gmemb', sep=' ', index=False,)

In [35]:
for index, row in res.iterrows():
    row_sum = row.iloc[2:10].sum()
    if not np.isclose(row_sum, 1.0):
        print("docid: %s sum: %.3f"%(row[1], row_sum))