Load blog corpus

In [12]:
import glob
from bs4 import BeautifulSoup
# Collect the path of every XML file in the blog corpus directory.
blog_pattern = "../../../blogs/*.xml"
blogs = glob.glob(blog_pattern)
# Show how many blog files were found.
len(blogs)

19320

1. Extract all authors with >= 300 sentences (3347 males, 4282 females)
2. Randomly pair authors to produce 1282 male/male pairs, 1283 male/female pairs, and 1499 female/female pairs
3. Analyze each pair using “triangle test”
4. The first 100 sentences of each author are training data; the last 100 sentences of each are test data (tests 1 and 2)

(random chance performance is 50% by design) 

In [37]:
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter

def sent_to_freq(sentences):
    """Count how often each token occurs across a collection of sentences.

    Every sentence is split with NLTK's ``word_tokenize`` and all resulting
    tokens are tallied into a single counter.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    tally = Counter()
    for sentence in sentences:
        # Counter.update adds one count per token yielded by the tokenizer.
        tally.update(word_tokenize(sentence))
    return tally
    
def process_blog(sentence_list):
    """Split one blog's sentences into training and test token frequencies.

    The first 100 sentences form the training sample and the last 100
    sentences form the test sample.

    Args:
        sentence_list: list of sentence strings for a single blog, in
            document order.

    Returns:
        (train, test) tuple holding the ``sent_to_freq`` result for each
        sample.
    """
    train = sent_to_freq(sentence_list[:100])
    # Take exactly the final 100 sentences. The earlier slice [-101:-1]
    # skipped the very last sentence and, for a blog with exactly 200
    # sentences, re-used the last training sentence as test data.
    test = sent_to_freq(sentence_list[-100:])
    return (train, test)

In [190]:
import os

# One row per blog: the first four dot-separated fields of the file name
# followed by the sentence count.
metadata = []
# Maps the first file-name field (used as the blog id) to its sentence list.
sentence_dict = {}

#loop all
counter = 1
for blog in blogs:
    # Progress indicator.
    if counter % 5000 == 0:
        print(counter)
    # Parse the metadata from the file name itself rather than from the whole
    # path: the previous replace("/blogs/", "") + meta[6:10] only worked
    # because of the dots contained in the "../../../" prefix.
    blogname = os.path.basename(blog)
    meta = blogname.split(".")

    # The corpus files are read as latin-1 and stripped of markup.
    with open(blog, encoding="latin-1") as t:
        xml = t.read()
        text = BeautifulSoup(xml, "lxml").text
    #sentence tokenize
    sent_toke_list = sent_tokenize(text)
    # These four fields become the id / gender / age / category columns of
    # the metadata frame built later.
    row = meta[0:4]
    row.append(len(sent_toke_list))
    sentence_dict[row[0]] = sent_toke_list
    metadata.append(row)
    counter +=1

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000


In [191]:
import pickle
# Cache the tokenized sentences and the metadata rows on disk so the slow
# parsing loop does not have to be repeated.
for payload, pickle_path in (
    (sentence_dict, 'pickled_data/blog_sentences.pickle'),
    (metadata, 'pickled_data/blog_metadata.pickle'),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [193]:
def _load_pickle(pickle_path):
    """Return the object stored in the pickle file at ``pickle_path``."""
    with open(pickle_path, 'rb') as in_file:
        return pickle.load(in_file)

# Restore the cached sentences and metadata written by this notebook.
# NOTE: pickle files must only be loaded from a trusted source.
sentence_dict = _load_pickle('pickled_data/blog_sentences.pickle')
metadata = _load_pickle('pickled_data/blog_metadata.pickle')

In [257]:
import pandas as pd
# Turn the metadata rows into a table with named columns.
df = pd.DataFrame.from_records(
    metadata,
    columns=["id", "gender", "age", "category", "sentence_count"],
)
# Keep only blogs with at least 200 sentences (100 for training, 100 for test).
has_enough_sentences = df['sentence_count'] >= 200
long_enough = df[has_enough_sentences].reset_index(drop=True)
# Number of sufficiently long blogs written by male authors.
is_male = long_enough['gender'] == 'male'
len(long_enough[is_male])

3569

In [221]:
from random import shuffle
import itertools

#generate random pairings
def _pair_up(ids):
    """Pair consecutive ids: (ids[0], ids[1]), (ids[2], ids[3]), ...

    A trailing id without a partner is dropped.
    """
    return list(zip(ids[0::2], ids[1::2]))


def random_pairs(long_enough, seed=None):
    """Randomly pair authors into same-gender and mixed-gender pairs.

    The smaller of the two gender groups determines the split: one third of
    its size (rounded down) is taken from *each* gender to build the
    male/female pairs. The remaining ids of each gender are paired among
    themselves; if such a remainder has odd length its last id stays unpaired.

    Args:
        long_enough: DataFrame with an ``id`` column and a ``gender`` column
            whose values 'male' and 'female' select the two groups.
        seed: optional seed for reproducible pairings. With the default
            ``None`` the module-level ``shuffle`` is used, exactly as before;
            any other value shuffles with a private ``random.Random(seed)``.

    Returns:
        Tuple ``(ff_pairings, mm_pairings, mf_pairings)``; each element is a
        list of 2-tuples of ids. In ``mf_pairings`` the male id comes first.
    """
    from random import Random  # local import: only needed when a seed is given

    male_ids = list(long_enough.loc[long_enough['gender'] == 'male', 'id'])
    female_ids = list(long_enough.loc[long_enough['gender'] == 'female', 'id'])

    # A single generator is shared by both shuffles when a seed is supplied.
    shuffler = shuffle if seed is None else Random(seed).shuffle
    shuffler(male_ids)
    shuffler(female_ids)

    # Number of mixed pairs: a third of the smaller group. This replaces two
    # duplicated branches that differed only in which length they used.
    mixed = min(len(male_ids), len(female_ids)) // 3

    mf_pairings = list(zip(male_ids[:mixed], female_ids[:mixed]))
    # Everything not used for a mixed pair is paired within its own gender.
    mm_pairings = _pair_up(male_ids[mixed:])
    ff_pairings = _pair_up(female_ids[mixed:])
    return ff_pairings, mm_pairings, mf_pairings

In [204]:
from sklearn.feature_extraction import DictVectorizer
from application.selective_features import dictionaries_without_features, dictionaries_of_features
from sklearn.linear_model import LogisticRegression

# Load the pickled `fullstops` object that is passed to
# dictionaries_of_features below (presumably the stop-word feature list;
# confirm against the file that produced it).
with open('pickled_data/fullstops.pickle', 'rb') as fullstops_file:
    fullstops = pickle.load(fullstops_file)

def predict_pairs(pairings):
    """Train and evaluate a two-author classifier for every pair of blogs.

    For each ``(p, s)`` pair a logistic regression is fitted on one training
    sample per author (label 0 for ``p``, label 1 for ``s``) and then asked
    to label each author's test sample.

    Args:
        pairings: iterable of ``(id, id)`` tuples; every id must be a key of
            the module-level ``sentence_dict``.

    Returns:
        List with two rows per pair, one per test sample:
        ``[p, s, "a" or "b", true_label, predicted_label, prob_0, prob_1]``.
    """
    results = []
    # enumerate drives the progress counter; previously the counter was never
    # incremented, so the progress line below could never be printed.
    for count, (p, s) in enumerate(pairings, start=1):
        if count % 250 == 0:
            print(count)
        # One (train, test) tuple of token counters per author.
        counters_all = [process_blog(sentence_dict[p]), process_blog(sentence_dict[s])]
        # [0] positions are train, [1] are tests.
        # NOTE(review): dictionaries_of_features is assumed to return one
        # feature dict per input counter, restricted to `fullstops` - confirm.
        stop_features_train_test_a = dictionaries_of_features(counters_all[0], fullstops)
        stop_features_train_test_b = dictionaries_of_features(counters_all[1], fullstops)
        # Order: train a, train b, test a, test b.
        all_samples = [stop_features_train_test_a[0], stop_features_train_test_b[0],
                       stop_features_train_test_a[1], stop_features_train_test_b[1]]

        # Vectorize all four samples together so they share one feature space.
        v = DictVectorizer()
        X = v.fit_transform(all_samples)
        # Convert to a dense array.
        scaled_vsm = X.toarray()
        # Train on the two training samples only.
        lr = LogisticRegression()
        lr.fit(scaled_vsm[0:2], [0,1])
        # [0, 1] is always the correct prediction for the two test samples.
        preds = lr.predict(scaled_vsm[2:4])
        probs = lr.predict_proba(scaled_vsm[2:4])

        a = [p, s, "a", 0]+[preds[0]] + [probs[0][0], probs[0][1]]
        b = [p, s, "b", 1]+[preds[1]] + [probs[1][0], probs[1][1]]
        # Record one result row per test sample.
        results.append(a)
        results.append(b)
    return results

In [233]:
#ff_pairings, mm_pairings, mf_pairings = random_pairs(long_enough)
#mf_results = predict_pairs(mf_pairings)
#mm_results = predict_pairs(mm_pairings)
#ff_results = predict_pairs(ff_pairings)

In [245]:
# Pooled result rows over all repetitions, one list per pair type.
ff_results_all, mf_results_all, mm_results_all = [], [], []

# Repeat the experiment 21 times, drawing fresh random pairings each round.
for round_number in range(1, 22):
    print(round_number)
    ff_pairings, mm_pairings, mf_pairings = random_pairs(long_enough)
    # Evaluate in the order male/female, male/male, female/female.
    for pairings, pooled in (
        (mf_pairings, mf_results_all),
        (mm_pairings, mm_results_all),
        (ff_pairings, ff_results_all),
    ):
        pooled.extend(predict_pairs(pairings))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [246]:
# Column names matching the rows produced by predict_pairs.
cols = ["id_a", "id_b", "test", "true", "prediction", "prob_0", "prob_1"]


def _as_frame(rows):
    """Build a results DataFrame from a list of result rows."""
    return pd.DataFrame.from_records(rows, columns=cols)


df_results_mf = _as_frame(mf_results_all)
df_results_ff = _as_frame(ff_results_all)
df_results_mm = _as_frame(mm_results_all)

# Save each result table next to the notebook.
for frame, csv_path in (
    (df_results_mf, 'mf.csv'),
    (df_results_mm, 'mm.csv'),
    (df_results_ff, 'ff.csv'),
):
    frame.to_csv(csv_path)

In [247]:
def _count_right(frame):
    """Number of rows whose predicted label equals the true label."""
    correct = frame['true'] == frame['prediction']
    return len(frame.loc[correct].index)


# Correct / incorrect prediction counts per pair type.
df_mf_right = _count_right(df_results_mf)
df_mf_wrong = len(df_results_mf.index) - df_mf_right
df_ff_right = _count_right(df_results_ff)
df_ff_wrong = len(df_results_ff.index) - df_ff_right
df_mm_right = _count_right(df_results_mm)
df_mm_wrong = len(df_results_mm.index) - df_mm_right

In [248]:
# Accuracy on male/female pairs: share of correct predictions.
df_mf_right / len(df_results_mf)

0.8535984620929953

In [249]:
# Accuracy on male/male pairs: share of correct predictions.
df_mm_right / len(df_results_mm)

0.8464585834333733

In [250]:
# Accuracy on female/female pairs: share of correct predictions.
df_ff_right / len(df_results_ff)

0.8435117249316645

In [251]:
# Total number of test predictions per pair type, in the order mm, ff, mf.
for right, wrong in (
    (df_mm_right, df_mm_wrong),
    (df_ff_right, df_ff_wrong),
    (df_mf_right, df_mf_wrong),
):
    print(right + wrong)

49980
55608
49938


In [254]:
# (wrong, right) prediction counts for female/female pairs.
(df_ff_wrong, df_ff_right)

(8702, 46906)

In [261]:
import scipy.stats as stats
# Fisher's exact test on the 2x2 table of [wrong, right] prediction counts:
# male/male pairs in the first row, female/female pairs in the second.
# The counts come from the variables computed above instead of literals
# pasted from an earlier run, so the test stays consistent when the random
# pairings are regenerated.
contingency = [[df_mm_wrong, df_mm_right],
               [df_ff_wrong, df_ff_right]]
oddsratio, pvalue = stats.fisher_exact(contingency)
pvalue

0.18698087695487198

In [None]:
# Appears to be the male/female [wrong, right] counts: the two values sum to
# the 49938 mf results printed earlier. Not yet used in any test.
[7311, 42627]