In [1]:
import pandas as pd
import numpy as np
import glob

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.metrics import *
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import utils

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from itertools import islice
from tqdm import tqdm
import multiprocessing
import collections
import random
import string

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 50)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.ConvergenceWarning)

### First look at the data

In [2]:
path = './Articles'
files = glob.glob(path+'/*.csv')

li = []

for file in files:
    df = pd.read_csv(file, index_col=None, header=0)
    li.append(df)

data = pd.concat(li, axis=0, ignore_index=True)
data = data.drop(data.columns[0], axis=1)

In [3]:
data.sample(10)

Unnamed: 0,id,title,publication,author,date,year,month,url,content
141091,149947,Russian deception influenced election due to T...,Guardian,Spencer Ackerman,2017-03-30,2017.0,3.0,https://www.theguardian.com/us-news/2017/mar/3...,Donald Trump’s willingness to embrace Russian ...
139465,147817,Depression is leading cause of disability worl...,Guardian,,2017-03-31,2017.0,3.0,https://www.theguardian.com/society/2017/mar/3...,Cases of depression have ballooned almost 20% ...
140207,148778,Orange Order asks DUP to put Drumcree march on...,Guardian,Owen Bowcott,2017-06-12,2017.0,6.0,https://www.theguardian.com/uk-news/2017/jun/1...,It is 20 years since the protestant Orange Ord...
138997,147228,Man who jumped White House fence arrested by U...,Guardian,Alan Yuhas,2017-03-11,2017.0,3.0,https://www.theguardian.com/us-news/2017/mar/1...,The US secret service arrested a person who cl...
79486,200432,Why Democrats increasingly think Donald Trump ...,Vox,Andrew Prokop,2016/5/16,2016.0,5.0,http://www.vox.com/2016/5/16/11666316/senate-2...,Now that Donald Trump will be the GOP preside...
90372,215240,The bizarre world of bitcoin ‘mining’ finds a ...,Washington Post,Simon Denyer,2016-09-12,2016.0,9.0,https://web.archive.org/web/20160915002508/htt...,Inside a metal shed in the Tibetan highl...
111854,106969,Mississippi Legislature Sends Sweeping Anti-L...,Buzzfeed News,Dominic Holden,2016-04-04,2016.0,4.0,https://web.archive.org/web/20160404235336/htt...,’The bill as it passed the House. ’] The Mis...
132058,138071,Will the lords of tennis turn a blind eye to f...,New York Post,Post Editorial Board,2016-01-25,2016.0,1.0,http://nypost.com/2016/01/25/will-the-lords-of...,Add tennis to the list of global sports now fa...
95480,77340,The Future Is Almost Now,Atlantic,Elizabeth Alsop,2016-05-15,2016.0,5.0,,", I want to receive updates from partners and ..."
72193,187764,"Trump poses big threat to emerging markets, U....",Reuters,David Randall,2016-03-31,2016.0,3.0,http://www.reuters.com/article/us-emergingmark...,A Donald Trump victory in the November U. S. ...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142570 entries, 0 to 142569
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           142570 non-null  int64  
 1   title        142568 non-null  object 
 2   publication  142570 non-null  object 
 3   author       126694 non-null  object 
 4   date         139929 non-null  object 
 5   year         139929 non-null  float64
 6   month        139929 non-null  float64
 7   url          85559 non-null   object 
 8   content      142570 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 9.8+ MB


In [5]:
data['title'] = data['title'].fillna('')

### Find articles about Hillary Clinton published by Washington Post and New York Times

In [6]:
clinton_data = data[(data['year'] == 2016) &
                    (data['title'].str.contains('Hillary Clinton'))]
clinton_data = clinton_data[(clinton_data['publication'] == 'Washington Post') | 
                            (clinton_data['publication'] == 'New York Times')]
clinton_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329 entries, 2614 to 92416
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           329 non-null    int64  
 1   title        329 non-null    object 
 2   publication  329 non-null    object 
 3   author       327 non-null    object 
 4   date         329 non-null    object 
 5   year         329 non-null    float64
 6   month        329 non-null    float64
 7   url          175 non-null    object 
 8   content      329 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 25.7+ KB


In [7]:
pd.set_option('display.max_colwidth', 500)
clinton_data[['title', 'publication', 'author']].sample(5).reset_index()

Unnamed: 0,index,title,publication,author
0,91129,"In the debates, Hillary Clinton showed exactly why she should be president",Washington Post,Editorial Board
1,2927,"Donald Trump, Courting Evangelicals, Faults Hillary Clinton’s Policies and Character - The New York Times",New York Times,Ashley Parker
2,86055,The gaping hole at the heart of Hillary Clinton’s campaign,Washington Post,Paul Waldman
3,90940,The facts about Hillary Clinton and the Kathy Shelton rape case,Washington Post,Glenn Kessler
4,88708,The 2016 election is already decided. History says Hillary Clinton wins.,Washington Post,Doug Sosnik


In [8]:
pd.set_option('display.max_colwidth', 50)

### Manually label authors' gender (B indicate the authors of an article have different genders)

In [9]:
clinton_data_gender = pd.read_csv('clinton_data_gender.csv', index_col=None, header=0)
clinton_data_gender.sample(5)

Unnamed: 0,id,title,publication,author,gender,date,year,month,url,content
21,20518,"Obama Endorses Hillary Clinton, and Urges Demo...",New York Times,Julie Hirschfeld Davis and Jonathan Martin,B,2016-06-10,2016.0,6.0,,WASHINGTON — President Obama told Senator B...
121,24978,Donald Trump Slips Further Behind Hillary Clin...,New York Times,Jeremy W. Peters,M,2016-06-27,2016.0,6.0,,Weeks of provocative and outlandish behavior h...
9,20418,Gov. Jerry Brown Endorses Hillary Clinton Ahea...,New York Times,Adam Nagourney,M,2016-06-01,2016.0,6.0,,LOS ANGELES — Gov. Jerry Brown of Californi...
318,216724,Hillary Clinton has to be gracious to Donald T...,Washington Post,Paul Waldman,M,2016-11-08,2016.0,11.0,https://web.archive.org/web/20161109000218/htt...,"If Hillary Clinton wins, at some point to..."
41,21103,Why Hillary Clinton’s Polling Bounce Has a Bet...,New York Times,Nate Cohn,M,2016-08-02,2016.0,8.0,,What a month. At the end of a series of tumult...


In [10]:
clinton_data_gender.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329 entries, 0 to 328
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           329 non-null    int64  
 1   title        329 non-null    object 
 2   publication  329 non-null    object 
 3   author       328 non-null    object 
 4   gender       329 non-null    object 
 5   date         329 non-null    object 
 6   year         329 non-null    float64
 7   month        329 non-null    float64
 8   url          175 non-null    object 
 9   content      329 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 25.8+ KB


In [11]:
clinton_data_gender['gender'].value_counts()

M    185
F    101
B     43
Name: gender, dtype: int64

In [12]:
clinton_data_binary = clinton_data_gender[clinton_data_gender['gender']!='B']

### Preprocess the text

In [13]:
# some preprocessing is done after pos tagging to improve the tag quality
def clean_text(unprocessed_string):
    cleaned_text = ""
    unprocessed_string = unprocessed_string.lower()
    unprocessed_string = unprocessed_string.replace("'", "").replace(".", "").replace("_", "")

    text_tokens = word_tokenize(unprocessed_string)
    for word in text_tokens:
        if word not in string.punctuation:
            #if word not in stopword_list:
                if len(word) > 1:
                    cleaned_text = cleaned_text + " " + word
    cleaned_text = ("").join(cleaned_text)
    return cleaned_text

stopword_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", 
                 "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 
                 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', 
                 "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 
                 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 
                 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
                 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 
                 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 
                 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 
                 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 
                 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 
                 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'own', 
                 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'should', 
                 "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ma']

In [14]:
X = clinton_data_binary['content'].reset_index(drop=True).copy()
y = clinton_data_binary['gender'].reset_index(drop=True).copy()

for i in range(len(X)):
    X[i] = clean_text(X[i])

### Compare models

In [15]:
def print_scores(scores):
    k = len(scores['test_precision_macro'])
    print('precision_macro:    ' + str(sum(scores['test_precision_macro']) / k))
    print('recall_macro:       ' + str(sum(scores['test_recall_macro']) / k))
    print('f1_macro:           ' + str(sum(scores['test_f1_macro']) / k))
    print('precision_weighted: ' + str(sum(scores['test_precision_weighted']) / k))
    print('recall_weighted:    ' + str(sum(scores['test_recall_weighted']) / k))
    print('f1_weighted:        ' + str(sum(scores['test_f1_weighted']) / k))
    
scoring = ['precision_macro', 'recall_macro', 'f1_macro',
           'precision_weighted', 'recall_weighted', 'f1_weighted']

#### Logistic Regression

In [16]:
vectorizer = TfidfVectorizer(lowercase=True, stop_words=stopword_list)
vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)

lr_model = LogisticRegression()

scores = cross_validate(lr_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

precision_macro:    0.6292401458190932
recall_macro:       0.5242857142857142
f1_macro:           0.44106899511318953
precision_weighted: 0.6386934434474791
recall_weighted:    0.6642468239564427
f1_weighted:        0.5451010430583368


#### Naive Bayes

In [17]:
nb_model = MultinomialNB()

scores = cross_validate(nb_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

precision_macro:    0.32344222625529345
recall_macro:       0.5
f1_macro:           0.39278835386338184
precision_weighted: 0.4184795358235169
recall_weighted:    0.6468844525105869
f1_weighted:        0.50819219729441


We can see logistic regression achieves better results, therefore we will use logistic regression for further exploration

### Logistic regression with TF-IDF + n-grams

In [18]:
vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=1000, stop_words=stopword_list)
vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)

lr_model = LogisticRegression(C=10000)

scores = cross_validate(lr_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

precision_macro:    0.7539876022322831
recall_macro:       0.7260875160875161
f1_macro:           0.7246559920693797
precision_weighted: 0.7687390372995849
recall_weighted:    0.7692075015124017
f1_weighted:        0.7563576483389431


#### Most informative features from logistic regression

In [19]:
lr_model = LogisticRegression(C=10000)
lr_model.fit(X_tfidf, y)

feature_names = vectorizer.get_feature_names() 
coefs_with_fns = sorted(zip(lr_model.coef_[0], feature_names)) 
coef_word=pd.DataFrame(coefs_with_fns)
coef_word.columns='coefficient','word'
most_pos = coef_word.sort_values(by='coefficient', ascending=True).head(20).reset_index(drop=True)
most_neg = coef_word.sort_values(by='coefficient', ascending=False).head(20).reset_index(drop=True)
pd.concat([most_pos, most_neg], axis=1)

Unnamed: 0,coefficient,word,coefficient.1,word.1
0,-18.463986,foundation,18.890507,debate
1,-14.828506,keep,13.444498,gay
2,-14.644311,briefing posted,12.633489,hard
3,-14.410654,posted,11.876146,americans
4,-14.134564,bill clinton,10.606128,mr obama
5,-14.046969,briefing,10.356544,making
6,-13.291154,mr trump,10.341619,democratic
7,-13.060096,mr sanders,9.967781,weapons
8,-12.556891,trust,9.877935,best
9,-12.518605,clinton foundation,9.864462,trade


### Create pos tagged content

In [20]:
# get pos tags first, then remove stop words
# this ensures the tags are correct

X_pos = X.copy()
for i in range(len(X_pos)):
    article = X_pos[i]
    article = pos_tag(word_tokenize(article))
    article = [x for x in article if x[0] not in stopword_list]
    for j in range(len(article)):
        article[j] = article[j][0] + '-' + article[j][1]
    X_pos[i] = TreebankWordDetokenizer().detokenize(article)

In [21]:
print(X_pos[0][:1000])

hillary-JJ clinton-NN advisers-NNS allies-NNS begun-VBN extensive-JJ discussions-NNS running-VBG mate-NN seeking-VBG compile-JJ list-NN 15-CD 20-CD potential-JJ picks-NNS team-NN start-VB vetting-NN late-JJ spring-NN mrs-NN clinton-NN team-NN grapple-VB complicated-JJ questions-NNS like-IN whether-IN united-JJ states-NNS ready-JJ ticket-NN whether-IN choice-NN vice-NN president-NN would-MD able-JJ handle-VB working-VBG white-JJ house-NN former-JJ president-NN bill-NN clinton-NN wielded-VBD significant-JJ influence-NN policy-NN nomination-NN fight-NN still-RB fluid-JJ mrs-NN clinton-NN confident-JJ enough-RB victory-NN described-VBN vision-NN running-VBG mate-NN objectives-NNS search-NN according-VBG campaign-NN advisers-NNS dozen-NN democrats-NNS close-RB campaign-NN clintons-NNS does-VBZ not-RB mind-NN said-VBD intrigued-VBN several-JJ contenders-NNS scenarios-NNS among-IN names-NNS discussion-NN mrs-JJ clinton-NN mr-NN clinton-NN campaign-NN advisers-NNS senators-NNS tim-VBP kaine-NN

### Logistic regression with pos tagging + TF-IDF + n-grams

In [22]:
X_pos_tag = X_pos
y = clinton_data_binary['gender']

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=1000, token_pattern=r"(?u)\b\w[\w-]*\w\b")
vectorizer.fit(X_pos_tag)
X_pos_tag_tfidf = vectorizer.transform(X_pos_tag)

lr_model = LogisticRegression(C=10000, max_iter=1000)

scores = cross_validate(lr_model, X_pos_tag_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

precision_macro:    0.7231982323987955
recall_macro:       0.7030501930501931
f1_macro:           0.7024316659808489
precision_weighted: 0.7416274855722276
recall_weighted:    0.7448880822746522
f1_weighted:        0.7344057253175297


In [23]:
lr_model = LogisticRegression(C=10000, max_iter=1000)
lr_model.fit(X_pos_tag_tfidf, y)

feature_names = vectorizer.get_feature_names()
coefs_with_fns = sorted(zip(lr_model.coef_[0], feature_names)) 
coef_word=pd.DataFrame(coefs_with_fns)
coef_word.columns='coefficient','word'
most_pos = coef_word.sort_values(by='coefficient', ascending=True).head(20).reset_index(drop=True)
most_neg = coef_word.sort_values(by='coefficient', ascending=False).head(20).reset_index(drop=True)
pd.concat([most_pos, most_neg], axis=1)

Unnamed: 0,coefficient,word,coefficient.1,word.1
0,-18.649103,foundation-nn,16.494725,debate-nn
1,-14.358238,keep-vb,11.327887,americans-nns
2,-13.268541,clintons-nns,10.889115,gay-nn
3,-12.692948,voters-nns,10.888439,clinton-jj
4,-12.61941,clinton-nn foundation-nn,10.599619,weapons-nns
5,-12.352482,briefing-nn posted-vbd,10.278732,hard-jj
6,-12.27236,mr-nn sanders-nns,10.044489,worse-jjr
7,-11.995323,posted-vbd,9.928051,best-jjs
8,-11.921507,briefing-nn,9.787263,results-nns
9,-11.887587,million-cd,9.760599,trade-nn


### Doc2vec

In [28]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

n_splits = 5
kf = KFold(n_splits=n_splits)
cv_precision_macro = []
cv_recall_macro = []
cv_f1_macro = []
cv_precision_weighted = []
cv_recall_weighted = []
cv_f1_weighted = []
featureset = clinton_data_binary[['content', 'gender']]

for train_index, test_index in kf.split(featureset):
    train = featureset.iloc[train_index]
    test = featureset.iloc[test_index]
    
    train_tagged = train.apply(lambda r: TaggedDocument(words=word_tokenize(r['content']), tags=[r.gender]), axis=1)
    test_tagged = test.apply(lambda r: TaggedDocument(words=word_tokenize(r['content']), tags=[r.gender]), axis=1)

    cores = multiprocessing.cpu_count()
    model_dbow = Doc2Vec(dm=0, vector_size=100, negative=0, hs=0, min_count=2, sample=0, workers=cores)
    model_dbow.build_vocab([x for x in train_tagged.values])

    for epoch in range(30):
        model_dbow.train(utils.shuffle([x for x in train_tagged.values]), total_examples=len(train_tagged.values), epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    y_train, X_train = vec_for_learning(model_dbow, train_tagged)
    y_test, X_test = vec_for_learning(model_dbow, test_tagged)
    lr_model = LogisticRegression(n_jobs=1, C=1e5, max_iter=1000)
    lr_model.fit(X_train, y_train)
    y_pred = lr_model.predict(X_test)
    
    cv_precision_macro.append(precision_score(y_test, y_pred, average='macro'))
    cv_recall_macro.append(recall_score(y_test, y_pred, average='macro'))
    cv_f1_macro.append(f1_score(y_test, y_pred, average='macro'))
    
    cv_precision_weighted.append(precision_score(y_test, y_pred, average='weighted'))
    cv_recall_weighted.append(recall_score(y_test, y_pred, average='weighted'))
    cv_f1_weighted.append(f1_score(y_test, y_pred, average='weighted'))

avg_precision_macro = sum(cv_precision_macro)/n_splits
avg_recall_macro = sum(cv_recall_macro)/n_splits
avg_f1_macro = sum(cv_f1_macro)/n_splits

avg_precision_weighted = sum(cv_precision_weighted)/n_splits
avg_recall_weighted = sum(cv_recall_weighted)/n_splits
avg_f1_weighted = sum(cv_f1_weighted)/n_splits

print('precision_macro:    ' + str(avg_precision_macro))
print('recall_macro:       ' + str(avg_recall_macro))
print('f1_macro:           ' + str(avg_f1_macro))
print('precision_weighted: ' + str(avg_precision_weighted))
print('recall_weighted:    ' + str(avg_recall_weighted))
print('f1_weighted:        ' + str(avg_f1_weighted))

precision_macro:    0.5342167815582812
recall_macro:       0.5335843785056088
f1_macro:           0.5300755175230507
precision_weighted: 0.5860114054149348
recall_weighted:    0.5699939503932245
f1_weighted:        0.574438322003609
