In [1]:
import pandas as pd
# Load the article metadata, then keep only the rows whose inferred gender is
# 'female' or 'male'; the index is renumbered from 0 after filtering.
df = pd.read_csv('metadata.csv')
df_binary = df[df['inferred_gender'].isin(['female', 'male'])].reset_index(drop=True)

In [2]:
# Preview the first three filtered rows.
df_binary.head(3)

Unnamed: 0,id,url,word_count,snippet,source,byline,byline_parsed,pub_date,inferred_gender
0,58694c8895d0e039260788f3,https://www.nytimes.com/2017/01/01/opinion/ist...,859,The New Year attack reinforces a sense that th...,The New York Times,By KAYA GENC,kaya genc,2017-01-01T18:37:51+0000,female
1,586a0d8795d0e039260789b4,https://www.nytimes.com/2017/01/02/opinion/mar...,963,The lesson of Iraq and Syria for America is th...,The New York Times,By CHRIS MURPHY,chris murphy,2017-01-02T08:21:22+0000,male
2,586a0d8795d0e039260789b5,https://www.nytimes.com/2017/01/02/opinion/lea...,715,"I banned screens, and it improved students’ en...",The New York Times,By DARREN ROSENBLUM,darren rosenblum,2017-01-02T08:21:21+0000,male


In [5]:
import spacy
import string

# Load the English pipeline by its full package name. The 'en' shortcut link
# no longer exists in spaCy 3 (spacy.load('en') raises there), whereas the
# package name resolves on both 2.x and 3.x once the model is installed.
# NOTE(review): assumes the old 'en' link pointed at en_core_web_sm (the
# default target of `spacy download en`) -- confirm for this environment.
nlp = spacy.load('en_core_web_sm')

def text_cleanup(mystring):
    """Tokenize ``mystring`` and return its cleaned, lower-cased tokens.

    The text is lower-cased and passed through the module-level ``nlp``
    pipeline. Tokens whose coarse part-of-speech tag is SPACE, PUNCT, SYM,
    X or NUM are discarded. Every character in ``string.punctuation`` is then
    removed from the surviving tokens, and tokens left empty are dropped.

    Returns:
        list of str, in document order.
    """
    skipped_pos = ('SPACE', 'PUNCT', 'SYM', 'X', 'NUM')
    # Translation table that deletes all ASCII punctuation in a single pass.
    strip_punct = str.maketrans('', '', string.punctuation)

    cleaned = []
    for token in nlp(mystring.lower()):
        if token.pos_ in skipped_pos:
            continue
        word = token.text.translate(strip_punct)
        if word:
            cleaned.append(word)
    return cleaned

In [19]:
from collections import Counter 
# Build one term-frequency Counter per article in df_binary, plus a parallel
# list of gender labels. Counts are cached per article under tf_tables/<id>.csv
# so that re-running the cell does not have to re-tokenize every text.
txt_counters = []
labels = []
for article_id, gender in zip(df_binary['id'], df_binary['inferred_gender']):
    cache_path = "tf_tables/" + article_id + ".csv"
    try:
        # Fast path: reuse the cached table. NA detection is switched off and
        # 'term' is forced to str so that tokens such as "null", "na" or "nan"
        # are not read back as NaN (which would merge them into one key and
        # overwrite each other's counts).
        tf_table = pd.read_csv(
            cache_path,
            usecols=['term', 'count'],
            dtype={'term': str, 'count': int},
            na_filter=False,
        )
        tf = Counter(dict(zip(tf_table['term'], tf_table['count'].tolist())))
    except (FileNotFoundError, pd.errors.EmptyDataError,
            pd.errors.ParserError, KeyError, ValueError):
        # Cache missing or unusable: tokenize the raw text and write the cache.
        # Only cache-related failures are caught, so a KeyboardInterrupt or a
        # genuine bug is no longer silently swallowed.
        # NOTE(review): the text file is opened with the platform default
        # encoding -- confirm the files under txt/ match it.
        with open("txt/" + article_id + ".txt") as f:
            txt = f.read()
        tf = Counter(text_cleanup(txt))
        # A separate name is used for the table so the metadata frame `df`
        # loaded earlier is not overwritten.
        tf_table = pd.DataFrame(list(tf.items()), columns=["term", "count"])
        tf_table.to_csv(cache_path)
    # Append both together so the two lists always stay the same length.
    txt_counters.append(tf)
    labels.append(gender)
    

In [21]:
# Number of per-article term-frequency counters that were built.
len(txt_counters)

5466

In [22]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Vectorize: turn the list of term -> count mappings into a feature matrix
# with one column per distinct term seen in txt_counters.
# NOTE(review): sparse=False materializes a dense array; with a large
# vocabulary this may be memory-heavy -- consider sparse output if it becomes
# a problem.
v = DictVectorizer(sparse=False)
X = v.fit_transform(txt_counters)

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 45% of the articles for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.45,
    random_state=42,
)

In [26]:
# Fit a logistic-regression classifier on the training split.
clf = (
    LogisticRegression(solver='liblinear', random_state=0)
    .fit(X_train, y_train)
)

In [28]:
from sklearn.metrics import accuracy_score
# Predict labels for the held-out split and report the fraction that match
# the true labels.
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7800813008130081

In [48]:
# Bias term of the fitted model.
# NOTE(review): the sign is relative to clf.classes_[1]; presumably 'male'
# given the two string labels -- verify with clf.classes_.
clf.intercept_

array([2.95977205])

In [47]:
# Rank every vocabulary term by its logistic-regression weight, largest first.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back to the old
# method only on installs that predate it.
if hasattr(v, "get_feature_names_out"):
    feature_names = list(v.get_feature_names_out())
else:
    feature_names = v.get_feature_names()
pairs = dict(zip(feature_names, list(clf.coef_[0])))
df_coef = (
    pd.DataFrame(list(pairs.items()), columns=['feature', 'coef'])
    .sort_values(by='coef', ascending=False)
    .reset_index(drop=True)
)
df_coef

Unnamed: 0,feature,coef
0,newsletter,0.584686
1,far,0.425060
2,sign,0.424061
3,supposed,0.391137
4,age,0.386638
5,issues,0.376016
6,campus,0.349466
7,began,0.343548
8,lgbt,0.322852
9,here,0.314217
