In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import sparse

In [2]:
# Raw training posts; later cells read the `author`, `body` and `subreddit` columns.
training_set_dataframe = pd.read_csv(
    "data/train_data.csv",
    encoding="utf8",
)
training_set_dataframe.head()

In [4]:
# Number of distinct authors in the training posts, shown as a 1-tuple shape.
training_set_dataframe["author"].unique().shape

(5000,)

In [5]:
# Label table; later cells read its `author` and `gender` columns.
training_targets_dataframe = pd.read_csv(
    "data/train_target.csv",
)
training_targets_dataframe.head()

In [33]:
# Plain Python list of post bodies, one entry per row of the training frame,
# which is the form the text vectorizers are fed below.
training_set = training_set_dataframe["body"].tolist()
training_set[:10]

["I don't think we'd get nearly as much fanfiction and pictures shipping Ban-Ban and Lyro. Just saying.",
 "Thanks. I made it up, that's how I got over my first heart break. ",
 "Are you sure you aren't confusing Cyclops (the easiest boss monster) for Ogres? I'm talking about [these guys](http://i.imgur.com/c3YKPdI.jpg)\n\nMaybe I'm just a bad player... But every time I faced one on my first playthrough, all my pawns ended up getting to 0 HP at least once and I could barely get an attack in once it started berserking.",
 'dont do this to me bro',
 "That's what we do when we can't find a mate",
 'Damn I love this question. Here\'s what I think:\n\n* Church Fathers and earlier heresies (i.e. Gnosticism, Marcionism, Arianism etc...). Also including arguments Church Fathers held for the true Christianity. And it\'d be awesome if there was St. Nicholas laying the smack down on Arius. \n\n* Persecutions of Christians\n\n* Constantine\'s conversion and legalization of Christianity\n\n* Ecumen

In [32]:
# author -> gender lookup built from the targets table.
# (The original referenced `targets`, a name no cell in this notebook defines;
# the table is loaded as `training_targets_dataframe`.)
training_targets_dictionary = dict(
    zip(training_targets_dataframe.author, training_targets_dataframe.gender)
)

# One label per post, in the same row order as `training_set_dataframe`:
# each post inherits the label of its author.
training_targets = [
    training_targets_dictionary[author]
    for author in training_set_dataframe.author
]
training_targets[:10]

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

# A token is a run of ASCII letters starting at a word boundary, so digits and
# punctuation never enter the vocabulary.
token_pattern ='(?u)\\b[A-Za-z]{1,}'

# Unigram and bigram counts, with the vocabulary capped at 100000 features.
count_vectorizer = CountVectorizer(max_features = 100000, token_pattern = token_pattern, ngram_range = (1, 2))
count_vectorized_training_set = count_vectorizer.fit_transform(training_set)

# Re-weight the raw counts with tf-idf.
tfidf = TfidfTransformer()
tfidf_training_set = tfidf.fit_transform(count_vectorized_training_set)

# with_mean=False: scale each feature without centering it, so the sparse
# matrix is never densified.
standard_scaler = preprocessing.StandardScaler(with_mean = False).fit(tfidf_training_set)
tfidf_scaled_training_set = standard_scaler.transform(tfidf_training_set)

print("training_targets is a list of categories: %s ..." % str(training_targets)[:70])
# shape is (n_vectors, n_features); the original message printed shape[1] as
# the number of vectors and left "with dim" dangling.
print("tfidf_scaled_training_set has %d feature vectors with dim %d" % tfidf_scaled_training_set.shape)
print("dataset has %d rows" % (len(training_set)))

y_train is a list of categories: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,  ...
X_train has 100000 feature vectors
y_train has 296042 target classes
dataset has 296042 rows


"\nclf2 = Pipeline([\n     ('vect', CountVectorizer()),\n     ('tfidf', TfidfTransformer()),\n     ('clf2', SGDClassifier(loss='hinge', penalty='elasticnet',\n                           alpha=1e-3, random_state=0,\n                           max_iter=10, tol=None)),\n])\n"

In [16]:
# Fit a multinomial naive Bayes baseline on the scaled tf-idf features.
clf_multinomial_naive_bayes = MultinomialNB().fit(
    tfidf_scaled_training_set,
    training_targets,
)
print("Trained MultinomialNB Classifier")

# NOTE: this is accuracy on the same data the model was fitted on, not a
# held-out estimate.
print(
    clf_multinomial_naive_bayes.score(tfidf_scaled_training_set, training_targets)
)

Trained MultinomialNB Classifier
0.8249809148701873


In [17]:
# Logistic regression on the same scaled tf-idf features; random_state is
# pinned so repeated runs use the same seed.
clf_logistic_regression = LogisticRegression(
    C = 1,
    tol = 0.1,
    max_iter=10000,
    random_state = 0,
).fit(tfidf_scaled_training_set, training_targets)
print("Trained Logistic Classifier")

# NOTE: accuracy on the fitting data itself, not a held-out estimate.
print(
    clf_logistic_regression.score(tfidf_scaled_training_set, training_targets)
)

Trained Logistic Classifier


In [24]:
# Same preprocessing chain as the step-by-step cells above (counts -> tf-idf ->
# scaling -> logistic regression), bundled so raw text goes in directly.
token_pattern ='(?u)\\b[A-Za-z]{1,}'
clf_pipeline = Pipeline([
    (
        'vect',
        CountVectorizer(
            max_features = 100000,
            token_pattern = token_pattern,
            ngram_range = (1, 2),
        ),
    ),
    ('tfidf', TfidfTransformer()),
    # with_mean=False scales without centering, so sparse input stays sparse.
    ('scaler', preprocessing.StandardScaler(with_mean = False)),
    (
        'clf',
        LogisticRegression(C = 1, tol = 0.1, max_iter=10000, random_state = 0),
    ),
]).fit(training_set, training_targets)

# NOTE: accuracy on the fitting data itself, not a held-out estimate.
print(clf_pipeline.score(training_set, training_targets))

0.9779997432796698


In [24]:
# Distinct subreddit names, in order of first appearance in the training posts.
# (`training_set` is a plain list of bodies by this point, so the column has to
# come from `training_set_dataframe`.)
subreddits = training_set_dataframe.subreddit.unique()

# subreddit name -> column index in the per-author feature rows.
# (`arange` was never imported on its own; use the numpy namespace.)
subreddits_map = pd.Series(index=subreddits, data=np.arange(subreddits.shape[0]))
print(subreddits_map)
print(subreddits)

mylittlepony            0
sex                     1
DragonsDogma            2
malefashionadvice       3
todayilearned           4
                     ... 
palegirls            3463
onions               3464
mumfordandsons       3465
infertility          3466
HangoutFest          3467
Length: 3468, dtype: int32
['mylittlepony' 'sex' 'DragonsDogma' ... 'mumfordandsons' 'infertility'
 'HangoutFest']


In [33]:
def extract_features(group):
    """Encode the subreddits found in `group` as a single sparse indicator row.

    Reads the module-level `subreddits_map` (subreddit name -> column index)
    and `subreddits` (array of known names, which fixes the row width).

    Parameters
    ----------
    group : object supporting ``group['subreddit']``
        Rows (e.g. one author's posts) whose subreddits are to be encoded.

    Returns
    -------
    CSR matrix of shape ``(1, subreddits.shape[0])`` holding 1 in the column
    of every known subreddit present in `group`. Names missing from
    ``subreddits_map.index`` are ignored.
    """
    posted_in = group['subreddit']
    is_known = posted_in.isin(subreddits_map.index)
    known_names = posted_in[is_known].values
    columns = subreddits_map.loc[known_names].values

    row = sparse.dok_matrix((1, subreddits.shape[0]))
    for column in columns:
        if np.isnan(column):
            continue
        # Repeated subreddits simply set the same cell to 1 again.
        row[0, column] = 1
    return row.tocsr()

# Smoke-test the encoder on one author's posts.
# (`training_set` is a plain list of bodies and `targets` is not defined in
# this notebook; use the dataframes loaded at the top instead.)
extract_features(training_set_dataframe[training_set_dataframe.author=='RedThunder90'])

# One sparse subreddit-indicator row per training author.
features_dict = {
    author: extract_features(group)
    for author, group in training_set_dataframe.groupby('author')
}

# Stack the rows in the order of the targets table so that row i of X lines up
# with entry i of y.
X = sparse.vstack([features_dict[author] for author in training_targets_dataframe.author])
print(X)

y = training_targets_dataframe.gender
print(y)

  (0, 103)	1.0
  (1, 45)	1.0
  (1, 7)	1.0
  (1, 95)	1.0
  (1, 22)	1.0
  (1, 925)	1.0
  (1, 384)	1.0
  (1, 50)	1.0
  (1, 12)	1.0
  (1, 17)	1.0
  (1, 4)	1.0
  (1, 507)	1.0
  (1, 1040)	1.0
  (1, 2678)	1.0
  (1, 434)	1.0
  (1, 472)	1.0
  (1, 1756)	1.0
  (1, 826)	1.0
  (2, 53)	1.0
  (3, 120)	1.0
  (3, 1646)	1.0
  (3, 7)	1.0
  (3, 95)	1.0
  (3, 17)	1.0
  (3, 1850)	1.0
  :	:
  (4998, 41)	1.0
  (4998, 794)	1.0
  (4998, 19)	1.0
  (4998, 7)	1.0
  (4998, 81)	1.0
  (4998, 14)	1.0
  (4998, 1165)	1.0
  (4998, 115)	1.0
  (4999, 1378)	1.0
  (4999, 95)	1.0
  (4999, 12)	1.0
  (4999, 126)	1.0
  (4999, 1373)	1.0
  (4999, 3)	1.0
  (4999, 8)	1.0
  (4999, 1664)	1.0
  (4999, 768)	1.0
  (4999, 704)	1.0
  (4999, 547)	1.0
  (4999, 206)	1.0
  (4999, 182)	1.0
  (4999, 915)	1.0
  (4999, 325)	1.0
  (4999, 2280)	1.0
  (4999, 7)	1.0
0       0
1       1
2       0
3       0
4       0
       ..
4995    0
4996    1
4997    0
4998    1
4999    0
Name: gender, Length: 5000, dtype: int64


In [29]:
def extract_text(group):
    """Concatenate every entry of ``group['body']`` into one string.

    Each entry is converted to ``str`` first, and entries are joined with a
    single space in row order.
    """
    bodies = group['body'].astype(str)
    return " ".join(bodies.tolist())

# Smoke-test on one author's posts. `training_set` is a plain list of bodies
# by this point, so the rows must be selected from `training_set_dataframe`.
extract_text(training_set_dataframe[training_set_dataframe.author=='RedThunder90'])

'I still prefer to buy foods either grown locally or where animals are treated better, but this definitely has me looking at organic food differently.'

In [34]:
# author -> all of that author's post bodies joined into one string.
# (`training_set` is a plain list and `targets` is not defined in this
# notebook; use the dataframes loaded at the top instead.)
text_dict = {
    author: extract_text(group)
    for author, group in training_set_dataframe.groupby('author')
}

# Order the documents like the targets table so they line up with its labels.
author_text = [text_dict[author] for author in training_targets_dataframe.author]
author_text[0][:100]

'I still prefer to buy foods either grown locally or where animals are treated better, but this defin'

In [35]:
# Unlabelled posts to predict on; read the same way as the training posts.
test_set = pd.read_csv(
    "data/test_data.csv",
    encoding="utf8",
)
test_set.head()

Unnamed: 0,author,subreddit,created_utc,body
0,ejchristian86,TwoXChromosomes,1388534000.0,I hadn't ever heard of them before joining thi...
1,ZenDragon,gaming,1388534000.0,"At 7680 by 4320 with 64x AA, right?"
2,savoytruffle,AskReddit,1388534000.0,bite me
3,hentercenter,stlouisblues,1388534000.0,Damn that was a good penalty :(
4,rick-o-suave,army,1388534000.0,I swore into DEP on 6-OCT and I left 5-NOV und...


In [43]:
# Rebuild the author -> subreddit-indicator-row mapping, this time for the test
# posts. This rebinds `features_dict`, which earlier held the training rows.
features_dict = {
    author: extract_features(group)
    for author, group in test_set.groupby('author')
}

In [44]:
# Stack the per-author rows; row order is the order in which authors first
# appear in the test posts.
X_test = sparse.vstack(
    [features_dict[name] for name in test_set.author.unique()]
)
X_test

<15000x3468 sparse matrix of type '<class 'numpy.float64'>'
	with 144898 stored elements in Compressed Sparse Row format>

In [46]:
# Rebuild the author -> concatenated-text mapping for the test posts. This
# rebinds `text_dict`, which earlier held the training authors' text.
text_dict = {
    author: extract_text(group)
    for author, group in test_set.groupby('author')
}

# One document per test author, in order of first appearance in the test posts.
author_text_test = [text_dict[name] for name in test_set.author.unique()]

In [63]:
# Preview the first 345 characters of the first test author's concatenated text.
author_text_test[0][:345]

"I hadn't ever heard of them before joining this subreddit. They're not really a big thing in the US, but are apparently very common in many European countries.\n\nA menstrual cup is a small silicone cup with a grip on the bottom (usually a small stick, though some companies offer ball and ring grips too) that you fold up and stick in your vagina"

In [57]:
# All-zeros prediction vector with one entry per row of X_test.
# No fitted model is applied here; every author gets 0.
y_pred = np.zeros(X_test.shape[0])

In [58]:
# Two-column table pairing each test author with its predicted value.
# The author order matches the one used to build X_test, so y_pred lines up.
solution = pd.DataFrame(
    {
        "author": test_set.author.unique(),
        "gender": y_pred,
    }
)
solution.head()

Unnamed: 0,author,gender
0,ejchristian86,0.0
1,ZenDragon,0.0
2,savoytruffle,0.0
3,hentercenter,0.0
4,rick-o-suave,0.0
