In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

import os

In [10]:
def extract_data(dirname='data/blogs/'):
    """
    Extracts information regarding gender, age,
    blog_type and zodiac in the filename
    """
    gender = []
    age = []
    blog_type = []
    zodiac = []
    
    for filename in os.listdir(dirname):
        filename_tokens = filename.split('.')
        
        gender.append(filename_tokens[1])
        age.append(int(filename_tokens[2]))
        blog_type.append(filename_tokens[3])
        zodiac.append(filename_tokens[4])
    
    return (gender, age, blog_type, zodiac)

In [11]:
gender, age, blog_type, zodiac = extract_data()

In [51]:
def get_content(data_dir='data/blogs/',n=50):
    """
    Read only 50 blogs to construct this dataset
    and read only a single post from each of those
    50 blogs.
    """
    
    blog_posts = []
    
    for filename in os.listdir(data_dir)[:n]:
        full_filename = data_dir + filename
        
        with open(full_filename) as infile:
            parsed_xml = BeautifulSoup(infile, 'lxml')
            
            posts = parsed_xml.find_all('post')
            post_text = ''
            
            for post in posts[:1]:
                post = post.get_text().strip()
                post = post.lower()
                post_text += post
            
            blog_posts.append(post_text)
    
    return blog_posts

In [52]:
blog_posts_text = get_content()

In [53]:
# column names
columns = ['gender', 'age', 'blog_type', 'zodiac', 'blog_text']

In [54]:
# create a pandas dataframe out of it
def prepare_dataset(columns, values):
    
    data = dict(zip(columns, values))
    df = pd.DataFrame(data)
    
    return df

In [55]:
data = prepare_dataset(columns, [gender[:50], age[:50], blog_type[:50], zodiac[:50], blog_posts_text[:50]])
data.head()

Unnamed: 0,age,blog_text,blog_type,gender,zodiac
0,37,"well, everyone got up and going this morning. ...",indUnk,female,Leo
1,17,"yeah, sorry for not writing for a whole there,...",Student,female,Libra
2,23,"cupid,please hear my cry, cupid, please let yo...",Arts,male,Capricorn
3,25,and did i mention that i no longer have to dea...,Arts,female,Cancer
4,25,b-logs: the business blogs paradox urllink ...,Engineering,male,Sagittarius


In [164]:
%run scripts/features.py

In [165]:
features = ['blog_text', 'blog_type', 'zodiac']

X = data[features]
y = data['gender']

In [166]:
from sklearn.preprocessing import LabelEncoder

lbl = LabelEncoder()
lbl.fit(y)

y = lbl.transform(y)

In [167]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=44)

In [168]:
# shape of training and testing set
print X_train.shape, X_test.shape

(42, 3) (8, 3)


In [169]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression

In [170]:
ft = FeatureTransformer(X_train, X_test)
log_reg = LogisticRegression()

In [171]:
pipeline_logistic = Pipeline([('ft', ft), ('log_reg', log_reg)])

In [172]:
# fit a logistic regression pipeline
pipeline_logistic.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('log_reg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

In [174]:
from sklearn.metrics import accuracy_score

predsTrain = pipeline_logistic.predict(X_train)
predsTest = pipeline_logistic.predict(X_test)

print 'Accuracy score on the training examples %f ' %(accuracy_score(y_train, predsTrain))
print 'Accuracy score on the test examples %f ' %(accuracy_score(y_test, predsTest))

Accuracy score on the training examples 0.642857 
Accuracy score on the test examples 0.375000 
