# Authorship detection with SVM

Load the data:

In [1]:
import pandas as pd

# Read the CSV, relabel the author column as the target (y) and the comment
# body as the input text (X), then keep only those two columns. Selecting
# just y and X also drops the subreddit column.
df = (
    pd.read_csv('data/reddit2010-06_subset.csv')
    .rename(columns={'author': 'y', 'body': 'X'})
    .loc[:, ['y', 'X']]
)

In [2]:
# Preview the first five rows to check the renamed y / X columns.
df.head(n=5)

Unnamed: 0,y,X
0,DaimyoNoNeko,I'm like this in far less complicated setup. \...
1,AJRiddle,Definitely a good reason. I think it will attr...
2,thesearenotthehammer,The people I share with generally don't have t...
3,awj,I have the Zelda Reorchestrated version of WW ...
4,Andrewr05,Just a question here.\n\nWho else thinks that ...


In [3]:
#test-training split: hold out 10% of the rows for evaluation
from sklearn.model_selection import train_test_split

# random_state is fixed so the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['X'],
    df['y'],
    test_size=0.1,
    random_state=42,
)

Build a scikit-learn pipeline that converts the text into a document-term matrix, applies a tf-idf transformation, and then fits a linear SVM trained with stochastic gradient descent.

In [4]:
#Pre-processing and SVM pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

# The step names ('vect', 'tfidf', 'clf') are the prefixes used to address
# each step's hyper-parameters, e.g. 'clf__alpha'.
text_clf = Pipeline(steps=[
    # raw text -> token-count (document-term) matrix
    ('vect', CountVectorizer()),
    # token counts -> tf-idf weights
    ('tfidf', TfidfTransformer()),
    # hinge loss + L2 penalty = linear SVM, optimized with SGD
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=0.00005,
        random_state=42,
    )),
])



Fit the model on the training set and predict the author of each comment in the held-out test set.

In [6]:
%%time
#fit on the training split, then predict authors for the test split
# (fit() returns the fitted pipeline, so the two calls can be chained)
predicted = text_clf.fit(X_train, y_train).predict(X_test)

CPU times: user 51.1 s, sys: 167 ms, total: 51.2 s
Wall time: 51.2 s


Assess accuracy on the test set: about 16%, which is not great.

In [10]:
#accuracy: fraction of test comments attributed to the correct author
from sklearn import metrics

print(metrics.accuracy_score(y_true=y_test, y_pred=predicted))

# Optional: uncomment for a per-author precision/recall/F1 breakdown.
#print(metrics.classification_report(y_test, predicted))

0.156289707751


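Run a grid search over the regularization parameter alpha. It doesn't help: the best mean cross-validated score is only about 0.135, and the selected alpha (5e-05) is the value the pipeline already uses.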

In [11]:
#Grid search over the SGD regularization strength (alpha)
from sklearn.model_selection import GridSearchCV

# 'clf__alpha' targets the alpha parameter of the pipeline step named 'clf'.
parameters = {
    'clf__alpha': (0.00005, 0.0005, 0.005, 0.05, 0.5, 1),
}

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
# fit() returns the fitted search object, so construction and fitting chain.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(X_train, y_train)

# Mean cross-validated score of the best alpha.
gs_clf.best_score_



0.13489653224097747

In [12]:
#best parameters from grid search, listed in alphabetical order
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s: %r" % (param_name, best_value))

clf__alpha: 5e-05
