In [1]:
import numpy as np
import pandas as pd
import nltk

In [227]:
df = pd.read_csv('all.csv')

In [241]:
df.columns

Index(['author', 'content', 'poem name', 'age', 'type'], dtype='object')

In [242]:
author = df['author']
content = df['content']

In [243]:
import re

def clean(string):
    for char in ['\r', '\n', '-', ',', '.', ';']:
        string = string.replace(char, ' ')
    return re.sub('\s+', ' ', string)

In [244]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
content = content.apply(lambda x: ps.stem(clean(x)))

In [245]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(content)

In [246]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(author)

In [247]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=150)
X = svd.fit_transform(X)

In [261]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [268]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=150, criterion='gini')
rfc.fit(X_train, y_train)
if X_test.shape[0]:
   print(accuracy_score(rfc.predict(X_test), y_test))   

0.5


In [269]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)
mlp.fit(X_train, y_train)
if X_test.shape[0]:
   print(accuracy_score(mlp.predict(X_test), y_test))

0.5290697674418605


In [270]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf', C=2, gamma=2)
svm.fit(X_train, y_train)
if X_test.shape[0]:
   print(accuracy_score(svm.predict(X_test), y_test))

0.5581395348837209


In [271]:
from sklearn.ensemble import VotingClassifier

estimators = [
           ('svm', svm),
           ('mlp', mlp),
           ('rfc', rfc)
]

vc = VotingClassifier(estimators=estimators, voting='hard')
vc.fit(X_train, y_train)
if X_test.shape[0]:
   print(accuracy_score(vc.predict(X_test), y_test))

0.5174418604651163


In [274]:
t = "roses are red, violets are blue "
le.inverse_transform(vc.predict(svd.transform(vectorizer.transform([ps.stem(clean(t))]))))

array(['WILLIAM SHAKESPEARE'], dtype=object)