In [59]:
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

In [60]:

tokenizer = RegexpTokenizer(r'\w+')
en_stopwords= set(stopwords.words('english'))
ps=PorterStemmer()

In [61]:
data=pd.read_csv(r'C:\Users\Mridul Gupta\Jupyter Notebook\Test_Assignments_Extras\all.csv')

data.head(4)

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore


In [93]:
X_train=data.iloc[:,1].values
Y_train=data.iloc[:,4:].values

X_test=data.iloc[400:,1].values
Y_test=data.iloc[400:,4:].values

In [94]:
Y_test=Y_test.reshape((-1,1)).flatten()
Y_train=Y_train.reshape((-1,1)).flatten()


In [95]:
def getCleanPoem(P):
    
    P=P.lower()
    P=P.replace("\r\n", "")
    
    #Tokenize
    
    tokens = tokenizer.tokenize(P)
    new_tokens= [token for token in tokens if token not in en_stopwords]
    
    stemmed_tokens= [ps.stem(token) for token in new_tokens]
    
    cleaned_Poem=' '.join(stemmed_tokens)
    #cleaned_Poem = cleaned_Poem.splitlines()
    
    return (cleaned_Poem)    

In [96]:
x_train_clean= [getCleanPoem(i) for i in X_train]

x_test_clean= [getCleanPoem(i) for i in X_test]

In [97]:
#Vectorize training dataset
cv = CountVectorizer(ngram_range=(1,2))

x_vec = cv.fit_transform(x_train_clean).toarray()
print(x_vec)
print(x_vec.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(573, 51938)


In [98]:
## Vectorization on the test set
xt_vec = cv.transform(x_test_clean).toarray()
print(xt_vec)
cv.get_feature_names()
print(xt_vec.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(173, 51938)


In [99]:
from sklearn.model_selection import cross_val_score

In [100]:
from sklearn import svm

In [101]:
svc=svm.SVC()

svc.fit(x_vec,Y_train)
cross_val_score(svc,xt_vec,Y_test,scoring="accuracy",cv=5).mean()

0.5203361344537815

In [102]:
#Using grid search

In [103]:
params=[
    {'kernel' :['linear','rbf','poly','sigmoid'],
    'C':[0.1,0.2,0.5,1.0,2.0,5.0]
    }
]

In [104]:
import multiprocessing
from sklearn.model_selection import GridSearchCV


cpus=multiprocessing.cpu_count()


In [105]:
gs=GridSearchCV(estimator=svm.SVC(),param_grid=params,scoring="accuracy",cv=5,n_jobs=cpus)

In [None]:
gs.fit(x_vec,Y_train)

In [None]:
gs.best_estimator_

In [None]:
gs.best_score_