# Reimplementing our model with Pipelines

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('../data/book_genre_preprocessed.csv')

In [2]:
X = df[['combined_clean','title','summary']]
#X = df['combined_clean']
y = df['genre']
print(X.shape, y.shape)

(4640, 3) (4640,)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

from sklearn.svm import LinearSVC
svc = LinearSVC()

In [4]:
from sklearn.base import TransformerMixin, BaseEstimator

class ColumnExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return np.asarray(X[self.column_name]).astype(str)

In [5]:
from sklearn.pipeline import Pipeline, FeatureUnion

pipeline = Pipeline([
    ('column_extractor', ColumnExtractor('combined_clean')),
    ('tfidf', tfidf), # vectorize combined text
    ('svc', svc),     # feed the output to classifier
])

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

def run_experiment(X, y, pipeline, num_expts=100):
    scores = list()
    for i in range(num_expts):
        X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2)
        model = pipeline.fit(X_train, y_train) # train the classifier
        y_pred = model.predict(X_test)         # apply model to test set
        score = accuracy_score(y_true, y_pred)
        scores.append(score)

    print(f"Average accuracy over {num_expts} experiments: {sum(scores) / num_expts}\n")
    print("Classification report for the last experiment:\n")
    print(classification_report(y_true, y_pred))

In [7]:
run_experiment(X, y, pipeline, num_expts=5)

Average accuracy over 5 experiments: 0.7172413793103447

Classification report for the last experiment:

              precision    recall  f1-score   support

       crime       0.82      0.75      0.78       110
     fantasy       0.79      0.81      0.80       166
     history       0.75      0.76      0.76       119
      horror       0.66      0.66      0.66       117
  psychology       0.90      0.41      0.56        22
     romance       0.25      0.07      0.11        15
     science       0.74      0.80      0.77       118
      sports       0.84      0.76      0.80        21
    thriller       0.68      0.76      0.72       222
      travel       0.86      0.67      0.75        18

    accuracy                           0.74       928
   macro avg       0.73      0.64      0.67       928
weighted avg       0.74      0.74      0.73       928



# Adding features to our pipeline

In [8]:
class Apply(BaseEstimator, TransformerMixin):
    def __init__(self, fn):
        self.fn = np.vectorize(fn)
        
    def transform(self, data):
        return self.fn(data.reshape(data.size, 1))
    
    def fit(self, *_):
        return self

In [53]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7, weights='distance')

In [54]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def stopword_count(text):
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'[^\w\s]|_', '', text)
    stopwords_in_text = [w for w in text.split() if w in stop_words]
    return len(stopwords_in_text)

[nltk_data] Downloading package stopwords to /home/mia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [74]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pipeline2 = Pipeline([
    ('features', FeatureUnion([
        ('combined', Pipeline([
            ('combined_extractor', ColumnExtractor('combined_clean')),
            ('tfidf', tfidf),
            ('to_array', toarray())
            #('scale', StandardScaler(with_mean=False)),
            #('pca', PCA(n_components=4))
        ])),
        ('title', Pipeline([
            ('title_extractor', ColumnExtractor('title')), # extract titles
            ('title_features', FeatureUnion([
                ('title_char_count', Apply(lambda s: len(s))),
                ('title_word_count', Apply(lambda s: len(s.split()))),
                ('title_stopword_count', Apply(stopword_count))
            ])),
        ]))
    ])),
    ('svc', svc),     # feed the output to classifier
    #('knn', knn)
])

NameError: name 'toarray' is not defined

In [72]:
run_experiment(X,y,pipeline2,num_expts=1)

TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.