In [1]:
# custom functions for this project
from functions import *

# dataframe libraries
import pandas as pd
import numpy as np

# graphing libraries
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')
from wordcloud import WordCloud

# text processing
import nltk
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer 
from textblob import TextBlob as tb
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import matutils, models
import scipy.sparse
import pronouncing

# modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# miscellany
import re
import string
from collections import Counter
import time
import gzip
import pickle

# reload functions/libraries when edited
# (autoreload mode 2 re-imports edited modules before each cell executes, so
# changes to the local `functions` module take effect without a kernel restart)
%load_ext autoreload
%autoreload 2

# ignore warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation warnings -- consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# increase column width of dataframe
# (up to 150 characters of each cell are shown before pandas truncates the text)
pd.set_option('max_colwidth', 150)

In [4]:
# read the gzip-compressed, pickled dataframe from disk
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source
with gzip.open('data/poetry_umbrella_genres_df.pkl', mode='rb') as pkl_file:
    df = pickle.load(pkl_file)

In [5]:
# list every column available in the loaded dataframe
df.columns

Index(['poet_url', 'poem_url', 'poet', 'title', 'poem_lines', 'poem_string',
       'genre', 'clean_lines', 'num_lines', 'num_words', 'avg_len_line',
       'sentiment_polarity_score', 'sentiment_polarity',
       'sentiment_subjectivity_score', 'num_end_rhymes', 'end_rhyme_ratio',
       'end_rhyme', 'num_syllables', 'avg_syllables_word', 'lines_titled',
       'string_titled', 'string_cleaned'],
      dtype='object')

In [None]:
# columns carried forward: the target (genre), the descriptive feature columns,
# and the text versions of each poem
keep_cols = [
    'genre',
    'num_lines', 'num_words', 'avg_len_line',
    'sentiment_polarity_score', 'sentiment_polarity', 'sentiment_subjectivity_score',
    'num_end_rhymes', 'end_rhyme_ratio', 'end_rhyme',
    'avg_syllables_word',
    'lines_titled', 'string_titled', 'string_cleaned',
]
df = df[keep_cols]


In [None]:
#### Let's separate our target variable and create a features dataframe

target = df['genre']

# Select the descriptive feature columns directly. 'title' and 'clean_lines' are
# deliberately not requested: they were only ever selected in order to be dropped
# again, and they are no longer present in df once the column-trimming cell above
# has run, so asking for them raises a KeyError.
feature_cols = ['num_lines', 'avg_len_line', 'sentiment_polarity_score', 'sentiment_polarity',
                'sentiment_subjectivity_score', 'num_end_rhymes', 'end_rhyme_ratio']

# .copy() makes features an independent frame, so later edits to it can neither
# touch df nor trigger SettingWithCopy problems (no inplace drop on a slice needed)
features = df[feature_cols].copy()
features.columns

In [None]:
### Trim dataframe

# NOTE: df is intentionally NOT rebuilt from `target` and `features` here.
# Doing so would discard 'string_cleaned', 'lines_titled' and 'string_titled',
# which the modelling sections below read from df / df_top.
df.genre.value_counts()

df.genre.value_counts(normalize=True).cumsum()

# keep only the most common genres; value_counts() is sorted by frequency (descending)
N_TOP_GENRES = 8
top8 = df.genre.value_counts().index[:N_TOP_GENRES].tolist()
top8

df.shape

# .copy() so df_top is an independent frame rather than a slice of df
df_top = df[df.genre.isin(top8)].copy()
df_top.shape

df_top.genre.value_counts(normalize=True)

### TF-IDF Vectorizer and a baseline model

X = df_top.string_cleaned
y = df_top.genre

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# instantiate the vectorizer
vectorizer = TfidfVectorizer()

# learn the vocabulary/IDF weights from the training text only, then apply the
# same mapping to the test text
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

bnb_baseline = BernoulliNB()
bnb_baseline.fit(X_train_vec, y_train)

# predict genres for the held-out poems
y_preds = bnb_baseline.predict(X_test_vec)

# compute the performance measures
bnb_baseline_acc = accuracy_score(y_test, y_preds)
bnb_baseline_f1 = f1_score(y_test, y_preds, average='weighted')

print(f'Accuracy: {bnb_baseline_acc}')
print(f'F1 score: {bnb_baseline_f1}')

print('\n' + '-' * 100 + '\n')

# Report rows must be labelled in sorted label order. The previous
# target_names=list(y.unique()) listed genres in order of first appearance, which
# scikit-learn applies positionally to its sorted labels -- so the metrics were
# printed next to the wrong genre names.
genre_labels = sorted(y.unique())
print(classification_report(y_test, y_preds, labels=genre_labels))

print('\n' + '-' * 100 + '\n')

# rows/columns of the matrix follow genre_labels
print("confusion matrix:")
print(confusion_matrix(y_test, y_preds, labels=genre_labels))

### Let's try vectorizing beforehand

# instantiate the vectorizer
vectorizer = TfidfVectorizer()

# Fit on ALL documents before splitting (that is the experiment here).
# NOTE: the vocabulary and IDF weights therefore include the test poems, so the
# scores below are not a clean held-out estimate.
# Reading the text straight from df_top (instead of the reusable name X) keeps
# this cell re-runnable after X has been replaced by the vectorized matrix.
X_vec = vectorizer.fit_transform(df_top.string_cleaned)

X = pd.DataFrame.sparse.from_spmatrix(X_vec)
y = df_top.genre

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

bnb_baseline = BernoulliNB()
bnb_baseline.fit(X_train, y_train)

# predict genres for the held-out poems
y_preds = bnb_baseline.predict(X_test)

# compute the performance measures
bnb_baseline_acc = accuracy_score(y_test, y_preds)
bnb_baseline_f1 = f1_score(y_test, y_preds, average='weighted')

print(f'Accuracy: {bnb_baseline_acc}')
print(f'F1 score: {bnb_baseline_f1}')

print('\n' + '-' * 100 + '\n')

# Sorted labels keep each report row next to the correct genre;
# list(y.unique()) is in order of first appearance and mislabels the rows.
genre_labels = sorted(y.unique())
print(classification_report(y_test, y_preds, labels=genre_labels))

print('\n' + '-' * 100 + '\n')

# rows/columns of the matrix follow genre_labels
print("confusion matrix:")
print(confusion_matrix(y_test, y_preds, labels=genre_labels))

### Let's try with our numerical data

def _fit_and_report(model, X_tr, y_tr, X_te, y_te, labels):
    """Fit `model`, print its test-set metrics, and return the results.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X)
    X_tr, y_tr : training features and targets
    X_te, y_te : test features and targets
    labels : list of class labels fixing the row/column order of the
        classification report and the confusion matrix

    Returns
    -------
    (accuracy, weighted_f1, predictions) for the test set
    """
    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)

    acc = accuracy_score(y_te, preds)
    f1 = f1_score(y_te, preds, average='weighted')

    print(f'Accuracy: {acc}')
    print(f'F1 score: {f1}')

    print(classification_report(y_te, preds, labels=labels))

    print("confusion matrix:")
    print(confusion_matrix(y_te, preds, labels=labels))

    print('------------------------------')
    return acc, f1, preds


X = df_top.drop(columns=['genre', 'sentiment_polarity', 'lines_titled', 'string_titled'])
y = df_top.genre

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# instantiate the vectorizer
vectorizer = TfidfVectorizer()

# learn vocabulary/IDF from the training text only, then apply it to the test text
X_train_vec = vectorizer.fit_transform(X_train.string_cleaned)
X_test_vec = vectorizer.transform(X_test.string_cleaned)

X_train_vec

X_train_vec_df = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_test_vec_df = pd.DataFrame.sparse.from_spmatrix(X_test_vec)

# everything except the raw text is treated as a numeric feature
# NOTE(review): assumes the remaining columns (incl. 'end_rhyme') are numeric -- confirm
X_train_nums = X_train.drop(columns=['string_cleaned'])
X_test_nums = X_test.drop(columns=['string_cleaned'])

# scale to [0, 1] using training-set ranges (MultinomialNB rejects negative inputs)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_nums)
X_test_scaled = scaler.transform(X_test_nums)
X_train_scaled_df = pd.DataFrame(X_train_scaled)
X_test_scaled_df = pd.DataFrame(X_test_scaled)

# Both frames in each pair are built from arrays and so carry the default
# 0..n-1 index, which is what lets concat line the rows up side by side.
X_train_combo = pd.concat([X_train_scaled_df, X_train_vec_df], axis=1)
X_test_combo = pd.concat([X_test_scaled_df, X_test_vec_df], axis=1)

# Sorted labels keep each report row next to the correct genre;
# list(y.unique()) is in order of first appearance and mislabels the rows.
genre_labels = sorted(y.unique())

mnb_baseline = MultinomialNB()
mnb_baseline_acc, mnb_baseline_f1, y_preds = _fit_and_report(
    mnb_baseline, X_train_combo, y_train, X_test_combo, y_test, genre_labels)

bnb_baseline = BernoulliNB()
bnb_baseline_acc, bnb_baseline_f1, y_preds = _fit_and_report(
    bnb_baseline, X_train_combo, y_train, X_test_combo, y_test, genre_labels)



# Same "vectorize first" experiment, this time on every genre in df
y = df.genre

# instantiate the vectorizer
vectorizer = TfidfVectorizer()

# Fit on ALL documents before splitting.
# NOTE: vocabulary and IDF weights therefore include the test poems.
# The text is read straight from df so the cell stays re-runnable after X has
# been replaced by the vectorized matrix below.
X_vec = vectorizer.fit_transform(df.string_cleaned)

X = pd.DataFrame.sparse.from_spmatrix(X_vec)

# NOTE(review): stratify=y needs at least two poems in every genre; since no
# genre filtering is applied here, confirm rare genres do not break the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

bnb_baseline = BernoulliNB()
bnb_baseline.fit(X_train, y_train)

# predict genres for the held-out poems
y_preds = bnb_baseline.predict(X_test)

# compute the performance measures
bnb_baseline_acc = accuracy_score(y_test, y_preds)
bnb_baseline_f1 = f1_score(y_test, y_preds, average='weighted')

print(f'Accuracy: {bnb_baseline_acc}')
print(f'F1 score: {bnb_baseline_f1}')

print('\n' + '-' * 100 + '\n')

# Sorted labels keep each report row next to the correct genre;
# list(y.unique()) is in order of first appearance and mislabels the rows.
genre_labels = sorted(y.unique())
print(classification_report(y_test, y_preds, labels=genre_labels))

print('\n' + '-' * 100 + '\n')

# rows/columns of the matrix follow genre_labels
print("confusion matrix:")
print(confusion_matrix(y_test, y_preds, labels=genre_labels))



# number of distinct terms the vectorizer learned
len(vectorizer.vocabulary_)

# preview a handful of term -> column-index pairs; the full mapping is a plain
# dict (no repr truncation) and would flood the output
dict(list(vectorizer.vocabulary_.items())[:10])

# inverse-document-frequency weights, one per vocabulary column
vectorizer.idf_




# list of text documents
# (read from df, as the model section above does: `features` only holds the
# descriptive columns and has no 'string_cleaned')
text = df.string_cleaned
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize -- show the size and a small sample instead of the entire mapping
print(f'vocabulary size: {len(vectorizer.vocabulary_)}')
print(dict(list(vectorizer.vocabulary_.items())[:10]))
print(vectorizer.idf_)
# encode the first document; .iloc is positional, so this works even when the
# index does not contain the label 0
vector = vectorizer.transform([text.iloc[0]])
# summarize encoded vector
print(vector.shape)
print(vector.toarray())