In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib as plt

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import RandomOverSampler

In [2]:
# connect to SQL database and convert to dataframes

cnx = sqlite3.connect('../data/database.sqlite')

categories = pd.read_sql_query("SELECT * FROM categories", cnx)
reviews = pd.read_sql_query("SELECT * FROM reviews", cnx)
podcasts = pd.read_sql_query("SELECT * FROM podcasts", cnx)

In [3]:
# top 5 grossing podcasts according to statista

podcasts[podcasts['slug'].str.contains('armchair')]
podcasts[podcasts['slug'].str.contains('joe-rogan')]
podcasts[podcasts['slug'].str.contains('my-favorite')]
podcasts[podcasts['slug'].str.contains('dave-ram')]
podcasts[podcasts['slug'].str.contains('bill-simmons')]

Unnamed: 0,podcast_id,itunes_id,slug,itunes_url,title


In [4]:
# categories for armchair expert
print(categories[categories['podcast_id'] == 'd4baff0192b5f556b0c3f2a273e98e93'])

# categories for JRE
print(categories[categories['podcast_id'] == 'db93a05fe337aea2061b59c96e647100'])

# categories for my favorite murder
print(categories[categories['podcast_id'] == 'bc5ddad3898e0973eb541577d1df8004'])

                             podcast_id category
10736  d4baff0192b5f556b0c3f2a273e98e93   comedy
10737  d4baff0192b5f556b0c3f2a273e98e93  tv-film
                             podcast_id category
10731  db93a05fe337aea2061b59c96e647100   comedy
                             podcast_id category
10732  bc5ddad3898e0973eb541577d1df8004   comedy


In [5]:
# merge categories and reviews columns to collect comedy review data

reviews_categories = pd.merge(reviews, categories)
comedy_reviews = reviews_categories[reviews_categories['category'] == 'comedy']

In [19]:
# extract comedy reviews from dataframe

print('{} total comedy reviews'.format(len(comedy_reviews)))

X = comedy_reviews['content']
y = comedy_reviews['rating'].apply(lambda x: 1 if x > 3 else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y)

249482 total comedy reviews


In [9]:
# preliminary model

text_clf_v1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf_v1.fit(X_train, y_train)
predicted = text_clf_v1.predict(X_test)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.75      0.02      0.04      6221
           1       0.90      1.00      0.95     56150

    accuracy                           0.90     62371
   macro avg       0.82      0.51      0.49     62371
weighted avg       0.89      0.90      0.86     62371



In [31]:
# add stop words, change to complement NB

text_clf_v2 = Pipeline([
    ('vect', CountVectorizer(stop_words = 'english')),
    ('tfidf', TfidfTransformer()),
    ('clf', ComplementNB()),
])

text_clf_v2.fit(X_train, y_train)
predicted = text_clf_v2.predict(X_test)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90     56114
           1       0.92      0.87      0.90     56296

    accuracy                           0.90    112410
   macro avg       0.90      0.90      0.90    112410
weighted avg       0.90      0.90      0.90    112410



In [21]:
X = np.array(comedy_reviews['content']).reshape(-1,1)
y = np.array(comedy_reviews['rating'].apply(lambda x: 1 if x > 3 else 0))

In [27]:
# oversample from minority class

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(np.concatenate(X_resampled, axis=0), y_resampled)

In [32]:
text_clf_v2.fit(X_train, y_train)
predicted = text_clf_v2.predict(X_test)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90     56114
           1       0.92      0.87      0.90     56296

    accuracy                           0.90    112410
   macro avg       0.90      0.90      0.90    112410
weighted avg       0.90      0.90      0.90    112410



In [42]:
# get most prominent features
text_clf_v2['clf'].get_params
# coefs = np.array(text_clf_v2['clf'].coef_)
# bag = text_clf_v2['vect'].get_feature_names()

<bound method BaseEstimator.get_params of ComplementNB()>