In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB

In [3]:
# Load the pickled Kickstarter table and drop every row that has a missing
# value in any column.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load a .pkl file that comes from a trusted source.
df = pd.read_pickle('kickstarter_analysis.pkl').dropna()

In [4]:
# Target for the category model: the values of the `category_core` column.
y = df['category_core'].values

In [5]:
# Bag-of-words counts for the project blurbs. min_df=10 keeps only tokens
# that occur in at least 10 blurbs.
vectorizer = CountVectorizer(min_df=10)
blurbs = df['blurb'].values
X = vectorizer.fit_transform(blurbs)

In [6]:
# Fit a multinomial naive Bayes model that maps blurb word counts to category.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = MultinomialNB().fit(X, y)
clf  # bare last expression so the cell still displays the fitted estimator

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
# Vocabulary as an array so it can be indexed with an array of positions.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(); use whichever this install provides
# so the cell runs on both old and current versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = np.asarray(vectorizer.get_feature_names_out())
else:
    words = np.asarray(vectorizer.get_feature_names())

# Class labels, in the column order used by predict_log_proba.
classes = clf.classes_

In [8]:
# One-hot "documents": row j contains a single occurrence of word j, so
# probs[j, k] is the model's log P(class k | a blurb made of word j alone).
# np.identity(n) builds the same dense n x n float matrix as np.eye(n).
vocab_size = X.shape[1]
x = np.identity(vocab_size)
probs = clf.predict_log_proba(x)

In [12]:
# For every category, print the five words with the highest
# P(category | word), most predictive first.
for col, category in enumerate(classes):
    log_p = probs[:, col]
    # argsort is ascending, so reverse it and keep the first five positions.
    top = np.argsort(log_p)[::-1][:5]

    print(category, ":")
    for word, word_log_p in zip(words[top], log_p[top]):
        # probs holds log-probabilities; exponentiate for display.
        print("{} | {:.2f}".format(word, np.exp(word_log_p)))

art :
sculpture | 0.93
mural | 0.91
installation | 0.83
sculptures | 0.83
painting | 0.83
comics :
webcomic | 0.87
comic | 0.78
comics | 0.72
graphic | 0.63
manga | 0.58
crafts :
candles | 0.92
soaps | 0.79
pens | 0.76
candle | 0.74
scented | 0.74
dance :
choreographers | 0.77
ballet | 0.71
dance | 0.66
choreographer | 0.65
choreography | 0.63
design :
font | 0.61
titanium | 0.60
edc | 0.58
poster | 0.54
logos | 0.53
fashion :
footwear | 0.87
sandals | 0.79
clothing | 0.78
shoes | 0.78
apparel | 0.77
film & video :
webseries | 0.93
cortometraje | 0.91
film | 0.87
mockumentary | 0.86
animated | 0.84
food :
gourmet | 0.93
bakery | 0.92
sauces | 0.92
sauce | 0.89
brewery | 0.89
games :
platformer | 0.90
uspcc | 0.90
rpg | 0.88
28mm | 0.88
strategy | 0.86
journalism :
journalism | 0.66
news | 0.50
journalists | 0.45
reporting | 0.41
coverage | 0.39
music :
ep | 0.97
album | 0.97
cd | 0.94
lp | 0.92
recording | 0.92
photography :
photobook | 0.87
nudes | 0.77
photographing | 0.73
photograph

For the most part, the top 5 words for each category make sense: they describe the category they're in (e.g. the probability of "art" given that the word is "sculpture" is 93%). It's surprising that not every category has at least a few strongly predictive words (i.e. words giving a probability > 90%). The most predictive word for "dance" is "choreographers" at 77%, and the most predictive word for "design" is "font" at 61%.  

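Additionally, there are words that don't seem to make much sense at first glance, such as "edc" predicting the "design" category and "28mm" predicting "games". These are likely domain jargon: "EDC" is short for "everyday carry", and 28mm is a common scale for tabletop miniatures.   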

Since Kickstarter projects are international, some non-English words are also strong predictors of a category, such as "teatro" for theater and "cortometraje" (short film) for film & video. 

# Predicting Fails

In [95]:
# Target for the failure model: the values of the `failed` column.
# NOTE(review): presumably 1 = failed and 0 = not failed (the table printed
# further down is labelled "P(1 | word)") — confirm against the data.
fy = df['failed'].values

In [97]:
# Bag-of-words counts for the failure model. This rebinds `vectorizer` to a
# fresh CountVectorizer with the same settings and input as the one fitted
# for the category model; min_df=10 keeps tokens found in at least 10 blurbs.
vectorizer = CountVectorizer(min_df=10)
blurbs = df['blurb'].values
fX = vectorizer.fit_transform(blurbs)

In [98]:
# Fit a multinomial naive Bayes model that maps blurb word counts to the
# `failed` target. This rebinds `clf`, replacing the category model.
clf = MultinomialNB()
clf.fit(fX, fy)

# Vocabulary as an array so it can be indexed with an array of positions.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(); use whichever this install provides
# so the cell runs on both old and current versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = np.asarray(vectorizer.get_feature_names_out())
else:
    words = np.asarray(vectorizer.get_feature_names())

# Class labels, in the column order used by predict_log_proba.
classes = clf.classes_

In [111]:
# One-hot "documents": row j contains a single occurrence of word j.
# Size the matrix from fX, the matrix this model was fitted on, rather than
# from X, which belongs to the category section and may be stale or undefined.
x = np.eye(fX.shape[1])

# Column 0 of predict_log_proba is log P(clf.classes_[0] | word).
# NOTE(review): assumes classes_[0] is the "not failed" label and classes_[1]
# is the "failed" label (1), so that 1 - exp(p) is P(failed | word) — confirm.
probs = clf.predict_log_proba(x)[:, 0]

# Ascending log P(not failed) is the same order as descending P(failed).
ind = np.argsort(probs)

# First ten positions: words most associated with failure.
failed_words = words[ind[:10]]
failed_log_p = probs[ind[:10]]

# Last ten positions: words most associated with success.
success_words = words[ind[-10:]]
success_log_p = probs[ind[-10:]]

print("Failed words\t     P(1 | word)")
for w, p in zip(failed_words, failed_log_p):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

print("Successful words\t     P(1 | word)")
for w, p in zip(success_words, success_log_p):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

Failed words	     P(1 | word)
               tengo 0.95
           ecommerce 0.95
         contractors 0.94
         videography 0.94
               tarts 0.94
             webpage 0.94
              bijoux 0.94
             pallets 0.93
            toppings 0.93
                buen 0.93
Successful words	     P(1 | word)
             badgirl 0.04
                32mm 0.04
       choreographic 0.04
            calavera 0.03
            wargames 0.03
                  5e 0.03
                28mm 0.02
                 gbs 0.01
            everette 0.01
             hartsoe 0.01
