In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, fbeta_score, precision_recall_curve

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [222]:
df = pd.read_pickle('kickstarter_analysis.pkl')
df = df.dropna()

# Blurbs

In [223]:
y = df['failed'].values
words = set(nltk.corpus.words.words())

In [13]:
def non_eng(string):
    phrase = " ".join(w for w in nltk.wordpunct_tokenize(string) if w.lower() in words or not w.isalpha())
    return phrase

In [130]:
## PROCESSESING USING STEMMING
#stemmer = SnowballStemmer("english")
#df['blurb_fix'] = df['blurb'].str.lower() ##lowercase
#df['blurb_fix'] = df['blurb_fix'].apply(non_eng) ##remove non-english words
#df['blurb_fix'] = df['blurb_fix'].str.replace(r'[^\w\s]+', '') ## remove punctuation
#df['blurb_fix'] = df["blurb_fix"].apply(lambda x: [stemmer.stem(y) for y in x.split()]) ##stem words
#df['blurb_fix'] = df['blurb_fix'].apply(lambda x: ' '.join(x)) ##put into one string

In [225]:
## PROCESSESING USING LEMMATIZATION
lemmatizer = WordNetLemmatizer() 
df['blurb_fix'] = df['blurb'].str.lower() ##lowercase
df['blurb_fix'] = df['blurb_fix'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x.split()]) ##get lemma
df['blurb_fix'] = df['blurb_fix'].apply(lambda x: ' '.join(x)) ##put into one string
df['blurb_fix'] = df['blurb_fix'].apply(non_eng) ##remove non-english words
df['blurb_fix'] = df['blurb_fix'].str.replace(r'[^\w\s]+', '') ## remove punctuation

In [226]:
vectorizer = CountVectorizer(min_df = 10, stop_words='english')
X = vectorizer.fit_transform(df['blurb_fix'])

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.4, random_state = 42)

In [149]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

trn = clf.predict(X_train)
## tst = clf.predict(X_test)
tst = clf.predict_proba(X_test)[:,1] > .4

print("train score:", fbeta_score(y_train, trn, beta = 1.1))
print("test score:", fbeta_score(y_test, tst, beta = 1.1))

pd.crosstab(y_test, tst, rownames=['True'], colnames=['Predicted'], margins=True)

train score: 0.6541633212314057
test score: 0.6503898088769411


Predicted,False,True,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,24889,13259,38148
1,9157,20488,29645
All,34046,33747,67793


In [None]:
df['MNB_All'] = clf.predict(X)

In [150]:
vectorizer = TfidfVectorizer(min_df = 10, stop_words='english')
Xt = vectorizer.fit_transform(df['blurb_fix'])

In [151]:
Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt, y, 
                                                   test_size = 0.4, random_state = 42)

In [165]:
clf = MultinomialNB()
clf.fit(Xt_train, yt_train)

trnt = clf.predict(Xt_train)
## tst = clf.predict(X_test)
tstt = clf.predict_proba(Xt_test)[:,1] > .4

print("train score:", fbeta_score(yt_train, trnt, beta = 1.1))
print("test score:", fbeta_score(yt_test, tstt, beta = 1.1))

pd.crosstab(yt_test, tstt, rownames=['True'], colnames=['Predicted'], margins=True)

train score: 0.6219979022195797
test score: 0.661032017903981


Predicted,False,True,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,22673,15475,38148
1,7733,21912,29645
All,30406,37387,67793


In [166]:
df['MNB_All'] = clf.predict(X)

# Blurb Categories

In [5]:
cats = list(df['category_core'].unique())

NameError: name 'df' is not defined

In [228]:
stored = {}

In [4]:
stored = {}
for c in cats:
     stored[c] = df[df['category_core'] == c]

NameError: name 'cats' is not defined

In [230]:
words = set(nltk.corpus.words.words())

In [231]:
for c in cats:
    data = stored[c]
    y = data['failed'].values
    
    vectorizer = TfidfVectorizer(min_df = 10, stop_words='english')
    X = vectorizer.fit_transform(data['blurb_fix'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.4, random_state = 42)

    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    stored[c]['MNB_cat'] = clf.predict_proba(X)[:,1]>.4
    
    print("THE WORD IS {}".format(c))
    words = np.array(vectorizer.get_feature_names())
    classes = clf.classes_
    x = np.eye(X.shape[1])
    probs = clf.predict_log_proba(x)
    for i, c in enumerate(classes):
        prob = probs[:,i]
        ind = np.argsort(prob)[::-1]
    
        good_words = words[ind[:5]]
    
        good_prob = prob[ind[:5]]
    
    #print("Associated words\t     P({} | word)".format(c))
    #for w, p in zip(good_words, good_prob):
    #    print("{:>35}".format(w), "{:.2f}".format(np.exp(p)))
        
        print(c,":")
        for w, p in zip(good_words, good_prob):
            print("{} | {:.2f}".format(w, np.exp(p)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS food
0 :
keto | 0.75
bitter | 0.72
knife | 0.71
butcher | 0.70
iconic | 0.69
1 :
smile | 0.92
wonderful | 0.91
al | 0.91
trying | 0.91
pork | 0.90


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS film & video
0 :
portrait | 0.93
documentary | 0.93
stretch | 0.92
funeral | 0.88
refugee | 0.88
1 :
healthy | 0.86
hey | 0.85
cartoon | 0.82
screenplay | 0.82
wanting | 0.81


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS publishing
0 :
letterpress | 0.93
mountain | 0.92
ocean | 0.91
coast | 0.91
picture | 0.91
1 :
outlet | 0.79
finance | 0.75
proven | 0.73
condition | 0.72
exposure | 0.71


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS photography
0 :
muse | 0.88
audience | 0.88
mature | 0.88
monograph | 0.86
contain | 0.84
1 :
visit | 0.90
able | 0.90
ich | 0.89
drone | 0.89
wedding | 0.88


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS crafts
0 :
plush | 0.91
enamel | 0.88
timeless | 0.84
giant | 0.82
orphan | 0.82
1 :
furniture | 0.86
picture | 0.86
open | 0.86
led | 0.86
soy | 0.86


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS music
0 :
folk | 0.94
heading | 0.94
alt | 0.93
printing | 0.93
roll | 0.92
1 :
viral | 0.85
skill | 0.84
speech | 0.82
ist | 0.82
aspiring | 0.79


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS art
0 :
enamel | 0.93
pin | 0.92
sketchbook | 0.92
coloring | 0.91
78 | 0.90
1 :
passionate | 0.85
candle | 0.82
active | 0.82
geared | 0.81
sand | 0.80


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS fashion
0 :
adventure | 0.90
wallet | 0.88
tote | 0.87
enamel | 0.87
anti | 0.86
1 :
semi | 0.82
precious | 0.81
lace | 0.80
headband | 0.80
user | 0.79


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS theater
0 :
identity | 0.88
installation | 0.87
produced | 0.87
satire | 0.87
cycle | 0.86
1 :
gospel | 0.82
brother | 0.79
message | 0.79
pirate | 0.78
workout | 0.77


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS journalism
0 :
winning | 0.77
annual | 0.76
och | 0.76
award | 0.75
edition | 0.72
1 :
die | 0.92
sport | 0.90
helping | 0.90
latest | 0.88
portal | 0.86


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS technology
0 :
oscilloscope | 0.85
cortex | 0.82
toothbrush | 0.79
ruler | 0.79
nixie | 0.77
1 :
pour | 0.95
para | 0.93
dating | 0.93
job | 0.93
rent | 0.92


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS games
0 :
28mm | 0.97
novel | 0.95
miniature | 0.94
5e | 0.94
visual | 0.93
1 :
ball | 0.85
rush | 0.83
tag | 0.82
flag | 0.82
golf | 0.81


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS comics
0 :
issue | 0.96
collected | 0.95
collection | 0.93
high | 0.93
face | 0.93
1 :
writing | 0.67
martial | 0.59
web | 0.58
purpose | 0.57
store | 0.54


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS dance
0 :
work | 0.96
collaboration | 0.96
concert | 0.96
length | 0.95
premiere | 0.95
1 :
confidence | 0.77
adult | 0.65
fit | 0.62
purpose | 0.60
para | 0.60
THE WORD IS design
0 :
watch | 0.95
leather | 0.95
pen | 0.95
pocket | 0.95
bag | 0.94
1 :
barn | 0.84
healing | 0.82
related | 0.79
retreat | 0.79
local | 0.78


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [248]:
final = list()
for c in cats:
    final.append(stored[c])

In [249]:
final_all = pd.concat(final)

In [250]:
final_all.to_pickle('kickstarter_NB.pkl')

# Description

In [11]:
df = pd.read_pickle('kickstarter_NB.pkl')
df = df.dropna()

In [12]:
y = df['failed'].values
words = set(nltk.corpus.words.words())

In [14]:
## PROCESSESING USING LEMMATIZATION
lemmatizer = WordNetLemmatizer() 
df['desc_fix'] = df['description'].str.lower() ##lowercase
df['desc_fix'] = df['desc_fix'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x.split()]) ##get lemma
df['desc_fix'] = df['desc_fix'].apply(lambda x: ' '.join(x)) ##put into one string
df['desc_fix'] = df['desc_fix'].apply(non_eng) ##remove non-english words
df['desc_fix'] = df['desc_fix'].str.replace(r'[^\w\s]+', '') ## remove punctuation

In [15]:
vectorizer = TfidfVectorizer(min_df = 10, stop_words='english')
Xt = vectorizer.fit_transform(df['desc_fix'])

In [16]:
Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt, y, 
                                                   test_size = 0.4, random_state = 42)

In [19]:
clf = MultinomialNB()
clf.fit(Xt_train, yt_train)

trnt = clf.predict(Xt_train)
## tst = clf.predict(X_test)
tstt = clf.predict_proba(Xt_test)[:,1] > .4

print("train score:", fbeta_score(yt_train, trnt, beta = 1.1))
print("test score:", fbeta_score(yt_test, tstt, beta = 1.1))

print(pd.crosstab(yt_test, tstt, rownames=['True'], colnames=['Predicted'], margins=True))

df['MNB_All_Desc'] = clf.predict(Xt)

train score: 0.6420398098711503
test score: 0.6891879291271844
Predicted  False   True    All
True                          
0          26354  11743  38097
1           8091  21605  29696
All        34445  33348  67793


In [21]:
cats = list(df['category_core'].unique())

In [22]:
stored = {}
for c in cats:
     stored[c] = df[df['category_core'] == c]

In [24]:
for c in cats:
    data = stored[c]
    y = data['failed'].values
    
    vectorizer = TfidfVectorizer(min_df = 10, stop_words='english')
    X = vectorizer.fit_transform(data['desc_fix'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.4, random_state = 42)

    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    stored[c]['MNB_cat_Desc'] = clf.predict_proba(X)[:,1]>.4
    
    print("THE WORD IS {}".format(c))
    words = np.array(vectorizer.get_feature_names())
    classes = clf.classes_
    x = np.eye(X.shape[1])
    probs = clf.predict_log_proba(x)
    for i, c in enumerate(classes):
        prob = probs[:,i]
        ind = np.argsort(prob)[::-1]
    
        good_words = words[ind[:5]]
    
        good_prob = prob[ind[:5]]
    
    #print("Associated words\t     P({} | word)".format(c))
    #for w, p in zip(good_words, good_prob):
    #    print("{:>35}".format(w), "{:.2f}".format(np.exp(p)))
        
        print(c,":")
        for w, p in zip(good_words, good_prob):
            print("{} | {:.2f}".format(w, np.exp(p)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS food
0 :
unlocked | 0.66
birch | 0.65
keto | 0.64
bitters | 0.64
ketogenic | 0.63
1 :
ich | 0.94
franchise | 0.89
ist | 0.89
homeless | 0.89
da | 0.88


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS film & video
0 :
archival | 0.87
documentary | 0.87
refugee | 0.85
deductible | 0.85
correction | 0.85
1 :
marshal | 0.77
twitch | 0.75
ich | 0.75
cartoon | 0.75
ruler | 0.74


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS publishing
0 :
unlocked | 0.93
letterpress | 0.93
stretch | 0.92
novella | 0.92
sherlock | 0.91
1 :
ich | 0.73
medio | 0.65
medallion | 0.63
algebra | 0.63
saber | 0.63


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS photography
0 :
surprisingly | 0.80
subscribe | 0.78
muse | 0.77
vol | 0.77
residency | 0.76
1 :
business | 0.84
equipment | 0.83
drone | 0.82
hobby | 0.82
wedding | 0.82


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS crafts
0 :
plush | 0.94
enamel | 0.85
burl | 0.85
stretch | 0.82
pin | 0.81
1 :
ai | 0.86
mon | 0.84
pour | 0.83
car | 0.83
furniture | 0.82


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS music
0 :
banjo | 0.93
bluegrass | 0.93
fiddle | 0.92
stretch | 0.91
drew | 0.90
1 :
autism | 0.80
ich | 0.75
advertise | 0.73
trap | 0.72
duro | 0.71


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS art
0 :
unlocked | 0.94
sketchbook | 0.92
pin | 0.91
enamel | 0.90
clutch | 0.89
1 :
mir | 0.74
karate | 0.74
ich | 0.74
bless | 0.73
ist | 0.73


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS fashion
0 :
unlocked | 0.89
wallet | 0.87
scarf | 0.86
sticker | 0.86
pin | 0.85
1 :
gown | 0.78
jewel | 0.78
advertise | 0.76
jewelry | 0.76
bead | 0.75


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS theater
0 :
queer | 0.86
rose | 0.85
pleasance | 0.85
fringe | 0.84
collective | 0.84
1 :
gospel | 0.73
spiritual | 0.73
fright | 0.70
domino | 0.67
inspirational | 0.67


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS journalism
0 :
eve | 0.65
dispatch | 0.61
occupy | 0.61
narrative | 0.61
stretch | 0.61
1 :
ich | 0.92
die | 0.89
fitness | 0.89
user | 0.87
sports | 0.87


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS technology
0 :
nixie | 0.78
stabilizer | 0.73
pinhole | 0.72
poplar | 0.72
kiwi | 0.72
1 :
dating | 0.95
di | 0.91
booking | 0.91
ich | 0.90
che | 0.89


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS games
0 :
miniature | 0.92
resin | 0.91
unlocked | 0.91
steam | 0.91
28mm | 0.90
1 :
application | 0.78
golf | 0.77
dart | 0.76
mobile | 0.75
clicker | 0.74


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS comics
0 :
unlocked | 0.96
variant | 0.95
stretch | 0.95
issue | 0.94
add | 0.94
1 :
famine | 0.52
hive | 0.51
hatch | 0.51
unapologetic | 0.50
khan | 0.50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS dance
0 :
designer | 0.95
choreographer | 0.94
2012 | 0.94
concert | 0.94
collaborator | 0.94
1 :
cheer | 0.59
cheerleader | 0.51
yo | 0.50
football | 0.49
rented | 0.48


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


THE WORD IS design
0 :
knife | 0.96
strap | 0.96
leather | 0.96
bag | 0.94
blade | 0.94
1 :
barn | 0.82
cabin | 0.78
skate | 0.71
land | 0.71
housing | 0.70


In [26]:
alldesc = list()
for c in cats:
    alldesc.append(stored[c])
    
alldesc = pd.concat(alldesc)

In [27]:
alldesc.head()

Unnamed: 0,blurb,country,created_at,currency,deadline,launched_at,slug,spotlight,staff_pick,state,...,created_month,created_weekday,deadline_month,deadline_weekday,length_of_project,blurb_fix,MNB_cat,desc_fix,MNB_All_Desc,MNB_cat_Desc
0,Ginger KICK! is back for the holidays with ama...,US,2017-11-05 12:05:03,USD,2017-11-12 20:33:24,2017-11-05 20:33:24,ginger-kick-holiday-cheer,1,0,successful,...,Nov,Sun,Nov,Sun,7,ginger kick is back for the holiday with amaz...,True,for those visiting for the first time here s...,0,False
11,More than just a farm.,CA,2015-06-29 03:23:38,CAD,2015-12-05 06:00:00,2015-11-05 15:02:36,flavourful-farms,1,1,successful,...,Jun,Mon,Dec,Sat,159,more than just a farm,True,built to serve you farm wa after year of resea...,1,True
13,"I'm on a mission to make the best, most sinful...",US,2014-07-07 21:05:23,USD,2014-09-09 03:44:57,2014-07-11 03:44:57,creating-a-perfect-mac-and-cheese,0,0,failed,...,Jul,Mon,Sep,Tue,63,i m on a mission to make the best most sinfu...,True,my goal is to create ultra gourmet mac and che...,1,True
14,I want to open a neighborhood bakery so I can ...,US,2015-08-14 00:29:00,USD,2015-09-14 11:21:58,2015-08-15 11:21:58,old-irish-bakery,0,0,failed,...,Aug,Fri,Sep,Mon,31,i want to open a neighborhood bakery so i can ...,True,every one of my happy childhood memory involve...,1,True
18,A wine lounge that educates the mind and the p...,US,2018-02-02 02:26:14,USD,2018-05-20 03:59:00,2018-04-19 17:32:16,blacksburg-wine-lab,1,0,successful,...,Feb,Fri,May,Sun,107,a wine lounge that the mind and the palate in...,True,overview wine lab is so because it will be de...,1,True


In [28]:
alldesc.to_pickle('kickstarter_NB.pkl')

# Predicting Fails

In [95]:
fy = df['failed'].values

In [60]:
vectorizer = CountVectorizer(min_df = 10)
fX = vectorizer.fit_transform(df['blurb_fix'].values)

In [62]:
clf = MultinomialNB()
clf.fit(fX, y)
words = np.array(vectorizer.get_feature_names())
classes = clf.classes_

In [63]:
x = np.eye(X.shape[1])
probs = clf.predict_log_proba(x)[:, 0]
ind = np.argsort(probs)

good_words = words[ind[:10]]
bad_words = words[ind[-10:]]

good_prob = probs[ind[:10]]
bad_prob = probs[ind[-10:]]

print("Failed words\t     P(1 | word)")
for w, p in zip(good_words, good_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))
    
print("Successful words\t     P(1 | word)")
for w, p in zip(bad_words, bad_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

ValueError: shapes (7109,7109) and (7315,2) not aligned: 7109 (dim 1) != 7315 (dim 0)