In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer
import operator
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import settings

import Stemmer
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
def read_data():
    df = pd.read_excel(os.path.join('..',settings.PROCESSED_DIR, "all_with_liwc_segmented.xls"), encoding="ISO-8859-1")
    return df

In [3]:
df = read_data()

In [4]:
df['norm_persuasive'].describe()

count    2406.000000
mean      137.190942
std       167.388307
min         0.000000
25%        37.566308
50%        90.985507
75%       180.264298
max      2758.625723
Name: norm_persuasive, dtype: float64

In [5]:
persuasive_median = df['norm_persuasive'].median()
persuasive_upper_quantile = df['norm_persuasive'].quantile(.75)

In [6]:
df['persuasive_label'] = np.where(df['persuasive'] >= persuasive_median, 1, 0)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
stemmer = SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [8]:
count_vect = StemmedCountVectorizer(analyzer="word", stop_words='english', min_df=2)
X_train_counts = count_vect.fit_transform(df['transcript'])
X_train_counts.shape

(2406, 20462)

In [9]:
sorted(count_vect.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)

[('î¾', 20461),
 ('éte', 20460),
 ('èé', 20459),
 ('èâthank', 20458),
 ('èâso', 20457),
 ('èâ', 20456),
 ('ève', 20455),
 ('èt', 20454),
 ('ès', 20453),
 ('ère', 20452),
 ('èm', 20451),
 ('èll', 20450),
 ('èd', 20449),
 ('ås', 20448),
 ('åres', 20447),
 ('åre', 20446),
 ('åme', 20445),
 ('àvet', 20444),
 ('àveli', 20443),
 ('àve', 20442),
 ('zx81', 20441),
 ('zurich', 20440),
 ('zulu', 20439),
 ('zuckerman', 20438),
 ('zuckerberg', 20437),
 ('zuccotti', 20436),
 ('zopa', 20435),
 ('zooplankton', 20434),
 ('zoop', 20433),
 ('zoonot', 20432),
 ('zoom', 20431),
 ('zoolog', 20430),
 ('zoo', 20429),
 ('zone', 20428),
 ('zombi', 20427),
 ('zoe', 20426),
 ('zodiac', 20425),
 ('zipper', 20424),
 ('ziploc', 20423),
 ('ziplin', 20422),
 ('zipcar', 20421),
 ('zip', 20420),
 ('zine', 20419),
 ('zinc', 20418),
 ('zimbardo', 20417),
 ('zimbabwean', 20416),
 ('zimbabw', 20415),
 ('zillion', 20414),
 ('zilch', 20413),
 ('zika', 20412),
 ('zigzag', 20411),
 ('zig', 20410),
 ('zheng', 20409),
 ('zf', 20

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2406, 20462)

In [16]:
with open('mlk.txt') as file:  
    mlk = file.read()

In [17]:
mlk

"I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation.  Five score years ago, a great American, in whose symbolic shadow we stand today, signed the Emancipation Proclamation. This momentous decree came as a great beacon light of hope to millions of Negro slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the long night of their captivity.  But one hundred years later, the Negro still is not free. One hundred years later, the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination. One hundred years later, the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity. One hundred years later, the Negro is still languishing in the corners of American society and finds himself an exile in his own land. So we have come here today to dramatize a shameful condition.  In a sen

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, df['persuasive_label'])

In [20]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)

In [21]:
predicted = clf.predict(X_test)
np.mean(predicted == y_test)

0.65282392026578073

In [25]:
new_counts = count_vect.transform([mlk])

In [27]:
X_new_tfidf = tfidf_transformer.transform(new_counts)

In [29]:
result = clf.predict(X_new_tfidf)

In [30]:
result[0]

1

In [31]:
probability = clf.predict_proba(X_new_tfidf)

In [32]:
probability

array([[ 0.21722549,  0.78277451]])

In [33]:
probability[0][1]

0.78277450808621984