### Text Classification

In [None]:
%matplotlib inline

import pandas as pd

In [None]:
# Data source: https://www.kaggle.com/zynicide/wine-reviews/data
# Load the reviews from a path relative to the notebook's working directory.
# NOTE(review): later cells read a 'rating' column; presumably it is already
# present in this CSV -- confirm, since nothing in the notebook creates it.

wine_df = pd.read_csv('data/wine_reviews.csv')


<div class="alert alert-info">
<h3> Your turn</h3>
<p> What's in the dataset?</p>
</div>


In [None]:
# Preview the first rows to see which columns the dataset provides.
wine_df.head()

In [None]:
# Look at the first five review texts (pandas truncates long strings by default).
wine_df['description'].head(5)

![google_search.png](images/google_search.png)

In [None]:
# Let pandas show up to 120 characters per column so review text is readable.
pd.options.display.max_colwidth = 120



In [None]:
# Same five reviews again, to see the effect of the display setting changed above.
wine_df['description'].head(5)

In [None]:
# Re-display the first rows of the full frame after the display options were changed.
wine_df.head()

### Turning words into features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Bag-of-words vectorizer for the review text.
vectorizer = CountVectorizer(
    lowercase=True,        # fold case before tokenizing
    ngram_range=(1, 1),    # single words only
    stop_words='english',  # drop scikit-learn's built-in English stop words
    max_df=0.60,           # float: ignore terms in more than 60% of documents
    min_df=0.01,           # float: ignore terms in fewer than 1% of documents
    max_features=None,     # no cap on vocabulary size
)

In [None]:
# Learn the vocabulary from every review description in the dataset.
vectorizer.fit(wine_df['description'])

In [None]:
# Number of terms kept in the vocabulary after the max_df / min_df filters.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
len(vectorizer.get_feature_names_out())

In [None]:
# Convert each description into a row of term counts over the fitted vocabulary.
review_word_counts = vectorizer.transform(wine_df['description'])

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial naive Bayes classifier with scikit-learn's default settings.
nb_classifier = MultinomialNB()


In [None]:
# Fit on the word counts of every review, using the 'rating' column as the label.
# NOTE(review): 'High' and 'Low' are used as class names further down; confirm
# whether 'rating' holds any other categories.
nb_classifier.fit(review_word_counts, wine_df['rating'])

In [None]:
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1.
# feature_log_prob_ holds the underlying values: one row per class (in
# nb_classifier.classes_ order) and one column per vocabulary term, each entry
# being log P(term | class).
nb_classifier.feature_log_prob_

In [None]:
# Per-term log-probabilities as a Series indexed by vocabulary term.
# coef_ (removed in scikit-learn 1.1) and get_feature_names() (removed in 1.2)
# are replaced by feature_log_prob_ and get_feature_names_out().
# The old coef_[0] was feature_log_prob_[1] for a two-class model and
# feature_log_prob_[0] otherwise; pick the same row so the values match.
class_row = 1 if len(nb_classifier.classes_) == 2 else 0
coeficients = pd.Series(nb_classifier.feature_log_prob_[class_row],
                        index=vectorizer.get_feature_names_out(),
                        name=nb_classifier.classes_[class_row])
# NOTE(review): these are log P(term | one class). To rank the terms that best
# separate two classes, the difference between their two rows is more telling.

In [None]:
# The 20 terms with the lowest values.
coeficients.sort_values(ascending=True).head(20)

In [None]:
# The 20 terms with the highest values.
coeficients.sort_values(ascending=False).head(20)

<div class="alert alert-info">
<h3> Your turn</h3>
<p> New groups! Load up the group spreadsheet and find your group. Working in your group, use the "ge_speeches.json" file to determine what are the most distinguishing words used by Hillary Clinton and Donald Trump during the 2016 election. Do this in a new notebook!

</div>


In [None]:
# Predicted label for every review. These are the same rows the model was fit
# on, so this is an in-sample prediction.
nb_classifier.predict(review_word_counts)

In [None]:
# Store the in-sample predictions next to the true labels.
# Note: this adds a column to wine_df in place, so frames derived from it later
# (e.g. the train/test split) carry the 'prediction' column too.
wine_df['prediction']  = nb_classifier.predict(review_word_counts)

In [None]:
# Confusion table: true rating in the rows, predicted rating in the columns.
pd.crosstab(index=wine_df['rating'],
            columns=wine_df['prediction'])

In [None]:
# Per-class predicted probabilities for every review; columns follow
# nb_classifier.classes_.
nb_classifier.predict_proba(review_word_counts)

In [None]:
# Wrap the class probabilities in a DataFrame with one column per class label.
class_probabilities = nb_classifier.predict_proba(review_word_counts)
predict_df = pd.DataFrame(data=class_probabilities,
                          columns=nb_classifier.classes_)

In [None]:
# Preview the per-class probabilities for the first few reviews.
predict_df.head()

In [None]:
# Place the probability columns alongside the original review columns.
# Rows are matched on index, so both frames must share the same index.
wine_df_prediction = pd.concat([wine_df, predict_df], axis='columns')

In [None]:
# The 15 reviews the model rates most likely to be 'High'.
(
    wine_df_prediction
    .sort_values(by='High', ascending=False)
    .loc[:, ['description', 'points']]
    .head(15)
)

In [None]:
# The 15 reviews the model rates most likely to be 'Low'.
(
    wine_df_prediction
    .sort_values(by='Low', ascending=False)
    .loc[:, ['description', 'points']]
    .head(15)
)

<div class="alert alert-info">
<h3> Your turn</h3>
<p> In your groups, how well do your models fit? What is the most Trumpish Trump speech? What is the least?

</div>



### What about overfitting?

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. A fixed random_state makes the split,
# and every score computed from it below, identical on each Restart & Run All.
train, test = train_test_split(wine_df, test_size=0.2, random_state=42)

In [None]:
# Number of reviews in the training portion (80% of the data).
len(train)

In [None]:
# Number of reviews held out for testing (20% of the data).
len(test)

In [None]:
# Re-learn the vocabulary from the training reviews only, so the test set
# plays no part in choosing the features.
vectorizer.fit(train['description'])

In [None]:
# Term-count matrix for the training reviews.
X_train = vectorizer.transform(train['description'])

In [None]:
# Refit the same classifier object on the training rows only; this replaces
# the model that was fit on the full dataset earlier.
nb_classifier.fit(X_train, train['rating'])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [None]:
# Accuracy on the training data (share of training reviews labelled correctly).
print(accuracy_score(y_true=train['rating'],
                     y_pred=nb_classifier.predict(X_train)))



In [None]:
# Confusion matrix on the training data: rows are true labels, columns are
# predicted labels.
print(confusion_matrix(y_true=train['rating'],
                       y_pred=nb_classifier.predict(X_train)))


In [None]:
# Per-class precision, recall and F1 on the training data.
print(classification_report(y_true=train['rating'],
                            y_pred=nb_classifier.predict(X_train)))



Precision: of the items predicted to belong to a class, the % that actually belong to it.

Recall: of the items that actually belong to a class, the % that were predicted to belong to it.


In [None]:
# Vectorize the held-out reviews with the vocabulary learned from the training
# set (transform only, no refit), then predict their labels.
test_wf         = vectorizer.transform(test['description'])
test_prediction = nb_classifier.predict(test_wf)

In [None]:
# Accuracy on the held-out test reviews.
print(accuracy_score(y_true=test['rating'], y_pred=test_prediction))

In [None]:
# Per-class precision, recall and F1 on the held-out test reviews.
print(classification_report(y_true=test['rating'], y_pred=test_prediction))


In [None]:
# Second vectorizer: same settings as before except for min_df, which is now
# an absolute count rather than a proportion.
vectorizer = CountVectorizer(
    lowercase=True,
    ngram_range=(1, 1),
    stop_words='english',
    max_df=0.60,       # float: ignore terms in more than 60% of documents
    min_df=5,          # int: ignore terms in fewer than 5 documents
    max_features=None,
)

In [None]:
# Learn the new vocabulary and build the training matrix in one step, then
# refit the classifier on it.
X_train = vectorizer.fit_transform(train['description'])
nb_classifier.fit(X_train, train['rating'])


In [None]:
# Training accuracy with the larger vocabulary.
print(accuracy_score(y_true=train['rating'],
                     y_pred=nb_classifier.predict(X_train)))



In [None]:

# Test accuracy with the larger vocabulary; compare with the training accuracy
# above to gauge overfitting.
X_test = vectorizer.transform(test['description'])
print(accuracy_score(y_true=test['rating'],
                     y_pred=nb_classifier.predict(X_test)))


<div class="alert alert-info">
<h3> Your turn</h3>
<p> What happens to your model if you change some of the parameters for your vectorizer? Be sure to split the data between train and test!

</div>




### What about a different model?

![](images/knn1.png)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes among the 3 closest reviews.
knn_classifier = KNeighborsClassifier(n_neighbors = 3)

In [None]:
# Fit on the full-dataset count matrix built earlier with the first vectorizer.
# NOTE(review): this uses every row of wine_df, including the rows in `test`.
knn_classifier.fit(review_word_counts, wine_df['rating'])

In [None]:
# Predict a label for every row of wine_df (the same rows the model was fit on),
# so knn_prediction has one entry per row of wine_df.
knn_prediction = knn_classifier.predict(review_word_counts)

In [None]:

# knn_prediction has one entry per row of wine_df (it was predicted from
# review_word_counts), so it must be scored against wine_df['rating'].
# test['rating'] covers only the held-out 20% of rows, and the length mismatch
# makes accuracy_score raise a ValueError.
# This is in-sample accuracy: the classifier was fit on these same rows.
print(accuracy_score(wine_df['rating'], knn_prediction))



In [None]:
# knn_prediction has one entry per row of wine_df, so compare it with
# wine_df['rating']. test['rating'] has a different length and raises a
# ValueError. These are in-sample metrics: the model was fit on the same rows.
print(classification_report(wine_df['rating'], knn_prediction))

<div class="alert alert-info">
<h3> Your turn</h3>
<p> What does a k-nearest neighbor model for your speech dataset look like? How does the accuracy compare?
</div>





![](images/knn2.png)

In [None]:
# Same classifier with a larger neighbourhood (15 neighbours).
# NOTE(review): this object is not fit or scored in the cells that follow; the
# grid search below builds its own KNeighborsClassifier.
knn_classifier = KNeighborsClassifier(n_neighbors = 15)

### But what's the best fitting model?

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate neighbourhood sizes for the k-nearest-neighbours grid search.
parameters = dict(n_neighbors=(2, 3, 4))
              

In [None]:
# Grid search over the n_neighbors candidates, scoring each with 5-fold
# cross-validation.
grid = GridSearchCV(KNeighborsClassifier(), parameters, cv=5)

In [None]:
# Run the search on the full-dataset count matrix; cross-validation does the
# train/validation splitting internally.
grid.fit(review_word_counts,
         wine_df['rating'])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Chain vectorizing and classifying into one estimator so a grid search can
# tune the vectorizer's settings together with the model.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

In [None]:
# Grid for the pipeline: '<step name>__<parameter>' addresses a parameter of
# that pipeline step.
parameters = dict(
    vectorizer__max_df=(.2, .4),
    vectorizer__min_df=(100, 150),
)
              

In [None]:
# 3-fold cross-validated search over the pipeline grid, using all CPU cores
# (n_jobs=-1) and printing progress (verbose=1).
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=parameters,
                           cv=3,
                           n_jobs=-1,
                           verbose=1)

In [None]:
# Fit on the raw review text: the pipeline's vectorizer step turns it into
# counts inside each cross-validation fold.
grid_search.fit(wine_df['description'],
                wine_df['rating'])

In [None]:
# Mean cross-validated score of the best parameter combination.
grid_search.best_score_

In [None]:
# The pipeline refit with the best parameter combination found.
grid_search.best_estimator_

In [None]:
# Wider grid: document-frequency cut-offs, stop-word removal on/off, and
# unigrams only versus unigrams plus bigrams.
parameters = dict(
    vectorizer__max_df=[.1, .15, .2, .25],
    vectorizer__min_df=[25, 50, 100],
    vectorizer__stop_words=[None, 'english'],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
)
              

In [None]:
# NOTE(review): at this point grid_search is still the previous, already-fitted
# search, so this repeats the earlier score. The search over the new
# `parameters` grid is only built and fit in the cells below.
grid_search.best_score_

In [None]:
# Rebuild the search with the wider grid and 5-fold cross-validation.
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=parameters,
                           cv=5,
                           n_jobs=-1,
                           verbose=1)

In [None]:
# Fit the wider search on the raw review text.
# The original cell used `wine_df_extremes`, which is never defined in this
# notebook and fails with a NameError on a fresh kernel. wine_df is used here,
# as in the previous grid search.
# NOTE(review): if a filtered "extremes" subset was intended, define it
# explicitly in an earlier cell and use it here.
grid_search.fit(wine_df['description'],
                wine_df['rating'])

In [None]:
# Mean cross-validated score of the best combination in the wider grid.
grid_search.best_score_

In [None]:
# All parameters of the winning pipeline. get_params must be called; without
# the parentheses the cell only displays the bound method object.
grid_search.best_estimator_.get_params()