In [30]:
import glob
from collections import defaultdict
from pprint import pprint
import pickle
import operator
import os
import nltk
import sklearn
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams
import random
import re
from sklearn.model_selection import GridSearchCV

In [54]:
riloff_eval_dataset = pickle.load(open('march27/riloff-tokenized-and-tagged.pkl','rb'))
for i in range(10):
    random.shuffle(riloff_eval_dataset)

In [11]:
train_set_x = [tweet['text'].lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[:200]]
train_set_y = [tweet['label'] for tweet in riloff_eval_dataset[:200]]
test_set_x =  [tweet['text'].lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[200:]]
test_set_y = [tweet['label'] for tweet in riloff_eval_dataset[200:]]

In [14]:
train_set_x_sklearn = [' '.join(tweet['tokens']).lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[:200]]
test_set_x_sklearn =  [' '.join(tweet['tokens']).lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[200:]]

In [17]:
categories = ['NOT_SARCASM','SARCASM']

In [23]:
'''
If documents are pre-tokenized by an external package, 
then store them in files (or strings) with the tokens 
separated by whitespace and pass analyzer=str.split
(from http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)
'''

vect = CountVectorizer(analyzer=str.split)
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split)
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split)
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split)
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

             precision    recall  f1-score   support

NOT_SARCASM       0.78      0.99      0.88      1498
    SARCASM       0.53      0.04      0.07       426

avg / total       0.73      0.78      0.70      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.86      0.83      1498
    SARCASM       0.35      0.26      0.30       426

avg / total       0.70      0.73      0.71      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.78      1.00      0.88      1498
    SARCASM       0.00      0.00      0.00       426

avg / total       0.61      0.78      0.68      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.95      0.87      1498
    SARCASM       0.47      0.15      0.22       426

avg / total       0.72      0.77      0.72      1924



  'precision', 'predicted', average, warn_for)


In [25]:
vect = CountVectorizer(analyzer=str.split)
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split)
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split)
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split)
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

             precision    recall  f1-score   support

NOT_SARCASM       0.78      0.99      0.87      1498
    SARCASM       0.36      0.03      0.05       426

avg / total       0.69      0.77      0.69      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.79      0.93      0.86      1498
    SARCASM       0.37      0.15      0.21       426

avg / total       0.70      0.76      0.71      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.78      1.00      0.88      1498
    SARCASM       0.00      0.00      0.00       426

avg / total       0.61      0.78      0.68      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.94      0.87      1498
    SARCASM       0.47      0.18      0.26       426

avg / total       0.73      0.77      0.73      1924



  'precision', 'predicted', average, warn_for)


In [26]:
vect = CountVectorizer()
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer()
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer()
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer()
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

             precision    recall  f1-score   support

NOT_SARCASM       0.78      0.98      0.87      1498
    SARCASM       0.48      0.05      0.09       426

avg / total       0.72      0.78      0.70      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.94      0.86      1498
    SARCASM       0.41      0.16      0.23       426

avg / total       0.71      0.76      0.72      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.78      1.00      0.88      1498
    SARCASM       0.00      0.00      0.00       426

avg / total       0.61      0.78      0.68      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.93      0.86      1498
    SARCASM       0.44      0.19      0.26       426

avg / total       0.72      0.77      0.73      1924



  'precision', 'predicted', average, warn_for)


In [None]:
vect = CountVectorizer(ngram_range=(2))
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer()
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer()
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer()
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

In [27]:
vect = CountVectorizer(analyzer=str.split, ngram_range=(2,2))
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split, ngram_range=(2,2))
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split, ngram_range=(2,2))
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(analyzer=str.split, ngram_range=(2,2))
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

             precision    recall  f1-score   support

NOT_SARCASM       0.78      0.99      0.88      1498
    SARCASM       0.53      0.04      0.07       426

avg / total       0.73      0.78      0.70      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.86      0.83      1498
    SARCASM       0.35      0.26      0.30       426

avg / total       0.70      0.73      0.71      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.78      1.00      0.88      1498
    SARCASM       0.00      0.00      0.00       426

avg / total       0.61      0.78      0.68      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.95      0.87      1498
    SARCASM       0.47      0.15      0.22       426

avg / total       0.72      0.77      0.72      1924



  'precision', 'predicted', average, warn_for)


In [29]:
vect = CountVectorizer(ngram_range=(2,2))
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(ngram_range=(2,2))
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(ngram_range=(2,2))
clf = MultinomialNB()
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

vect = CountVectorizer(ngram_range=(2,2))
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

             precision    recall  f1-score   support

NOT_SARCASM       0.79      0.98      0.88      1498
    SARCASM       0.65      0.10      0.17       426

avg / total       0.76      0.79      0.72      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.98      0.88      1498
    SARCASM       0.58      0.12      0.19       426

avg / total       0.75      0.79      0.73      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.78      1.00      0.88      1498
    SARCASM       0.00      0.00      0.00       426

avg / total       0.61      0.78      0.68      1924

             precision    recall  f1-score   support

NOT_SARCASM       0.79      0.98      0.87      1498
    SARCASM       0.51      0.08      0.14       426

avg / total       0.73      0.78      0.71      1924



  'precision', 'predicted', average, warn_for)


In [35]:
vect = CountVectorizer(analyzer=str.split)
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_set_x_sklearn,train_set_y)

             precision    recall  f1-score   support

NOT_SARCASM       0.80      0.95      0.87      1498
    SARCASM       0.47      0.15      0.22       426

avg / total       0.72      0.77      0.72      1924



In [36]:
print(gs_clf.best_score_)

0.83999999999999997

In [37]:
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [41]:
vect = CountVectorizer()
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_set_x,train_set_y)
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.855
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (2, 2)


In [60]:
'''
for i in range(10):
    random.shuffle(riloff_eval_dataset)

train_set_x_sklearn = [' '.join(tweet['tokens']).lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[:200]]
test_set_x_sklearn =  [' '.join(tweet['tokens']).lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[200:]]
train_set_x = [tweet['text'].lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[:200]]
test_set_x =  [tweet['text'].lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[200:]]

train_set_y = [tweet['label'] for tweet in riloff_eval_dataset[:200]]
test_set_y = [tweet['label'] for tweet in riloff_eval_dataset[200:]]
'''

vect = CountVectorizer()
clf =  SGDClassifier(penalty='l2', n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
              'clf__loss': ('hinge','squared_hinge','log')
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1,cv=10)
gs_clf = gs_clf.fit(train_set_x,train_set_y)
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    
vect = CountVectorizer(tokenizer=str.split)
clf =  SGDClassifier(penalty='l2', n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
              'clf__loss': ('hinge','squared_hinge','log')
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1,cv=10)
gs_clf = gs_clf.fit(train_set_x_sklearn,train_set_y)
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.81
clf__alpha: 0.001
clf__loss: 'squared_hinge'
tfidf__use_idf: True
vect__ngram_range: (1, 1)
0.82
clf__alpha: 0.0001
clf__loss: 'log'
tfidf__use_idf: False
vect__ngram_range: (1, 1)


In [65]:
'''
PREVIOUS BEST (with a different data set):
vect = CountVectorizer(analyzer=str.split)
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))
    
 precision    recall  f1-score   support

NOT_SARCASM       0.80      0.86      0.83      1498
    SARCASM       0.35      0.26      0.30       426

avg / total       0.70      0.73      0.71      1924
'''

vect = CountVectorizer(tokenizer=str.split,ngram_range=(1,1))
clf =  SGDClassifier(loss='log', penalty='l2', alpha=1e-4, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x_sklearn, train_set_y)
predicted = text_clf.predict(test_set_x_sklearn)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

## THIS WAS CHOSEN OVER THE ONE BELOW, because of a healthier f1-score on SARCASM

             precision    recall  f1-score   support

NOT_SARCASM       0.82      0.86      0.84      1512
    SARCASM       0.36      0.30      0.33       412

avg / total       0.72      0.74      0.73      1924



In [62]:
'''
for i in range(10):
    random.shuffle(riloff_eval_dataset)

train_set_x_sklearn = [' '.join(tweet['tokens']).lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[:200]]
test_set_x_sklearn =  [' '.join(tweet['tokens']).lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[200:]]
train_set_x = [tweet['text'].lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[:200]]
test_set_x =  [tweet['text'].lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in riloff_eval_dataset[200:]]

train_set_y = [tweet['label'] for tweet in riloff_eval_dataset[:200]]
test_set_y = [tweet['label'] for tweet in riloff_eval_dataset[200:]]
'''

vect = CountVectorizer()
clf =  SGDClassifier(penalty='l2', n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
              'clf__loss': ('hinge','squared_hinge','log')
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1,cv=10)
gs_clf = gs_clf.fit(train_set_x,train_set_y)
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    
vect = CountVectorizer(tokenizer=str.split)
clf =  SGDClassifier(penalty='l2', n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
              'clf__loss': ('hinge','squared_hinge','log')
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1,cv=10)
gs_clf = gs_clf.fit(train_set_x_sklearn,train_set_y)
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.83
clf__alpha: 0.001
clf__loss: 'hinge'
tfidf__use_idf: True
vect__ngram_range: (1, 2)
0.82
clf__alpha: 0.01
clf__loss: 'squared_hinge'
tfidf__use_idf: False
vect__ngram_range: (1, 1)


In [63]:
vect = CountVectorizer(ngram_range=(1,2))
clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-2, n_iter=5, random_state=42)
text_clf = Pipeline([('vect', vect),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', clf)])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

             precision    recall  f1-score   support

NOT_SARCASM       0.79      1.00      0.88      1512
    SARCASM       1.00      0.01      0.02       412

avg / total       0.83      0.79      0.70      1924



In [66]:
dataset_to_use = (train_set_x_sklearn,train_set_y,test_set_x_sklearn,test_set_y)
pickle.dump(dataset_to_use,open('march27/dataset_used-includes_train_test_split.pkl','wb'))