In [1]:
from pickle import load
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, f1_score, precision_score, recall_score

In [2]:
df = load(open('df_clean.pkl', 'rb'))

In [3]:
df.head()

Unnamed: 0,text,company,is_ther,target,product
0,wesley83 i have a 3g iphone after 3 hrs tweeti...,apple,Negative emotion,Negative emotion,iphone
1,jessedee know about fludapp awesome ipadiphon...,apple,Positive emotion,Positive emotion,ipad or iphone app
2,swonderlin can not wait for ipad 2 also they s...,apple,Positive emotion,Positive emotion,ipad
3,sxsw i hope this years festival isnt as crashy...,apple,Negative emotion,Negative emotion,ipad or iphone app
4,sxtxstate great stuff on fri sxsw marissa maye...,google,Positive emotion,Positive emotion,google


In [4]:
X = df['text']
y = df.target

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [6]:
sw = stopwords.words('english')

In [7]:
micro_recall = make_scorer(recall_score, average='micro')
pipe = make_pipeline(TfidfVectorizer(max_features=300, stop_words= sw), MultinomialNB())
cv = cross_validate(pipe, X_train, y_train, scoring = micro_recall)
cv

{'fit_time': array([0.11865926, 0.09221482, 0.09179568, 0.09217215, 0.11224842]),
 'score_time': array([0.02194142, 0.0419004 , 0.04783058, 0.02194095, 0.02289724]),
 'test_score': array([0.61298274, 0.63599014, 0.62612983, 0.63105998, 0.62530813])}

In [8]:
y_hat_train = cross_val_predict(pipe, X_train, y_train)

In [9]:
confusion_matrix(y_train, y_hat_train)

array([[   5,  326,   94],
       [  11, 3033,  398],
       [   0, 1445,  773]], dtype=int64)

In [10]:
micro_recall = make_scorer(recall_score, average = 'micro')

In [11]:
param = {'tfidfvectorizer__max_features': [500, 1000, 2000], 'tfidfvectorizer__stop_words': [None, sw]}
pipe2 = make_pipeline(TfidfVectorizer(), MultinomialNB())
grid = GridSearchCV(pipe2, param, scoring= micro_recall)

In [12]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                                       ('multinomialnb', MultinomialNB())]),
             param_grid={'tfidfvectorizer__max_features': [500, 1000, 2000],
                         'tfidfvectorizer__stop_words': [None,
                                                         ['i', 'me', 'my',
                                                          'myself', 'we', 'our',
                                                          'ours', 'ourselves',
                                                          'you', "you're",
                                                          "you've", "you'll",
                                                          "you'd", 'your',
                                                          'yours', 'yourself',
                                                          'yourselves', 'he',
                                                          'him', 'his',
            

In [13]:
grid.best_score_

0.6476581758422351

In [14]:
grid.best_params_

{'tfidfvectorizer__max_features': 2000,
 'tfidfvectorizer__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'ou

In [15]:
grid.best_estimator_

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=2000,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('multinomialnb', MultinomialNB())])