In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [2]:
# Read the data from CSV files
temp = pd.read_csv('For_preprocessing.csv')
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4768 entries, 0 to 4767
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Text  4763 non-null   object
 1   Sentiment    4766 non-null   object
dtypes: object(2)
memory usage: 74.6+ KB


In [3]:
temp=temp.dropna(subset=['Review_Text'])
temp=temp.dropna(subset=['Sentiment'])
temp = temp.reset_index(drop=True)
temp_val = temp['Sentiment'].tolist()
#len(temp_val)
ind_list = []

for i in range(len(temp['Sentiment'])):
    if temp_val[i] != '1' and temp_val[i] != '2' and temp_val[i] != '3':
        ind_list.append(i)
temp.reset_index(drop=True)
temp.drop(temp.index[ind_list], inplace=True)

In [4]:
temp = temp.reset_index(drop=True)
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4699 entries, 0 to 4698
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Text  4699 non-null   object
 1   Sentiment    4699 non-null   object
dtypes: object(2)
memory usage: 73.5+ KB


In [5]:
def tokenize_reg():
    temp['Review_Text'] = temp['Review_Text'].replace(r'http\S+','',regex=True).replace(r'www\S+','',regex=True).replace(r'\d+','',regex=True)
    tokens = RegexpTokenizer(r'\w+')
    temp['Review_Text']=temp['Review_Text'].apply(lambda x:tokens.tokenize(x.lower()))
    stop_words = set(stopwords.words('english')) 
    stop_words = stop_words.union(",","(",")","[","]","{","}","#","@","!",":",";",".","?")
    temp['Review_Text'] = temp['Review_Text'].apply(lambda x: [item for item in x if item not in stop_words])
tokenize_reg()

In [6]:
lem = WordNetLemmatizer()
def lemm(text):
    sent=[]
    for word in text:
        sent.append(lem.lemmatize(word))
    return sent

ps=PorterStemmer()

def stemm(text):        
    sent = []
    for word in text:
        sent.append(ps.stem(word))
    return sent

temp['Review_Text'] =  temp.apply(lambda x: stemm(lemm(x['Review_Text'])), axis=1)

In [7]:
revs=temp['Review_Text'].tolist()
for i in range(len(revs)):
    revs[i]=' '.join(revs[i])
temp['Review_Text'] = revs
temp

Unnamed: 0,Review_Text,Sentiment
0,realli nice place stay especi busi tourist purpos,3
1,seem hotel check basic amen room hand room tra...,1
2,worst hotel ever encount never think stay thii...,1
3,good time hotel staff kumar aishwarya hous kee...,3
4,good hotel staff veg food good non veg breakfa...,3
...,...,...
4694,fifth stay hotel busi room great restaur excel...,3
4695,enjoy,3
4696,impress servic staff area good restaur fit cen...,3
4697,linen smell bad elev pungent odour housekeep p...,1


In [8]:
st = temp['Sentiment'].tolist()
st=set(st)

## 3. Training MultinomialNB
A Pipeline class was used to make the vectorizer => transformer => classifier easier to work with. Such hyper-parameters as n-grams range, IDF usage, TF-IDF normalization type and Naive Bayes alpha were tunned using grid search. The performance of the selected hyper-parameters was measured on a test set that was not used during the model training step.

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV


text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC(kernel='rbf'))])

tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C' : [0.001, 0.01, 0.1, 1, 10],
    'clf__gamma' : [0.001, 0.01, 0.1, 1],
}

The dataset was plited into train and test subsets.

In [10]:
x_train, x_test, y_train, y_test = train_test_split(temp['Review_Text'], temp['Sentiment'], test_size=0.33, random_state=42)

In [11]:
# from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

# score = 'f1_macro'
# print("# Tuning hyper-parameters for %s" % score)
# print()
# np.errstate(divide='ignore')
# clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring=score,n_jobs=4)
# clf.fit(x_train, y_train)

# print("Best parameters set found on development set:")
# print()
# print(clf.best_params_)
# print()
# print("Grid scores on development set:")
# print()
# for mean, std, params in zip(clf.cv_results_['mean_test_score'], 
#                              clf.cv_results_['std_test_score'], 
#                              clf.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
# print()

# print("Detailed classification report:")
# print()
# print("The model is trained on the full development set.")
# print("The scores are computed on the full evaluation set.")
# print()
# print(classification_report(y_test, clf.predict(x_test), digits=4))
# print()
# print ("Confusion Matrix")
# print(confusion_matrix(y_test, clf.predict(x_test)))
# print("Accuracy Score")
# print(accuracy_score(y_test, clf.predict(x_test)))

In [None]:
grid = GridSearchCV(text_clf,tuned_parameters,refit = True, verbose=2)
grid.fit(x_train,y_train)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.2s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s


[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.2s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.2s
[CV] clf__C=0.001, clf__g

[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.2s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   2.0s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   1.8s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   1.9s
[CV] clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   2.1s
[CV] clf__C=0.001, clf__g

[CV]  clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   3.9s
[CV] clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   4.0s
[CV] clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.001, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   3.9s
[CV] clf__C=0.001, clf__g

[CV]  clf__C=0.001, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.5s
[CV] clf__C=0.001, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.5s
[CV] clf__C=0.001, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.4s
[CV] clf__C=0.001, clf__gamma=0.1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=0.1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.001, clf__gamma=0.1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=0.1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.001, clf__gamma=0.1

[CV]  clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.6s
[CV] clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.6s
[CV] clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.7s
[CV] clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.5s
[CV] clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   2.5s
[CV] clf__C=0.001, clf__gamma=0.1, tfidf__n

[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.4s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.2s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.2s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.5s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.7s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l1, tfi

[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.7s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.8s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.7s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.9s
[CV] clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.001, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.7s
[CV] clf__C=0.01, clf__gamma=0.001, tfidf__norm=l1, 

[CV]  clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.4s
[CV] clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.5s
[CV] clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.7s
[CV] clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.5s
[CV] clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=0.001, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   3.5s
[CV] clf__C=0.01, clf__gamma=0.001

[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   2.3s
[CV] clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   2.3s
[CV] clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.1s
[CV] clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   2.3s
[CV] clf__C=0.01, clf__gamma=0.01, tf

[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   4.4s
[CV] clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   4.1s
[CV] clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   3.9s
[CV] clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.5s
[CV] clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.01, clf__gamma=0.01, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(2, 2), total=   2.5s
[CV] clf__C=0.01, clf__gamma=0.01,

[CV]  clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   2.9s
[CV] clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 1), total=   3.0s
[CV] clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), total=   4.8s
[CV] clf__C=0.01, clf__gamma=0.1, tfidf__norm=l2, tf

[CV]  clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   3.9s
[CV] clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   3.8s
[CV] clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   3.7s
[CV] clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(2, 2), total=   3.0s
[CV] clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 1), total=   3.4s
[CV] clf__C=0.01, clf__gamma=1, tfidf__norm=l1, tfidf__use_idf=False

[CV]  clf__C=0.01, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   4.6s
[CV] clf__C=0.01, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   5.2s
[CV] clf__C=0.01, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__C=0.01, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), total=   5.0s
[CV] clf__C=0.01, clf__gamma=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2) 


In [None]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)


## 4. Conslusion
The model, which was trained on the development set, demonstrated $F_1=0.765$ on the evaluation set.

## References
1. Y. Rubtsova, "Constructing a Corpus for Sentiment Classification Training", Software & Systems, vol. 109, no. 1, pp. 72-78, 2015. 
2. "Naive Bayes", scikit-learn.org, 2018. [Online]. Available: http://scikit-learn.org/stable/modules/naive_bayes.html. [Accessed: 26- Aug- 2018]. 
3. "Working With Text Data", scikit-learn.org, 2018. [Online]. Available: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html. [Accessed: 26- Aug- 2018].