### Sentiment Analysis

In [None]:
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Read the review corpus from disk. shuffle=False is passed so the loader
# does not randomize the document order.
movie_reviews_data_folder = 'data/txt_sentoken'
dataset = load_files(movie_reviews_data_folder, shuffle=False)

# Report how many documents were loaded.
n_samples = len(dataset.data)
print("n_samples: %d" % n_samples)

In [None]:
# Preview the last loaded document, truncated to its first 1000 items
# so the cell output stays short.
print(dataset.data[-1][:1000])

In [None]:
# Labels of the first five documents (the dataset was loaded with shuffle=False).
dataset.target[:5]

In [None]:
# Labels of the last five documents, for comparison with the first five.
dataset.target[-5:]

In [None]:
# Class names that the integer labels index into (display capped at five entries).
dataset.target_names[:5]

In [None]:
# Hold out 25% of the documents for evaluation; the fixed random_state
# makes the split repeatable across runs.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.25,
    random_state=1,
)

- Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent

In [None]:
# Two-step text classifier: TF-IDF features feeding a linear SVM.
# NOTE(review): C=10000 is a very large value for LinearSVC's C -- confirm it is intentional.
pipeline = Pipeline(steps=[
    # Unigrams and bigrams; min_df / max_df drop tokens that are too rare or too frequent.
    ('vect', TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=10000)),
])

- Build a grid search to find out whether unigrams or bigrams are more useful.
- Fit the pipeline on the training set using grid search for the parameters

In [None]:
# Cross-validated grid search over the vectorizer's n-gram range to find out
# whether unigrams alone or unigrams + bigrams work better on this corpus.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)]}
grid_search = GridSearchCV(pipeline, parameters)
grid_search.fit(docs_train, y_train)

# GridSearchCV fits clones of the estimator it is given, so the original
# `pipeline` object stays unfitted. Rebind the name to the winning pipeline,
# which has been refit on the full training set, so that the following cells
# can keep calling pipeline.predict(...).
pipeline = grid_search.best_estimator_

# Show which n-gram range won.
grid_search.best_params_

- Predict the test set

In [None]:
# Predict a label for every held-out test document with the fitted pipeline.
y_predicted = pipeline.predict(docs_test)

In [None]:
# Print the classification report
# Build the text report of the test-set predictions, labelled with the
# dataset's class names, then print it.
report = metrics.classification_report(
    y_test, y_predicted, target_names=dataset.target_names)
print(report)

In [None]:
# Compute and print the confusion matrix of the test-set predictions
# (this cell only prints the matrix; it does not draw a plot).
# NOTE(review): presumably rows are true labels and columns are predicted
# labels -- confirm against the scikit-learn documentation.
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

In [None]:
# A few hand-written one-line reviews used to spot-check the fitted model
# on text that is not part of the dataset.
sentences = [
    'The movie has an abrupt ending.',
    'The movie is awesome',
    'The movie is boring',
    'The movie will be a blockbuster'
]
# One predicted label per sentence, in the same order as `sentences`.
predicted = pipeline.predict(sentences)

In [None]:
# Pair each sample sentence with its predicted label and print the class name.
# The model predicts sentiment, so the message says "sentiment" rather than
# "language" (wording left over from a language-identification exercise).
for s, p in zip(sentences, predicted):
    print("The sentiment of {} is '{}'".format(s, dataset.target_names[p]))