![title](https://uploads.toptal.io/blog/image/443/toptal-blog-image-1407508081138.png)

# Use Case - Machine Learning

# Import all the stuff we need

In [4]:
from nltk.corpus import movie_reviews
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit

# Defining the directory paths

In [57]:
# Root folder of the review data.
# NOTE: this is a machine-specific absolute path; adjust it to your local copy.
DIR_PATH_DATA = "C:\\Users\\peter.sandberg\\Development\\Python-Presentation\\data"
# Sub-folders holding the negative and the positive reviews. Both are derived
# from DIR_PATH_DATA (the undefined name `PATH_DATA` raised a NameError here).
DIR_PATH_NEG = os.path.join(DIR_PATH_DATA, "neg")
DIR_PATH_POS = os.path.join(DIR_PATH_DATA, "pos")

## Create the paths to all the files

In [58]:
# os.listdir returns directory entries in arbitrary order; sorting them keeps
# the document order (and anything indexed from it) the same on every run.
FILE_PATHS_NEG = [os.path.join(DIR_PATH_NEG, name) for name in sorted(os.listdir(DIR_PATH_NEG))]
FILE_PATHS_POS = [os.path.join(DIR_PATH_POS, name) for name in sorted(os.listdir(DIR_PATH_POS))]

# Read all movie reviews into memory

In [59]:
# Collect every review as one string together with its class label:
# 0 for files from the negative folder, 1 for files from the positive folder.
documents, labels = [], []

for label, file_paths in ((0, FILE_PATHS_NEG), (1, FILE_PATHS_POS)):
    for file_path in file_paths:
        with open(file_path) as f:
            # Iterating the handle yields its lines; they are glued with a single space.
            documents.append(" ".join(f))
        labels.append(label)

# Show one raw document as a spot check.
documents[1]

'the happy bastard\'s quick movie review \n damn that y2k bug . \n it\'s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . \n little do they know the power within . . . \n going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . \n we don\'t know why the crew was really out in the middle of nowhere , we don\'t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don\'t know why donald sutherland is stumbling around drunkenly throughout . \n here , it\'s just " hey , let\'s chase these people around with some robots " . \n the acting is below average , even from the likes of curtis . \n you\'re more likely to get a kick 

# Defining a simple document cleaning method

In [60]:
def clean_documents(_documents):
    """Normalize raw texts before they are vectorized.

    Every document is lower-cased, stripped of each character that is not a
    lowercase ASCII letter, a digit or whitespace, and has every run of
    whitespace (newlines included) collapsed into a single space.

    Args:
        _documents: iterable of strings to clean.

    Returns:
        A new list with one cleaned string per input document, in input order.
    """
    # Raw string literal: in a plain literal "\s" is an invalid escape sequence,
    # which Python reports with a warning and will eventually reject.
    disallowed = re.compile(r"[^a-z0-9\s]")
    documents_cleaned = []
    for d in _documents:
        d_cleaned = disallowed.sub("", d.lower())
        # split() breaks on any whitespace, so newlines are handled here as well
        # and no separate newline-replacement pass is required.
        documents_cleaned.append(" ".join(d_cleaned.split()))
    return documents_cleaned

## Cleaning the documents

In [61]:
# Replace the raw texts with their cleaned versions, then show one as a spot check.
documents = clean_documents(documents)
documents[1]

'the happy bastards quick movie review damn that y2k bug its got a head start in this movie starring jamie lee curtis and another baldwin brother william this time in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on little do they know the power within going for the gore and bringing on a few action sequences here and there virus still feels very empty like a movie going for all flash and no substance we dont know why the crew was really out in the middle of nowhere we dont know the origin of what took over the ship just that a big pink flashy thing hit the mir and of course we dont know why donald sutherland is stumbling around drunkenly throughout here its just hey lets chase these people around with some robots the acting is below average even from the likes of curtis youre more likely to get a kick out of her work in halloween h20 sutherland is wasted and baldwin well hes acting like a

## Convert the labels and documents into numpy arrays

In [62]:
# NumPy arrays can be indexed with an array of positions, which the shuffling
# and train/test selection further down rely on.
documents = np.asarray(documents)
labels = np.asarray(labels)

# Shuffle both the documents and the labels

In [63]:
# Draw the permutation from a seeded generator so the shuffle is identical on
# every run, and apply the same permutation to both arrays so each document
# stays paired with its label.
rng = np.random.RandomState(42)
p = rng.permutation(documents.shape[0])
documents = documents[p]
labels = labels[p]

# Transform the documents to numeric arrays

In [64]:
# Fit a TF-IDF vectorizer on the cleaned documents and encode them as the
# feature matrix X; the labels become the target y.
# NOTE(review): the vectorizer is fitted on all documents, including those that
# are later held out for testing - consider fitting on the training part only.
transformer = TfidfVectorizer()
X = transformer.fit_transform(documents)
y = labels

# Split the data into a training set and test set

In [65]:
# Hold out 10% of the samples for testing with a stratified, seeded split.
# Only the first generated split is consumed.
splitter = StratifiedShuffleSplit(n_splits=2, test_size=0.1, random_state=1)
train_i, test_i = next(splitter.split(X, y))

X_train, y_train = X[train_i], y[train_i]
X_test, y_test = X[test_i], y[test_i]

# Training the model

In [66]:
# Fit a logistic-regression classifier with default settings on the training part.
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Evaluate the performance of the model

In [67]:
# Accuracy: the fraction of held-out samples whose predicted label equals the true one.
y_pred = model.predict(X_test)
accuracy = np.mean(y_pred == y_test)
accuracy

0.82499999999999996

# Predicting the value of a new review

In [68]:
# Sample review text that is run through the trained model in the next cell.
new_review = """Back To The Future is such an inventive and exciting piece of filmmaking that it is impossible to forget about it. The casting of every character involved was absolutely perfect, and the performances were spectacular. I first saw this film when I was six years old, and it is the only movie that I know of that I don't think I could ever get sick of.

One of the best things about Back To The Future is that it really makes you think. You can have in-depth conversations about the plausibility of the story-line and spend hours discussing the film. The story is awesome, it is delivered cleverly and entertainingly, and the movie as a whole is just so much FUN that it can be forgiven whatever number of time-travel discrepancies that may be found in the plot. 

I cannot recommend this movie enough. I have never found a movie other than this one that provides such a huge dose of great fun, and it has the desirable quality where you are able to see something new each time you see it. The only problem is that they stopped at part III."""
new_review

"Back To The Future is such an inventive and exciting piece of filmmaking that it is impossible to forget about it. The casting of every character involved was absolutely perfect, and the performances were spectacular. I first saw this film when I was six years old, and it is the only movie that I know of that I don't think I could ever get sick of.\n\nOne of the best things about Back To The Future is that it really makes you think. You can have in-depth conversations about the plausibility of the story-line and spend hours discussing the film. The story is awesome, it is delivered cleverly and entertainingly, and the movie as a whole is just so much FUN that it can be forgiven whatever number of time-travel discrepancies that may be found in the plot. \n\nI cannot recommend this movie enough. I have never found a movie other than this one that provides such a huge dose of great fun, and it has the desirable quality where you are able to see something new each time you see it. The onl

In [69]:
# Apply the same cleaning and the already-fitted vectorizer (transform, not
# fit_transform) so the new review is encoded with the vocabulary the model saw.
# Dedicated names are used so the dataset-wide `X` and the `y_pred` array from
# the earlier cells are not overwritten; clobbering them would break re-running
# the split and evaluation cells.
cleaned_text = clean_documents([new_review])
X_new = transformer.transform(cleaned_text)
new_pred = model.predict(X_new)[0]

# Label 1 was assigned to the positive reviews and 0 to the negative ones.
print("I think this is a {} review".format('positive' if new_pred else 'negative'))

I think this is a positive review
