# Machine Learning Workflow
Work through the steps below to build the machine learning workflow for this classifier.

In [5]:
# Fetch the two NLTK resources named below; the call's return value is echoed as the cell output.
# Presumably 'punkt' backs word_tokenize and 'wordnet' backs WordNetLemmatizer, both used
# by tokenize() further down -- TODO confirm against the installed NLTK version.
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/antelinvestigacionydesarrollo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/antelinvestigacionydesarrollo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [7]:
# Pattern used by tokenize() to locate http/https URLs in a message.
# Written as a raw string so the regex escapes `\(` and `\)` reach `re` untouched instead of
# being parsed as (invalid) string escape sequences, which newer Python versions warn about.
# NOTE(review): inside the character class, `$-_` is a character *range*, not three literals;
# it is what allows characters such as '/' to match, so it is deliberately left as is.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    """Read the corporate messaging CSV and return the usable messages with their labels.

    Only rows whose "category:confidence" equals 1 and whose "category" is not
    'Exclude' are kept.

    Returns:
        tuple: (X, y) where X holds the values of the "text" column and y the
        values of the "category" column for the retained rows.
    """
    messages = pd.read_csv('corporate_messaging.csv', encoding='latin-1')

    # Build the two row filters separately so each condition is easy to read.
    fully_confident = messages["category:confidence"] == 1
    not_excluded = messages['category'] != 'Exclude'
    selected = messages[fully_confident & not_excluded]

    return selected["text"].values, selected["category"].values

def tokenize(text):
    """Turn a raw message into a list of normalized tokens.

    Every URL matched by ``url_regex`` is replaced with the literal string
    "urlplaceholder"; the text is then split with ``word_tokenize`` and each
    token is lemmatized, lower-cased and stripped of surrounding whitespace.

    Args:
        text (str): the message to tokenize.

    Returns:
        list: the cleaned tokens, in their original order.
    """
    # Swap each detected URL for a fixed placeholder token.
    for found_url in re.findall(url_regex, text):
        text = text.replace(found_url, "urlplaceholder")

    lemmatizer = WordNetLemmatizer()

    # NOTE(review): lemmatization runs before lower-casing, so capitalized
    # tokens may not be reduced to their lemma -- confirm this is intended.
    return [
        lemmatizer.lemmatize(word).lower().strip()
        for word in word_tokenize(text)
    ]

### Step 1: Load data and perform a train test split

In [9]:
# train_test_split lives in sklearn.model_selection; the legacy sklearn.cross_validation
# module has been removed from scikit-learn, so it is only a fallback for very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Load the message texts (X) and their category labels (y)
X, y = load_data()

# Hold out 30% of the messages for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

# The three stages: token counts -> tf-idf weights -> classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
clf = LogisticRegression()

# Fit the vectorizer and the tf-idf transformer on the training text only,
# then train the classifier on the resulting tf-idf matrix
X_train_t = tfidf.fit_transform(vect.fit_transform(X_train))
clf.fit(X_train_t, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same `CountVectorizer` and `TfidfTransformer`.
* Predict labels on these tfidf values.

In [16]:
# Push the test messages through the already-fitted vectorizer and tf-idf
# transformer (transform only, nothing is refit here)
X_test_t = tfidf.transform(vect.transform(X_test))

# Predict a label for each test message
y_pred = clf.predict(X_test_t)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [27]:
# Peek at the first five rows of the transformed test matrix
X_test_t[:5]

<5x5310 sparse matrix of type '<class 'numpy.float64'>'
	with 79 stored elements in Compressed Sparse Row format>

In [19]:
# Peek at the first five raw training messages
X_train[:5]

array(['#NYC can be a place that will attract &amp  create great workforce talent. #NYCTechEconomy',
       "We think it's a shame  Cake to Bake  didn't make it through. #Eurovision #GoodFoodGoodLife",
       'Mauricio CÌÄåÁrdenas "Good education, good nutrition&amp;physical activity give tremendous return in terms of healthier population" #wef2014 #davos',
       '@angry_veteran The tests have reconfirmed the data from our factory. #Nestle NAN H.A. 1 Gold is safe. http://t.co/OsLjuDg1',
       'MerckEngage has a brand new look! Visit the redesigned site and get healthy living tips and information: http://t.co/MMHVE6dcQz'],
      dtype=object)

In [29]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# List the distinct class labels present in the training targets
np.unique(y_train).tolist()

['Action', 'Dialogue', 'Information']

In [30]:
# Use the classes seen in training as the row/column order of the confusion matrix
labels = np.unique(y_train).tolist()
# `labels` is passed by keyword: current scikit-learn releases accept only the two
# label arrays positionally and reject a positional `labels` argument.
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = accuracy_score(y_test, y_pred)

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action', 'Dialogue', 'Information']
Confusion Matrix:
 [[ 70   0  64]
 [  0  22   3]
 [  2   0 560]]
Accuracy: 0.9042995839112344


# Final Step: Refactor
Organize these steps into the following functions.

In [31]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# train_test_split lives in sklearn.model_selection; the legacy sklearn.cross_validation
# module has been removed from scikit-learn, so it is only a fallback for very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

def display_results(y_true=None, y_predicted=None, labels=None):
    """Print the label order, confusion matrix and accuracy for a set of predictions.

    Args:
        y_true: true labels. When omitted, the notebook-level ``y_test`` is used.
        y_predicted: predicted labels. When omitted, the notebook-level
            ``y_pred`` is used.
        labels: class order for the confusion matrix rows/columns. When
            omitted, the sorted unique values of the notebook-level
            ``y_train`` are used.

    Returns:
        None. The results are printed.

    Raises:
        NameError: if an argument is omitted and the corresponding
            notebook-level variable has not been defined.
    """
    # Falling back to the notebook globals keeps argument-less calls working
    # exactly as before; passing values explicitly avoids relying on them.
    if y_true is None:
        y_true = y_test
    if y_predicted is None:
        y_predicted = y_pred
    if labels is None:
        labels = np.unique(y_train).tolist()

    # `labels` is passed by keyword: current scikit-learn releases reject it
    # as a third positional argument.
    confusion_mat = confusion_matrix(y_true, y_predicted, labels=labels)
    accuracy = accuracy_score(y_true, y_predicted)

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    """Run the full workflow: load, split, fit, predict and report.

    All intermediate objects stay local to this function; the evaluation is
    printed through ``display_results`` using this run's own split and
    predictions, so the report does not depend on notebook-level variables.

    Returns:
        None.
    """
    # load data
    X, y = load_data()

    # perform train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)

    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = LogisticRegression()

    # Fit and/or transform each to the data
    X_train_t = vect.fit_transform(X_train)
    X_train_t = tfidf.fit_transform(X_train_t)
    clf.fit(X_train_t, y_train)

    # Transform test data (no refitting)
    X_test_t = vect.transform(X_test)
    X_test_t = tfidf.transform(X_test_t)

    # Predict test labels
    y_pred = clf.predict(X_test_t)

    # Report on this run's results; previously they were computed and discarded
    display_results(y_test, y_pred, labels=np.unique(y_train).tolist())

In [32]:
# Run the refactored workflow end to end (load, split, fit, predict)
main()

In [33]:
# Print labels, confusion matrix and accuracy.
# NOTE(review): called without arguments, this reads y_train, y_test and y_pred from
# notebook scope; main() keeps its variables local, so those values come from the
# earlier step cells.
display_results()

Labels: ['Action', 'Dialogue', 'Information']
Confusion Matrix:
 [[ 70   0  64]
 [  0  22   3]
 [  2   0 560]]
Accuracy: 0.9042995839112344
