# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [1]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /Users/mxagar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mxagar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import confusion_matrix


In [34]:
import chardet
from encodings.aliases import aliases

# Test encodings: get all encodings with which the file can be opened
alias_values = set(aliases.values())
candidate_encodings = []
for alias in alias_values:
    try:
        df = pd.read_csv('corporate_messaging.csv', encoding=alias)
        candidate_encodings.append(alias)
    except:
         pass
candidate_encodings

['mac_turkish',
 'cp862',
 'iso8859_4',
 'mac_greek',
 'iso8859_15',
 'iso8859_13',
 'hp_roman8',
 'cp850',
 'mac_cyrillic',
 'mac_roman',
 'cp775',
 'cp857',
 'cp863',
 'cp437',
 'mac_iceland',
 'cp865',
 'latin_1',
 'cp866',
 'cp855',
 'cp860',
 'cp861',
 'kz1048',
 'iso8859_14',
 'mac_latin2',
 'iso8859_10',
 'iso8859_9',
 'cp852',
 'cp858',
 'iso8859_2',
 'cp1125',
 'iso8859_16',
 'cp1256',
 'iso8859_5',
 'koi8_r',
 'ptcp154',
 'cp864',
 'cp1251']

In [36]:
# Select a part in the text which seems to have issues
# and open & print that part with different encodings
# In this case, all encodings seem to lead to issues...
for encoding in candidate_encodings:
    df = pd.read_csv('corporate_messaging.csv', encoding=encoding)
    print(encoding, df.text.loc[30][50:65])

mac_turkish todayâğ™s multi
cp862 todayי█¬s multi
iso8859_4 todayÛĒs multi
mac_greek todayâέΣs multi
iso8859_15 todayÛªs multi
iso8859_13 todayŪŖs multi
hp_roman8 todayÜˆs multi
cp850 todayë█¬s multi
mac_cyrillic todayЙџ™s multi
mac_roman todayâ€™s multi
cp775 todayē█¬s multi
cp857 todayë█¬s multi
cp863 todayë█¬s multi
cp437 todayë█¬s multi
mac_iceland todayâ€™s multi
cp865 todayë█¬s multi
latin_1 todayÛªs multi
cp866 todayЙ█кs multi
cp855 todayЅ█фs multi
cp860 todayÊ█¬s multi
cp861 todayë█¬s multi
kz1048 today‰ЫҒs multi
iso8859_14 todayÛẂs multi
mac_latin2 todayČŘ™s multi
iso8859_10 todayÛŠs multi
iso8859_9 todayÛªs multi
cp852 todayë█¬s multi
cp858 todayë█¬s multi
iso8859_2 todayŰŞs multi
cp1125 todayЙ█кs multi
iso8859_16 todayÛȘs multi
cp1256 today‰غھs multi
iso8859_5 todayлЊs multi
koi8_r today┴ш╙s multi
ptcp154 todayүЫӘs multi
cp864 today┬¦ﺕs multi
cp1251 today‰ЫЄs multi


In [45]:
# Try encoding with chardet
with open("corporate_messaging.csv", 'rb') as file:
    print(chardet.detect(file.read()))

{'encoding': 'MacRoman', 'confidence': 0.69798054614607, 'language': ''}


In [47]:
df = pd.read_csv('corporate_messaging.csv', encoding='mac_roman')
df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,category,category:confidence,category_gold,id,screenname,text
0,662822308,False,finalized,3,2/18/15 4:31,Information,1.0,,4.36528e+17,Barclays,Barclays CEO stresses the importance of regula...
1,662822309,False,finalized,3,2/18/15 13:55,Information,1.0,,3.86013e+17,Barclays,Barclays announces result of Rights Issue http...
2,662822310,False,finalized,3,2/18/15 8:43,Information,1.0,,3.7958e+17,Barclays,Barclays publishes its prospectus for its Â£5....
3,662822311,False,finalized,3,2/18/15 9:13,Information,1.0,,3.6753e+17,Barclays,Barclays Group Finance Director Chris Lucas is...
4,662822312,False,finalized,3,2/18/15 6:48,Information,1.0,,3.60385e+17,Barclays,Barclays announces that Irene McDermott Brown ...


In [49]:
df.shape

(3118, 11)

In [51]:
df.category.value_counts()

Information    2129
Action          724
Dialogue        226
Exclude          39
Name: category, dtype: int64

In [38]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='mac_roman')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### Step 1: Load data and perform a train test split

In [39]:
# load data
X, y = load_data()

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [40]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
clf = RandomForestClassifier()

# Fit and/or transform each to the data
X_train_counts = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
clf.fit(X_train_tfidf, y_train)


RandomForestClassifier()

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [41]:
# Transform test data
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

# Predict test labels
y_pred = clf.predict(X_test_tfidf)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [42]:
labels = np.unique(y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = (y_pred == y_test).mean()

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 80   0  28]
 [  0  21   5]
 [  3   1 463]]
Accuracy: 0.9384359400998337


# Final Step: Refactor
Organize these steps into the following functions.

In [43]:
def display_results(y_test, y_pred):
    # insert step 4 here
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()
    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

def main():
    # load data
    X, y = load_data()

    # perform train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier()

    # Fit and/or transform each to the data
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)
    clf.fit(X_train_tfidf, y_train)
    
    # Transform test data
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)

    # Predict test labels
    y_pred = clf.predict(X_test_tfidf)
     # display results
    display_results(y_test, y_pred)

In [44]:
# run program
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 86   0  21]
 [  3  26   3]
 [  5   1 456]]
Accuracy: 0.9450915141430949
