# Create a Custom Transformer

In [1]:
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to /home/mickie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mickie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mickie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

In [6]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

### Implement the StartingVerbExtractor class

In [22]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        # tokenize by sentences
        sentence_list = nltk.sent_tokenize(text)
        print(sentence_list)
        for sentence in sentence_list:
            # tokenize each sentence into words and tag part of speech
            pos_tags = nltk.pos_tag(tokenize(sentence))
            print(pos_tags)
            print('\n')

            # index pos_tags to get the first word and part of speech tag
            first_word, first_tag = pos_tags[0]
            
            # return true if the first word is an appropriate verb or RT for retweet
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True

            return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        # apply starting_verb function to all values in X
        X_tagged = pd.Series(X).apply(self.starting_verb)

        return pd.DataFrame(X_tagged)

### Run program to test

In [23]:
def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y


def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


def model_pipeline():
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', RandomForestClassifier())
    ])

    return pipeline


def display_results(y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = model_pipeline()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(y_test, y_pred)

main()

['RT @wealthinsights: WSJ: Philanthropy as Enterprise http://ow.ly/Juyz']
[('rt', 'NN'), ('@', 'JJ'), ('wealthinsights', 'NNS'), (':', ':'), ('wsj', 'NN'), (':', ':'), ('philanthropy', 'NN'), ('a', 'DT'), ('enterprise', 'NN'), ('urlplaceholder', 'NN')]


['In Mexico, Danone Semilla project plans to create 4,000 jobs in microdistribution by 2015 #social_innovation http://t.co/7oWctKhH']
[('in', 'IN'), ('mexico', 'NN'), (',', ','), ('danone', 'NN'), ('semilla', 'NN'), ('project', 'NN'), ('plan', 'NN'), ('to', 'TO'), ('create', 'VB'), ('4,000', 'CD'), ('job', 'NN'), ('in', 'IN'), ('microdistribution', 'NN'), ('by', 'IN'), ('2015', 'CD'), ('#', '#'), ('social_innovation', 'NN'), ('urlplaceholder', 'NN')]


['Corporate Responsibility at Merck: $23.8 million investment in partnerships to address underlying barriers to health.', '#BeWell']
[('corporate', 'JJ'), ('responsibility', 'NN'), ('at', 'IN'), ('merck', 'NN'), (':', ':'), ('$', '$'), ('23.8', 'CD'), ('million', 'CD'), ('investment', 'N

[('the', 'DT'), ('new', 'JJ'), ('science', 'NN'), ('fair', 'JJ'), ('section', 'NN'), ('of', 'IN'), ('think', 'NN'), ('science', 'NN'), ('now', 'RB'), ('highlight', 'VBZ'), ('a', 'DT'), ('lot', 'NN'), ('of', 'IN'), ('fun', 'NN'), ('fact', 'NN'), ('about', 'IN'), ('science', 'NN'), ('and', 'CC'), ('medicine', 'NN'), ('urlplaceholder', 'NN')]


['RT Welcome to @PfizerLifeUK.', 'Pfizer Life provides tailored information, helping you to proactively manage your #health']
[('rt', 'NN'), ('welcome', 'NN'), ('to', 'TO'), ('@', 'VB'), ('pfizerlifeuk', 'NN'), ('.', '.')]


['William Ringo, Pfizer\x89Ûªs Senior Vice President, Worldwide Business Development, Strategy & Innovation, To Retire']
[('william', 'NN'), ('ringo', 'NN'), (',', ','), ('pfizer\x89ûªs', 'JJ'), ('senior', 'JJ'), ('vice', 'NN'), ('president', 'NN'), (',', ','), ('worldwide', 'NN'), ('business', 'NN'), ('development', 'NN'), (',', ','), ('strategy', 'NN'), ('&', 'CC'), ('innovation', 'NN'), (',', ','), ('to', 'TO'), ('retire', '

[('latino', 'NN'), ('usa', 'JJ'), ('anchor', 'NN'), ('maria', 'NNS'), ('hinojosa', 'VBP'), ('join', 'NN'), ('pfizer', 'NN'), ('in', 'IN'), ('hosting', 'VBG'), ('a', 'DT'), ('town', 'NN'), ('hall', 'NN'), ('in', 'IN'), ('la', 'NN'), ('to', 'TO'), ('focus', 'VB'), ('attention', 'NN'), ('on', 'IN'), ('cancer', 'NN'), ('in', 'IN'), ('the', 'DT'), ('latino', 'NN'), ('community', 'NN')]


['Recent studies indicate that 45% of residents of elderly homes are malnourished: http://t.co/AouKkLdt']
[('recent', 'JJ'), ('study', 'NN'), ('indicate', 'VBP'), ('that', 'IN'), ('45', 'CD'), ('%', 'NN'), ('of', 'IN'), ('resident', 'NN'), ('of', 'IN'), ('elderly', 'JJ'), ('home', 'NN'), ('are', 'VBP'), ('malnourished', 'VBN'), (':', ':'), ('urlplaceholder', 'NN')]


['This Global Community Day, 3,800 #CitiVolunteers in #Taiwan will give back by spending time w/ orphans &amp  underprivileged children.']
[('this', 'DT'), ('global', 'JJ'), ('community', 'NN'), ('day', 'NN'), (',', ','), ('3,800', 'CD'), ('#',

[('pfizer', 'NN'), ('global', 'JJ'), ('health', 'NN'), ('fellows', 'NNS'), ('&', 'CC'), ('amp', 'JJ'), ('accordia', 'NN'), ('global', 'JJ'), ('health', 'NN'), ('foundation', 'NN'), ('recognize', 'VB'), ('10', 'CD'), ('year', 'NN'), ('of', 'IN'), ('improving', 'VBG'), ('healthcare', 'NN'), ('in', 'IN'), ('africa', 'NN'), ('urlplaceholder', 'NN')]


['#Nestle CEO  A strong portfolio is driven by innovation, by putting the right resources behind the right things  2013 full-year results']
[('#', '#'), ('nestle', 'JJ'), ('ceo', 'NN'), ('a', 'DT'), ('strong', 'JJ'), ('portfolio', 'NN'), ('is', 'VBZ'), ('driven', 'VBN'), ('by', 'IN'), ('innovation', 'NN'), (',', ','), ('by', 'IN'), ('putting', 'VBG'), ('the', 'DT'), ('right', 'JJ'), ('resource', 'NN'), ('behind', 'IN'), ('the', 'DT'), ('right', 'JJ'), ('thing', 'NN'), ('2013', 'CD'), ('full-year', 'JJ'), ('result', 'NN')]


['Pfizer helps @RareDiseases launch a new resource to educate on Gaucher disease.', 'Learn more: http://t.co/5MiN0DFaFQ'

[('this', 'DT'), ('month\x89ûªs', 'JJ'), ('white', 'JJ'), ('paper', 'NN'), ('explores', 'NNS'), ('barclays', 'VBZ'), ('new', 'JJ'), ('entrepreneurs', 'NNS'), ('index', 'NN'), ('which', 'WDT'), ('map', 'VBZ'), ('business', 'NN'), ('activity', 'NN'), ('and', 'CC'), ('wealth', 'NN'), ('creation', 'NN'), ('.', '.')]


['1st panel session starting now  Creating Shared Value: beyond philantropy &amp  corporate social responsibility  http://t.co/8Z57RcCC #Nestlecsv']
[('1st', 'CD'), ('panel', 'NN'), ('session', 'NN'), ('starting', 'VBG'), ('now', 'RB'), ('creating', 'VBG'), ('shared', 'VBN'), ('value', 'NN'), (':', ':'), ('beyond', 'IN'), ('philantropy', 'NN'), ('&', 'CC'), ('amp', 'JJ'), ('corporate', 'JJ'), ('social', 'JJ'), ('responsibility', 'NN'), ('urlplaceholder', 'JJ'), ('#', '#'), ('nestlecsv', 'NN')]


['J. Baensch: We rely on a global Innovation, Technology and R&D network #Nestle #NestleIR']
[('j.', 'NN'), ('baensch', 'NN'), (':', ':'), ('we', 'PRP'), ('rely', 'VBP'), ('on', 'IN')

[('barbara', 'NN'), ('rolls', 'NNS'), (':', ':'), ('``', '``'), ('what', 'WP'), ('we', 'PRP'), ('need', 'VBP'), ('to', 'TO'), ('consider', 'VB'), ('is', 'VBZ'), ('that', 'IN'), ('the', 'DT'), ('water', 'NN'), ('content', 'NN'), ('of', 'IN'), ('food', 'NN'), ('ha', 'NN'), ('the', 'DT'), ('biggest', 'JJS'), ('impact', 'NN'), ('on', 'IN'), ('energy', 'NN'), ('density', 'NN'), ('.', '.'), ("''", "''")]


['The economic recovery \x89ÛÏis fairly mediocre by historical standards, says Michael Dicks http://ow.ly/UcvA']
[('the', 'DT'), ('economic', 'JJ'), ('recovery', 'NN'), ('\x89ûïis', 'VBD'), ('fairly', 'RB'), ('mediocre', 'VBN'), ('by', 'IN'), ('historical', 'JJ'), ('standard', 'NN'), (',', ','), ('say', 'VBP'), ('michael', 'NN'), ('dicks', 'NNS'), ('urlplaceholder', 'VBP')]


['Paul Bulcke  We commit to be being part of the solution  #wef2014 #health']
[('paul', 'NN'), ('bulcke', 'NN'), ('we', 'PRP'), ('commit', 'VBP'), ('to', 'TO'), ('be', 'VB'), ('being', 'VBG'), ('part', 'NN'), ('of', '

[('today', 'NN'), ('is', 'VBZ'), ('children\x89ûªs', 'JJ'), ('health', 'NN'), ('day', 'NN'), ('.', '.')]


['Banks Caught In How Many Mortgage Settlement Violations?', ': The government-appointed monitor ove... http://t.co/9DUjSrvOtX #Citigroup #BRK']
[('banks', 'NNS'), ('caught', 'VBD'), ('in', 'IN'), ('how', 'WRB'), ('many', 'JJ'), ('mortgage', 'NN'), ('settlement', 'NN'), ('violations', 'NNS'), ('?', '.')]


['Watch the moving story of @Parinaam assisting the poor w/ support that will help them for a lifetime: http://t.co/qrwrDjslxb #FTCitiAwards']
[('watch', 'VB'), ('the', 'DT'), ('moving', 'VBG'), ('story', 'NN'), ('of', 'IN'), ('@', 'NNP'), ('parinaam', 'NN'), ('assisting', 'VBG'), ('the', 'DT'), ('poor', 'JJ'), ('w/', 'JJ'), ('support', 'NN'), ('that', 'WDT'), ('will', 'MD'), ('help', 'VB'), ('them', 'PRP'), ('for', 'IN'), ('a', 'DT'), ('lifetime', 'NN'), (':', ':'), ('urlplaceholder', 'JJ'), ('#', '#'), ('ftcitiawards', 'NNS')]


['This Is What Will Boost Your Bank Stock Right 

['In #Nigeria to celebrate Global Community Day, 725 #CitiVolunteers are working w/ students to support their academic performance &amp  more.']
[('in', 'IN'), ('#', '#'), ('nigeria', 'NNS'), ('to', 'TO'), ('celebrate', 'VB'), ('global', 'JJ'), ('community', 'NN'), ('day', 'NN'), (',', ','), ('725', 'CD'), ('#', '#'), ('citivolunteers', 'NNS'), ('are', 'VBP'), ('working', 'VBG'), ('w/', 'JJ'), ('student', 'NN'), ('to', 'TO'), ('support', 'VB'), ('their', 'PRP$'), ('academic', 'JJ'), ('performance', 'NN'), ('&', 'CC'), ('amp', 'NN'), ('more', 'RBR'), ('.', '.')]


['Watching the globes 2nite?', 'Don\x89Ûªt miss @RevolutionFoods TV debut \x89ÛÒ #progressmakers serving #healthy kid-inspired food!', 'http://t.co/bdaQW0J12q']
[('watching', 'VBG'), ('the', 'DT'), ('globe', 'NN'), ('2nite', 'CD'), ('?', '.')]


['Barclays Wealth Jersey Community Awards winners announced at presentation.', 'http://ow.ly/33Eai']
[('barclays', 'NNS'), ('wealth', 'VBP'), ('jersey', 'NN'), ('community', 'NN'), ('a

[('barclays', 'NNS'), ('wealth', 'VBP'), ('wa', 'NN'), ('named', 'VBN'), ('wealth', 'NN'), ('manager', 'NN'), ('of', 'IN'), ('the', 'DT'), ('year', 'NN'), ('for', 'IN'), ('the', 'DT'), ('third', 'JJ'), ('consecutive', 'JJ'), ('year', 'NN'), ('at', 'IN'), ('the', 'DT'), ('2011', 'CD'), ('global', 'JJ'), ('investor', 'NN'), ('awards', 'NNS'), ('.', '.')]


['#Food is a daily need for each one of us... http://t.co/K8azSSom']
[('#', '#'), ('food', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('daily', 'JJ'), ('need', 'NN'), ('for', 'IN'), ('each', 'DT'), ('one', 'CD'), ('of', 'IN'), ('u', 'NN'), ('...', ':'), ('urlplaceholder', 'NN')]


["It's #Halloween!", 'How about starting your day with a spooky cup of @Nescafe to get you in the right mood!', 'http://t.co/4ahX2stva4']
[('it', 'PRP'), ("'s", 'VBZ'), ('#', '#'), ('halloween', 'NN'), ('!', '.')]


['Enrobing delicate foods such as ice-cream in chocolate can help to hold their shape.', '#ChocolateCoveredFacts']
[('enrobing', 'VBG'), ('delicate', 'JJ

[('barclays', 'NNS'), ('renews', 'VBZ'), ('it', 'PRP'), ('sponsorship', 'NN'), ('of', 'IN'), ('the', 'DT'), ('premier', 'NN'), ('league', 'NN'), ('urlplaceholder', 'NN')]


["Pfizer's Pat Ford, Sr Dir, Global Security, on email spammers in cyberspace: http://on.pfizer.com/hLiGu8"]
[('pfizer', 'NN'), ("'s", 'POS'), ('pat', 'NN'), ('ford', 'NN'), (',', ','), ('sr', 'JJ'), ('dir', 'NN'), (',', ','), ('global', 'JJ'), ('security', 'NN'), (',', ','), ('on', 'IN'), ('email', 'JJ'), ('spammer', 'NN'), ('in', 'IN'), ('cyberspace', 'NN'), (':', ':'), ('urlplaceholder', 'NN')]


['#Danone with @iofbonehealth organized a walking day for #bonehealth in Brazil &amp; encouraged 20,000 people to take care of their #bones']
[('#', '#'), ('danone', 'NN'), ('with', 'IN'), ('@', 'NNP'), ('iofbonehealth', 'NN'), ('organized', 'VBD'), ('a', 'DT'), ('walking', 'JJ'), ('day', 'NN'), ('for', 'IN'), ('#', '#'), ('bonehealth', 'NN'), ('in', 'IN'), ('brazil', 'NN'), ('&', 'CC'), ('amp', 'NN'), (';', ':'), ('enco



["We're supporting a Dubai public health campaign to offer women advice on steps they can take to prevent osteoporosis: http://t.co/GqZQM9SJ"]
[('we', 'PRP'), ("'re", 'VBP'), ('supporting', 'VBG'), ('a', 'DT'), ('dubai', 'JJ'), ('public', 'JJ'), ('health', 'NN'), ('campaign', 'NN'), ('to', 'TO'), ('offer', 'VB'), ('woman', 'NN'), ('advice', 'NN'), ('on', 'IN'), ('step', 'NN'), ('they', 'PRP'), ('can', 'MD'), ('take', 'VB'), ('to', 'TO'), ('prevent', 'VB'), ('osteoporosis', 'NN'), (':', ':'), ('urlplaceholder', 'NN')]


["[Blogpost]: NutriGo, Danone's project to curb malnutrition among Chinese babies in rural areas http://t.co/AC5osuehc1  cc @danonecommunity"]
[('[', 'NN'), ('blogpost', 'NN'), (']', 'NN'), (':', ':'), ('nutrigo', 'NN'), (',', ','), ('danone', 'NN'), ("'s", 'POS'), ('project', 'NN'), ('to', 'TO'), ('curb', 'VB'), ('malnutrition', 'NN'), ('among', 'IN'), ('chinese', 'JJ'), ('baby', 'NN'), ('in', 'IN'), ('rural', 'JJ'), ('area', 'NN'), ('urlplaceholder', 'NN'), ('cc', 'NN'

[('data', 'NNS'), ('for', 'IN'), ('#', '#'), ('nsclc', 'NN'), ('to', 'TO'), ('be', 'VB'), ('presented', 'VBN'), ('on', 'IN'), ('patient-reported', 'JJ'), ('symptom', 'NN'), (',', ','), ('global', 'JJ'), ('qol', 'NN'), ('&', 'CC'), ('amp', 'JJ'), ('general', 'JJ'), ('health', 'NN'), ('status', 'NN'), ('compared', 'VBN'), ('#', '#'), ('3.400', 'CD'), ('&', 'CC'), ('amp', 'VBD'), ('3.412/hall', 'CD'), ('4', 'CD')]


['.', "@Michelle9647 We're not draining Pakistan\x89Ûªs water.", 'We firmly believe access to water is a human right http://t.co/4NkwV8oLTj']
[('.', '.')]


['Our #iPad app has been included in an interesting list of top investor relations apps http://bit.ly/dSlqvL #IR']
[('our', 'PRP$'), ('#', '#'), ('ipad', 'NN'), ('app', 'NN'), ('ha', 'NN'), ('been', 'VBN'), ('included', 'VBN'), ('in', 'IN'), ('an', 'DT'), ('interesting', 'JJ'), ('list', 'NN'), ('of', 'IN'), ('top', 'JJ'), ('investor', 'NN'), ('relation', 'NN'), ('apps', 'IN'), ('urlplaceholder', 'JJ'), ('#', '#'), ('ir', '

[('treasury', 'NN'), ('pick', 'NN'), ('tries', 'NNS'), ('to', 'TO'), ('cast', 'VB'), ('his', 'PRP$'), ('history', 'NN'), ('a', 'DT'), ('right', 'NN'), ('for', 'IN'), ('the', 'DT'), ('job', 'NN'), (':', ':'), ('jacob', 'NN'), ('lew', 'NN'), (',', ','), ('who', 'WP'), ('face', 'VBP'), ('a', 'DT'), ('senate', 'NN'), ('pa', 'NN'), ('...', ':'), ('urlplaceholder', 'JJ'), ('#', '#'), ('citigroup', 'NN'), ('#', '#'), ('brk', 'NN')]


['Thanks - you too!', 'RT @GaboNYC On our way to #Brooklyn to join #CitiVolunteers in revitalizing a school.', 'Have a great Global Community Day!', '!']
[('thanks', 'NNS'), ('-', ':'), ('you', 'PRP'), ('too', 'RB'), ('!', '.')]


["Did you know we support nearly 300 initiatives which support United Nation's Millennium Development Goals?", 'http://t.co/jxjAvSaD']
[('did', 'VBD'), ('you', 'PRP'), ('know', 'VBP'), ('we', 'PRP'), ('support', 'VBP'), ('nearly', 'RB'), ('300', 'CD'), ('initiative', 'NN'), ('which', 'WDT'), ('support', 'NN'), ('united', 'VBD'), ('natio

[('barclays', 'NNS'), ('announces', 'NNS'), ('that', 'WDT'), ('irene', 'VBP'), ('mcdermott', 'RBS'), ('brown', 'JJ'), ('ha', 'NN'), ('been', 'VBN'), ('appointed', 'VBN'), ('a', 'DT'), ('group', 'NN'), ('human', 'JJ'), ('resources', 'NNS'), ('director', 'NN'), ('urlplaceholder', 'NN')]


[" With the help of Fair Labor Association we've identified 11 areas of recommendations to tackle child labour issue.", 'http://t.co/76Ttq66p']
[('with', 'IN'), ('the', 'DT'), ('help', 'NN'), ('of', 'IN'), ('fair', 'JJ'), ('labor', 'NN'), ('association', 'NN'), ('we', 'PRP'), ("'ve", 'VBP'), ('identified', 'VBN'), ('11', 'CD'), ('area', 'NN'), ('of', 'IN'), ('recommendation', 'NN'), ('to', 'TO'), ('tackle', 'VB'), ('child', 'VB'), ('labour', 'JJ'), ('issue', 'NN'), ('.', '.')]


['Volunteers help to make a difference to rescue animals.', 'http://t.co/NupD1MAJ']
[('volunteers', 'NNS'), ('help', 'VBP'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('difference', 'NN'), ('to', 'TO'), ('rescue', 'VB'), ('anim