In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import datasets
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import PunktSentenceTokenizer

In [16]:
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
labels, y = np.unique(y, return_inverse=True)

X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

In [21]:
model = xgb.XGBClassifier()
print model

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


In [17]:
model.fit(X_train,y_train,eval_metric='rmse')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [20]:
model.predict_proba(X_test)

array([[ 0.65184641,  0.34815356],
       [ 0.74603701,  0.25396299],
       [ 0.32435191,  0.67564809],
       ..., 
       [ 0.71962714,  0.28037286],
       [ 0.08567679,  0.91432321],
       [ 0.01836032,  0.98163968]], dtype=float32)

In [24]:
150/60.0

2.5

In [26]:
X_train.shape

(2000, 10)

In [38]:
def load_data(filename):
    '''
    Load data into a data frame for use in running model
    '''
    return pickle.load(open(filename, 'rb'))


def stem_tokens(tokens, stemmer):
    '''Stem the tokens.'''
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def OHStokenize(text):
    '''Tokenize & stem. Stems automatically for now.
    Leaving "stemmer" out of function call, so it works with TfidfVectorizer'''
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

###########################################################################
# tokenization code

def seperatePunct(incomingString):
    outstr = ''
    characters = set(['!','@','#','$',"%","^","&","*",":","\\",
                  "(",")","+","=","?","\'","\"",";","/",
                  "{","}","[","]","<",">","~","`","|"])

    for char in incomingString:
        if char in characters:
            outstr = outstr + ' ' + char + ' '
        else:
            outstr = outstr + char

    return outstr

def hasNumbers(inputString):
     return any(char.isdigit() for char in inputString)

def text_cleaner(wordList):
    '''
    INPUT: List of words to be tokenized
    OUTPUT: List of tokenized words
    '''

    tokenziedList = []

    for word in wordList:

        #remove these substrings from the word
        word = word.replace('[deleted]','')
        word = word.replace('&gt','')

        #if link, replace with linktag
        if 'http' in word:
            tokenziedList.append('LINK_TAG')
            continue

        #if reference to subreddit, replace with reddittag
        if '/r/' in word:
            tokenziedList.append('SUBREDDIT_TAG')
            continue

        #if reference to reddit user, replace with usertag
        if '/u/' in word:
            tokenziedList.append('USER_TAG')
            continue

        #if reference to twitter user, replace with usertag
        if '@' in word:
            tokenziedList.append('USER_TAG')
            continue

        #if number, replace with numtag
        #m8 is a word, 5'10" and 54-59, 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenziedList.append('NUM_TAG')
            continue

        #seperate puncuations and add to tokenizedList
        newwords = seperatePunct(word).split(" ")
        tokenziedList.extend(newwords)

    return tokenziedList

def mytokenize(comment):
    '''
    Input: takes in a reddit comment as a str or unicode and tokenizes it
    Output: a tokenized list
    '''
    tokenizer = PunktSentenceTokenizer()
    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))

    return text_cleaner(wordList)

In [31]:
path = '../../data/labeledRedditComments2.p'
cvpath = '../../data/twitter_cross_val.csv'

df = pickle.load(open(path, 'rb'))
dfcv = pd.read_csv(cvpath)




In [71]:
#take a subset of the data for testing this code
randNums = np.random.randint(low=0,high=len(df.index),size=(200,1))
rowList = [int(row) for row in randNums]
dfsmall = df.ix[rowList,:]

In [72]:
nf = dfsmall
X = nf.body
y = nf.label

Xcv = dfcv['tweet_text'].values
ycv = dfcv['label'].values

In [73]:
vect = TfidfVectorizer(stop_words='english', decode_error='ignore',
                           tokenizer=mytokenize)


# fit & transform comments matrix
tfidf_X = vect.fit_transform(X)
tfidf_Xcv = vect.transform(Xcv)

In [74]:
print tfidf_X.shape
print tfidf_Xcv.shape

(200, 2384)
(10000, 2384)


In [75]:
tfidf_Xcvd = tfidf_Xcv.todense()

In [76]:
xg_train = xgb.DMatrix(tfidf_X, label=y)
xg_cv = xgb.DMatrix(tfidf_Xcv, label=ycv)
xg_cvd = xgb.DMatrix(tfidf_Xcvd, label=ycv)

In [79]:
print xg_train.num_col()
print xg_cv.num_col()
print xg_cvd.num_col()

2384
2381
2384


In [78]:
type(tfidf_X)

scipy.sparse.csr.csr_matrix