In [1]:
import pandas as pd
import numpy as np


df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = 'latin-1', 
                 names = ['sentiment', 'id', 'date','query', 'user_name', 'tweet'])

df.loc[0:1]

Unnamed: 0,sentiment,id,date,query,user_name,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...


In [2]:
pd.set_option('display.max_colwidth', -1)
df.drop(columns = ['id', 'date', 'query', 'user_name'], inplace = True)

df['sentiment'].value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

# Transform special characters

1) @random_user_name --> @

2) http{blah blah} --> http

3) www{ blah blah} --> www

4) {blah blah}com --> com

5) # --> hashtag

In [3]:
%%time
def transform_special_characters(df, text_column):
    '''Takes a DataFrame, parses through the text_column
    and transfroms http:... -> http
    www.blahblah... -> www
    #hashtag -> hashtag
    @user_name -> at_symbol'''
    
    transforms = {r'http[^\s]+': 'http',
                 r'\#\w+': 'hashtag',
                 r' www[^\s]+': 'www',
                 r'\@\w+': 'at_symbol',
                 r'[^\s]+\.com': 'com'}
    
    df[text_column].replace(regex = transforms, inplace = True)
    return None

transform_special_characters(df, 'tweet')

CPU times: user 16.3 s, sys: 31.8 ms, total: 16.4 s
Wall time: 16.4 s


Here, we perform standard steps, by converting into lower case and tokenizing.

1) Convert into lower case

2) Tokenize the words

In [4]:
%%time
from nltk.tokenize import TreebankWordTokenizer

def set_to_lower(df, text_column):
    df[text_column] = df[text_column].str.lower()
    return None

def tokenize(df, text_column):
    tok = TreebankWordTokenizer()    
    
    start = 0
    end = 20000
    num_tweets = len(df)
    for i in range(num_tweets//end + 1):
        df[text_column][start:end] = df[text_column][start:end].apply(lambda row: tok.tokenize(row))
        start = end
        if end + 20000 < num_tweets:
            end = end + 20000
        else:
            end = num_tweets
    
    return None

set_to_lower(df, 'tweet')
tokenize(df, 'tweet')
df['tweet']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


CPU times: user 2min 23s, sys: 958 ms, total: 2min 24s
Wall time: 2min 29s


0          [at_symbol, http, -, awww, ,, that, 's, a, bummer., you, shoulda, got, david, carr, of, third, day, to, do, it., ;, d]                    
1          [is, upset, that, he, ca, n't, update, his, facebook, by, texting, it, ..., and, might, cry, as, a, result, school, today, also., blah, !]
2          [at_symbol, i, dived, many, times, for, the, ball., managed, to, save, 50, %, the, rest, go, out, of, bounds]                             
3          [my, whole, body, feels, itchy, and, like, its, on, fire]                                                                                 
4          [at_symbol, no, ,, it, 's, not, behaving, at, all., i, 'm, mad., why, am, i, here, ?, because, i, ca, n't, see, you, all, over, there, .] 
                                                                             ...                                                                     
1599995    [just, woke, up., having, no, school, is, the, best, feeling, ever]                      

After tokenizing, we use the WordNetLemmatizer in order to make our vocabulary more robust. We choose not to choose a stemmer, because stemmer is quite more crude with respect to words. Let's see if this works!

In [5]:
from nltk.stem import WordNetLemmatizer

def lemmatize(df, text_column):
    
    lemmatizer = WordNetLemmatizer()
    for row in df[text_column]:
        i = 0
        n = len(row)
        while i < n:
            row[i] = lemmatizer.lemmatize(row[i])
            i += 1
    return None

lemmatize(df, 'tweet')
print(WordNetLemmatizer().lemmatize('behaving'))
df['tweet']

behaving


0          [at_symbol, http, -, awww, ,, that, 's, a, bummer., you, shoulda, got, david, carr, of, third, day, to, do, it., ;, d]                   
1          [is, upset, that, he, ca, n't, update, his, facebook, by, texting, it, ..., and, might, cry, a, a, result, school, today, also., blah, !]
2          [at_symbol, i, dived, many, time, for, the, ball., managed, to, save, 50, %, the, rest, go, out, of, bound]                              
3          [my, whole, body, feel, itchy, and, like, it, on, fire]                                                                                  
4          [at_symbol, no, ,, it, 's, not, behaving, at, all., i, 'm, mad., why, am, i, here, ?, because, i, ca, n't, see, you, all, over, there, .]
                                                                             ...                                                                    
1599995    [just, woke, up., having, no, school, is, the, best, feeling, ever]                            

In [6]:
vocabulary = {}

for row in df['tweet']:
    for word in row:
        if word in vocabulary:
            vocabulary[word] += 1
        else:
            vocabulary[word] = 1

In [7]:
uniques = 0
for (key,value) in vocabulary.items():
    if value == 1:
        uniques += 1
print("Number of unique Words:", uniques)
print("Number of words in total:", len(vocabulary))

Number of unique Words: 282581
Number of words in total: 420459


That shows that we have 282581 unique words in our vocabulary. For convenience and faster convergence, we will delete these words and replace them with UNKNOWN. 

In [8]:
unknown_words = set({})

for (key, value) in vocabulary.items():
    if value == 1:
        unknown_words.add(key)
vocabulary['UNKNOWN'] = len(unknown_words)
print(len(vocabulary), len(unknown_words))

420460 282581


In [9]:
def replace_with_unknown(df, text_column, vocabulary):
    '''Takes df and text_column to access df[text_column].
    Then parses each row, and replaces every word that has count = 1 
    with the word UNKNOWN. (Of course we picked capitals because unknown
    may belong in the vocabulary)'''
    
    for row in df[text_column]:
        i = 0
        n = len(row)
        while i < n:
            if vocabulary[row[i]] == 1:
                row[i] = 'UNKNOWN'
            i += 1
    return None

replace_with_unknown(df, 'tweet', vocabulary)
df.loc[:10]

Unnamed: 0,sentiment,tweet
0,0,"[at_symbol, http, -, awww, ,, that, 's, a, bummer., you, shoulda, got, david, carr, of, third, day, to, do, it., ;, d]"
1,0,"[is, upset, that, he, ca, n't, update, his, facebook, by, texting, it, ..., and, might, cry, a, a, result, school, today, also., blah, !]"
2,0,"[at_symbol, i, dived, many, time, for, the, ball., managed, to, save, 50, %, the, rest, go, out, of, bound]"
3,0,"[my, whole, body, feel, itchy, and, like, it, on, fire]"
4,0,"[at_symbol, no, ,, it, 's, not, behaving, at, all., i, 'm, mad., why, am, i, here, ?, because, i, ca, n't, see, you, all, over, there, .]"
5,0,"[at_symbol, not, the, whole, crew]"
6,0,"[need, a, hug]"
7,0,"[at_symbol, hey, long, time, no, see, !, yes.., rain, a, bit, ,, only, a, bit, lol, ,, i, 'm, fine, thanks, ,, how, 's, you, ?]"
8,0,"[at_symbol, nope, they, did, n't, have, it]"
9,0,"[at_symbol, que, me, UNKNOWN, ?]"


Essentially in the above step we replaced each word that occured once with the word UNKNOWN. The reason is twofold

1) We get rid of 282581 words. This of course will give us a boost in the computing performance

2) A word that appears only one on the text doesn't play any role in the training process of the XGBoost classifier (Or of any classifier for that reason). The classifier cannot understand whether the context is positive and secondly even if it understands (say this word is found on the training set), then on the test set it will not play any role. Moreover, if we generalise the classifier, this unique word has a huge possibility of appearing minimal amount of time. 

Therefore, it makes sense to delete them.

In [10]:
df['tweet'] = df['tweet'].str.join(' ')
df['tweet'][:1]

0    at_symbol http - awww , that 's a bummer. you shoulda got david carr of third day to do it. ; d
Name: tweet, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

labels = df['sentiment'].to_numpy()/4; del df['sentiment']
X = df['tweet'].to_numpy()


vectorizer = CountVectorizer(stop_words = stopwords.words('english'))
X_vectorized = vectorizer.fit_transform(X)
X_vectorized

<1600000x99029 sparse matrix of type '<class 'numpy.int64'>'
	with 11790520 stored elements in Compressed Sparse Row format>

In [13]:
%%time
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import joblib
from sklearn.metrics import accuracy_score


X_train, X_test, y_train, y_test = train_test_split(X_vectorized, labels, train_size = 0.75)

model = xgb.XGBClassifier()

parameters = {"max_depth":[7, 8], "n_estimators":[700,900,1100]}
cv = GridSearchCV(model, parameters, scoring = 'accuracy', n_jobs=4,verbose = True)

with joblib.parallel_backend('threading',n_jobs = 4):
    cv.fit(X_train, y_train)

best = cv.best_estimator_
print('Best model: ', best)
print('Test score: ', accuracy_score(y_test, best.predict(X_test)))



Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed: 321.0min finished


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator'

For the record (just to be funny): The execution of this cell lasted about 6 hours (17:50 - 12:00). It was extremely dissapointing to notice around 23:00 that on the line 'best = cv.best_estimator_' I had actually forgotten the last _. (Now this is fixed, in case someone wants to reproduce the result)

Of course this doesn't harm the script since the cv object is of interest and this would be saved anyways.

In [14]:
best = cv.best_estimator_
results = cv.cv_results_
cv.best_score_

0.7713816666666666

In [15]:
best

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=1100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [17]:
accuracy_score(y_test, best.predict(X_test))

0.771505

In [19]:
pd.DataFrame(results, columns = results.keys())

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,3356.768337,29.678433,45.068923,7.671434,7,700,"{'max_depth': 7, 'n_estimators': 700}",0.760758,0.761502,0.759779,0.76068,0.000706,6
1,3942.239021,230.673356,66.583686,12.710741,7,900,"{'max_depth': 7, 'n_estimators': 900}",0.766043,0.766485,0.765167,0.765898,0.000548,4
2,4626.904127,9.105055,156.427061,53.091106,7,1100,"{'max_depth': 7, 'n_estimators': 1100}",0.769523,0.769707,0.768349,0.769193,0.000601,2
3,3345.248359,31.540824,44.278647,1.307694,8,700,"{'max_depth': 8, 'n_estimators': 700}",0.763466,0.763975,0.763229,0.763557,0.000311,5
4,4296.554097,57.746276,106.108022,36.932765,8,900,"{'max_depth': 8, 'n_estimators': 900}",0.768201,0.769192,0.767337,0.768243,0.000758,3
5,4044.082176,867.938469,47.047902,16.822773,8,1100,"{'max_depth': 8, 'n_estimators': 1100}",0.771843,0.771863,0.770439,0.771382,0.000666,1


# Some Conclusions:

Looking at the results table we see 

1) The models with the biggest number of estimators scored the highest. Rank 1, 2 have 1100 estimators, rank 3,4 have 900 and rank 5,6 have 700.

2) The model that scored highest was the most complex one. That suggests, that running an iteration with depth = 8 and more iterators or depth  9 plus more iterators AND playing with other hyperparameters, may give us some better results. Due to my limited computing capacity (laptop) I refrain for the time being from doing it. However, I save the model for later use.



In [39]:
import pickle

pickle.dump(best, open("Twitter_XGBooster.pickle.dat", "wb"))

experimental_model = pickle.load(open("Twitter_XGBooster.pickle.dat", "rb"))

accuracy_score(experimental_model.predict(X_test), y_test)

0.771505

Last step is to save the model using pickle. You can reuse the model by the above code snipet.