In [1]:
import pandas as pd
import numpy as np


df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = 'latin-1', 
                 names = ['sentiment', 'id', 'date','query', 'user_name', 'tweet'])

df.loc[0:1]

Unnamed: 0,sentiment,id,date,query,user_name,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...


In [2]:
pd.set_option('display.max_colwidth', -1)
df.drop(columns = ['id', 'date', 'query', 'user_name'], inplace = True)

df['sentiment'].value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

# Transform special characters

1) @random_user_name --> @

2) http{blah blah} --> http

3) www{ blah blah} --> www

4) {blah blah}com --> com

5) # --> hashtag

In [3]:
%%time
def transform_special_characters(df, text_column):
    '''Takes a DataFrame, parses through the text_column
    and transfroms http:... -> http
    www.blahblah... -> www
    #hashtag -> hashtag
    @user_name -> at_symbol'''
    
    transforms = {r'http[^\s]+': 'http',
                 r'\#\w+': 'hashtag',
                 r' www[^\s]+': 'www',
                 r'\@\w+': 'at_symbol',
                 r'[^\s]+\.com': 'com'}
    
    df[text_column].replace(regex = transforms, inplace = True)
    return None

transform_special_characters(df, 'tweet')

CPU times: user 15.7 s, sys: 33.7 ms, total: 15.8 s
Wall time: 15.8 s


Here, we perform standard steps, by converting into lower case and tokenizing.

1) Convert into lower case

2) Tokenize the words

In [4]:
%%time
from nltk.tokenize import TreebankWordTokenizer

def set_to_lower(df, text_column):
    df[text_column] = df[text_column].str.lower()
    return None

def tokenize(df, text_column):
    tok = TreebankWordTokenizer()    
    
    start = 0
    end = 20000
    num_tweets = len(df)
    for i in range(num_tweets//end + 1):
        df[text_column][start:end] = df[text_column][start:end].apply(lambda row: tok.tokenize(row))
        start = end
        if end + 20000 < num_tweets:
            end = end + 20000
        else:
            end = num_tweets
    
    return None

set_to_lower(df, 'tweet')
tokenize(df, 'tweet')
df['tweet']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


CPU times: user 2min 22s, sys: 852 ms, total: 2min 23s
Wall time: 2min 29s


0          [at_symbol, http, -, awww, ,, that, 's, a, bummer., you, shoulda, got, david, carr, of, third, day, to, do, it., ;, d]                    
1          [is, upset, that, he, ca, n't, update, his, facebook, by, texting, it, ..., and, might, cry, as, a, result, school, today, also., blah, !]
2          [at_symbol, i, dived, many, times, for, the, ball., managed, to, save, 50, %, the, rest, go, out, of, bounds]                             
3          [my, whole, body, feels, itchy, and, like, its, on, fire]                                                                                 
4          [at_symbol, no, ,, it, 's, not, behaving, at, all., i, 'm, mad., why, am, i, here, ?, because, i, ca, n't, see, you, all, over, there, .] 
                                                                             ...                                                                     
1599995    [just, woke, up., having, no, school, is, the, best, feeling, ever]                      

After tokenizing, we use the WordNetLemmatizer in order to make our vocabulary more robust. We choose not to choose a stemmer, because stemmer is quite more crude with respect to words. Let's see if this works!

In [5]:
%%time
from nltk.stem import WordNetLemmatizer

def lemmatize(df, text_column):
    
    lemmatizer = WordNetLemmatizer()
    for row in df[text_column]:
        i = 0
        n = len(row)
        while i < n:
            row[i] = lemmatizer.lemmatize(row[i])
            i += 1
    return None

lemmatize(df, 'tweet')
print(WordNetLemmatizer().lemmatize('behaving'))
df['tweet']

behaving
CPU times: user 1min 19s, sys: 59.9 ms, total: 1min 19s
Wall time: 1min 20s


0          [at_symbol, http, -, awww, ,, that, 's, a, bummer., you, shoulda, got, david, carr, of, third, day, to, do, it., ;, d]                   
1          [is, upset, that, he, ca, n't, update, his, facebook, by, texting, it, ..., and, might, cry, a, a, result, school, today, also., blah, !]
2          [at_symbol, i, dived, many, time, for, the, ball., managed, to, save, 50, %, the, rest, go, out, of, bound]                              
3          [my, whole, body, feel, itchy, and, like, it, on, fire]                                                                                  
4          [at_symbol, no, ,, it, 's, not, behaving, at, all., i, 'm, mad., why, am, i, here, ?, because, i, ca, n't, see, you, all, over, there, .]
                                                                             ...                                                                    
1599995    [just, woke, up., having, no, school, is, the, best, feeling, ever]                            

In [6]:
vocabulary = {}

for row in df['tweet']:
    for word in row:
        if word in vocabulary:
            vocabulary[word] += 1
        else:
            vocabulary[word] = 1

In [7]:
uniques = 0
between_2_and_5 = 0
for (key,value) in vocabulary.items():
    if value == 1:
        uniques += 1
    if value >=2 and value <=5:
        between_2_and_5 += 1
print("Number of unique words:", uniques)
print("Numer of words that appear between 2 and 5 times", between_2_and_5)
print("Number of words in total:", len(vocabulary))

Number of unique words: 282581
Numer of words that appear between 2 and 5 times 84705
Number of words in total: 420459


That shows that we have 282581 unique words, and 84705 words that appear between 2 and 5 times in our vocabulary. For convenience and faster convergence, we will delete these words (More precicely, the words between 2 and 4) and replace them with UNKNOWN.

In [8]:
unknown_words = set({})

values_to_delete = set({1,2,3,4})
for (key, value) in vocabulary.items():
    if value in values_to_delete:
        unknown_words.add(key)
        
vocabulary['UNKNOWN'] = len(unknown_words)
print(len(vocabulary), len(unknown_words))

420460 359507


In [9]:
def replace_with_unknown(df, text_column, vocabulary):
    '''Takes df and text_column to access df[text_column].
    Then parses each row, and replaces every word that has count = 1 
    with the word UNKNOWN. (Of course we picked capitals because unknown
    may belong in the vocabulary)'''
    
    values_to_delete = set({1,2,3,4})
    for row in df[text_column]:
        i = 0
        n = len(row)
        while i < n:
            if vocabulary[row[i]] in values_to_delete:
                row[i] = 'UNKNOWN'
            i += 1
    return None

replace_with_unknown(df, 'tweet', vocabulary)
df.loc[:10]

Unnamed: 0,sentiment,tweet
0,0,"[at_symbol, http, -, awww, ,, that, 's, a, bummer., you, shoulda, got, david, carr, of, third, day, to, do, it., ;, d]"
1,0,"[is, upset, that, he, ca, n't, update, his, facebook, by, texting, it, ..., and, might, cry, a, a, result, school, today, also., blah, !]"
2,0,"[at_symbol, i, UNKNOWN, many, time, for, the, ball., managed, to, save, 50, %, the, rest, go, out, of, bound]"
3,0,"[my, whole, body, feel, itchy, and, like, it, on, fire]"
4,0,"[at_symbol, no, ,, it, 's, not, behaving, at, all., i, 'm, mad., why, am, i, here, ?, because, i, ca, n't, see, you, all, over, there, .]"
5,0,"[at_symbol, not, the, whole, crew]"
6,0,"[need, a, hug]"
7,0,"[at_symbol, hey, long, time, no, see, !, yes.., rain, a, bit, ,, only, a, bit, lol, ,, i, 'm, fine, thanks, ,, how, 's, you, ?]"
8,0,"[at_symbol, nope, they, did, n't, have, it]"
9,0,"[at_symbol, que, me, UNKNOWN, ?]"


Essentially in the above step we replaced each word that occured 1-4 times with the word UNKNOWN. The reason is twofold

1) We get rid of 359507 words. This of course will give us a boost in the computing performance

2) A word that appears only 1-4 on the text doesn't play any role in the training process of the XGBoost classifier (Or of any classifier for that reason). The classifier cannot understand whether the context is positive and secondly even if it understands (say this word is found on the training set), then on the test set it will possibly not play any role. Moreover, if we generalise the classifier, this unique word has a huge possibility of appearing minimal amount of time. 

Therefore, it makes sense to delete them.

In [10]:
df['tweet'] = df['tweet'].str.join(' ')
df['tweet'][:1]

0    at_symbol http - awww , that 's a bummer. you shoulda got david carr of third day to do it. ; d
Name: tweet, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

labels = df['sentiment'].to_numpy()/4; del df['sentiment']
X = df['tweet'].to_numpy()


vectorizer = CountVectorizer(stop_words = stopwords.words('english'))
X_vectorized = vectorizer.fit_transform(X)
X_vectorized

<1600000x46683 sparse matrix of type '<class 'numpy.int64'>'
	with 11726114 stored elements in Compressed Sparse Row format>

In [12]:
%%time
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import joblib
from sklearn.metrics import accuracy_score


X_train, X_test, y_train, y_test = train_test_split(X_vectorized, labels, train_size = 0.75)

model = xgb.XGBClassifier()

parameters = {"max_depth":[7, 8, 9], "n_estimators":[1050, 1150]}
cv = GridSearchCV(model, parameters, scoring = 'accuracy', n_jobs=4,verbose = True, cv = 3)

with joblib.parallel_backend('threading',n_jobs = 4):
    cv.fit(X_train, y_train)

best = cv.best_estimator_
print('Best model: ', best)
print('Test score: ', accuracy_score(y_test, best.predict(X_test)))

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed: 395.4min finished


Best model:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=1150, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
Test score:  0.77467
CPU times: user 1d 57min 52s, sys: 59.1 s, total: 1d 58min 51s
Wall time: 7h 33min 21s


In [13]:
best = cv.best_estimator_
results = cv.cv_results_
cv.best_score_

0.7734325

In [14]:
pd.DataFrame(results, columns = results.keys())

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,4377.219946,4.901161,116.542374,22.916282,7,1050,"{'max_depth': 7, 'n_estimators': 1050}",0.768568,0.767745,0.767739,0.768018,0.000389,6
1,4723.354614,85.893007,171.514794,55.574327,7,1150,"{'max_depth': 7, 'n_estimators': 1150}",0.769881,0.769068,0.769212,0.769387,0.000354,5
2,4818.097902,61.821466,181.904716,83.768779,8,1050,"{'max_depth': 8, 'n_estimators': 1050}",0.770901,0.769515,0.770229,0.770215,0.000566,4
3,5417.892612,106.69869,197.58068,46.876992,8,1150,"{'max_depth': 8, 'n_estimators': 1150}",0.772178,0.770765,0.771397,0.771447,0.000578,3
4,5194.832294,59.543668,224.361218,65.686789,9,1050,"{'max_depth': 9, 'n_estimators': 1050}",0.772896,0.771602,0.772379,0.772293,0.000531,2
5,4106.084026,1029.436163,70.68542,25.201425,9,1150,"{'max_depth': 9, 'n_estimators': 1150}",0.774048,0.77272,0.773529,0.773432,0.000546,1


# Some Conclusions:

Looking at the results table we see 

1) The models with the biggest number of estimators and higher depth scored the highest. The rank is complete analogous to the complexity.

2) The model that scored highest was the most complex one. That suggests, that running an iteration with depth = 8 and more iterators or depth  9 plus more iterators AND playing with other hyperparameters, may give us some better results. Due to my limited computing capacity (laptop) I refrain for the time being from doing it. However, I save the model for later use.

In [16]:
import pickle

pickle.dump(best, open("XGBooster_9_1150.pickle.dat", "wb"))

experimental_model = pickle.load(open("XGBooster_9_1150.pickle.dat", "rb"))

accuracy_score(experimental_model.predict(X_test), y_test)

0.77467

Last step is to save the model using pickle. You can reuse the model by the above code snipet.