In [1]:
import numpy as np
import pandas as pd
import scipy
import re
import time
import json
import boto3
import io
import warnings
warnings.filterwarnings('ignore')
import os
import redis

In [2]:
import nltk
import spacy
spacy.load('en')
from nltk.corpus import stopwords
import preprocessor as p

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import dill as pickle

In [4]:
# Create a boto3 S3 client; no explicit credentials or region are passed here.
# NOTE(review): `s3` is not referenced by any of the visible cells — confirm it is still needed.
s3 = boto3.client('s3')

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
import xgboost as xgb
from xgboost.sklearn import XGBClassifier


In [6]:
def custom_tokenizer(doc):
    """Return the lemma of every non-punctuation token in ``doc``.

    The document is parsed by the module-level ``lemmatizer`` callable, which
    must be defined before this function is called.
    """
    lemmas = []
    for token in lemmatizer(doc):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [7]:
def create_upload(object, file_name):
    """Pickle ``object`` to the path ``'<file_name>_<session>.pkl'``.

    ``session`` is read from module scope; it is not defined in the visible
    cells, so it has to be set before this function is called.
    NOTE(review): despite the name, nothing is uploaded here — the function
    only writes the pickle file.
    """
    target_path = '{}_{}.pkl'.format(file_name, session)
    with open(target_path, 'wb') as handle:
        pickle.dump(object, handle)


In [8]:

# Load the tweet CSV; `names=` supplies the six column labels used below.
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding="latin-1",
)
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
# Contraction -> expanded form. Expanding before punctuation is stripped keeps
# the negation in the text (otherwise "can't" degrades to "can").
_NEGATIONS = {"isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
              "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not",
              "wouldn't": "would not", "don't": "do not", "doesn't": "does not", "didn't": "did not",
              "can't": "can not", "couldn't": "could not", "shouldn't": "should not", "mightn't": "might not",
              "mustn't": "must not"}

# Raw strings so that \b is a regex word boundary (in a normal string literal
# '\b' is the backspace character and the pattern never matches real text).
# IGNORECASE so "Don't"/"DON'T" are expanded too; ASCII keeps the case folding
# limited to A-Z so every match lower-cases to a key of _NEGATIONS.
_NEGATION_RE = re.compile(
    r"\b(" + "|".join(re.escape(key) for key in _NEGATIONS) + r")\b",
    re.IGNORECASE | re.ASCII,
)

# @mentions, http(s) URLs and www. URLs. The dot after "www" is escaped so that
# ordinary words starting with "www" are not deleted.
_NOISE_RE = re.compile(r"@[A-Za-z0-9_]+|https?://[^ ]+|www\.[^ ]+")


def getCleanTweet(text):
    """Normalise one raw tweet into a lower-case, letters-only string.

    Steps, in order:
      1. strip HTML markup / decode entities with BeautifulSoup,
      2. drop a leading BOM and map U+FFFD to "?",
      3. remove @mentions and URLs,
      4. expand negative contractions ("can't" -> "can not"),
      5. replace every non-letter with a space and lower-case,
      6. keep only tokens longer than one character,
      7. squeeze any run of a repeated character down to two ("sooo" -> "soo").

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; may be the empty string.

    Notes
    -----
    ``BeautifulSoup``, ``WordPunctTokenizer`` and ``itertools`` are looked up
    at call time, so the cell importing them must have run before the first call.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    # get_text() yields str on Python 3; only decode if bytes are ever handed
    # back, instead of hiding the AttributeError behind a bare `except`.
    if isinstance(souped, bytes):
        souped = souped.decode("utf-8-sig")
    bom_removed = souped.lstrip(u"\ufeff").replace(u"\ufffd", "?")

    stripped = _NOISE_RE.sub('', bom_removed)
    neg_handled = _NEGATION_RE.sub(lambda match: _NEGATIONS[match.group().lower()], stripped)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled).lower()

    tok = WordPunctTokenizer()
    cleaned = " ".join(word for word in tok.tokenize(letters_only) if len(word) > 1)
    # Collapse runs of the same character to at most two occurrences.
    return ''.join(
        char + char if sum(1 for _ in run) > 1 else char
        for char, run in itertools.groupby(cleaned)
    )

In [10]:
import itertools
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
import re

In [11]:
# Clean every raw tweet into a new `clean_text` column, then drop rows with
# missing values, renumber the index and print a summary of the frame.
# NOTE(review): the later visible cells build X from 'text', not 'clean_text'.
df['clean_text'] = list(map(getCleanTweet, df['text']))
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
target        1600000 non-null int64
id            1600000 non-null int64
date          1600000 non-null object
flag          1600000 non-null object
user          1600000 non-null object
text          1600000 non-null object
clean_text    1600000 non-null object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [12]:
# Down Sample: keep a 10% slice (`tweets_subsampled_2`) for the quick baseline run.
# random_state is pinned so the same rows are drawn on every Restart & Run All.
tweets_subsampled_1, tweets_subsampled_2 = train_test_split(df, test_size=0.1, random_state=10)


In [13]:
# Split between outcome and Features
# NOTE(review): X is taken from the raw 'text' column rather than the
# 'clean_text' column computed earlier — confirm this is intentional.
y = tweets_subsampled_2['target']
X = tweets_subsampled_2['text']

In [14]:
# Start the wall-clock timer that the "Transform Data" print reads later.
start_time = time.time()
# Create lemmatizer using spacy
# `custom_tokenizer` calls this module-level object on every document.
lemmatizer = spacy.lang.en.English()


In [15]:
# TF-IDF over lemmatised tokens followed by a 500-component truncated SVD.
# The SVD solver is randomised, so its random_state is pinned to make the
# projected features reproducible between runs.
pipe = Pipeline(steps=[
    ('vectidf', TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                                lowercase=True, use_idf=True, max_df=0.5,
                                min_df=2, norm='l2', smooth_idf=True)),
    ('svd', TruncatedSVD(n_components=500, random_state=10)),
    #('norm',Normalizer(copy=False))
])

In [16]:
# Fit the TF-IDF + SVD pipeline on the sampled tweets and project them in one call.
tweets_transform = pipe.fit_transform(X)


In [17]:
# Elapsed time since `start_time` was set (covers lemmatizer creation and the pipeline fit).
print("Transform Data - Execution time: %s seconds ---" % (time.time() - start_time))
# splitting into training and test sets (25% held out); seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(tweets_transform, y, test_size=0.25, random_state=10)

Transform Data - Execution time: 68.43557000160217 seconds ---


In [18]:
# Base Model Build
# Silence library warnings again and restart the timer for the baseline fit.
warnings.filterwarnings('ignore')
start_time = time.time()

In [19]:
# Baseline classifier: library defaults apart from a fixed random_state.
xgb_model = XGBClassifier(random_state=10)


In [20]:
# Single-point grid: the search only cross-validates the baseline with n_jobs=-1.
parameters = dict(n_jobs=[-1])

In [21]:
# 3-fold cross-validated search over `parameters`; the search itself runs in a
# single process (n_jobs=1).
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    cv=3,
    n_jobs=1,
    verbose=0,
)
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constrai...
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=10, reg_alpha=None,
                                     reg_lambda=None, sc

In [22]:
# Report the baseline: seconds elapsed since the timer was reset, the best mean
# cross-validation score found by the grid search, and the winning parameters.
print("Base Line Model - Execution time: %s seconds ---" % (time.time() - start_time))
print("Base Line Model - CV Score: " + str(clf.best_score_))
print("Best Params: " + str(clf.best_params_))

Base Line Model - Execution time: 1113.5749802589417 seconds ---
Base Line Model - CV Score: 0.7127916666666667
Best Params: {'n_jobs': -1}


In [23]:
# Silence warnings and restart the timer before the full-size run.
warnings.filterwarnings('ignore')
start_time = time.time()
print("Starting full model training...")

Starting full model training...


In [24]:
# Full run: work on a 50% sample of the data. random_state is pinned so the
# same rows are selected on every run.
tweets_subsampled_1, tweets_subsampled_2 = train_test_split(df, test_size=0.5, random_state=10)

In [25]:
# Outcome and features for the full run.
# NOTE(review): X again uses the raw 'text' column rather than 'clean_text' —
# confirm this is intentional.
y = tweets_subsampled_2['target']
X = tweets_subsampled_2['text']

In [26]:
# Transform Data: TF-IDF over unigrams and bigrams of the lemmatised tokens.
print("Starting vectorization...")
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)
tweets_tfidf = vectorizer.fit_transform(X)
# Column count of the sparse matrix == size of the learned vocabulary.
print("Vectorizing Finished. Number of features: %d" % tweets_tfidf.shape[1])


Starting vectorization...
Vectorizing Finished. Number of features: 565186


In [27]:
#tweets_tfidf

In [28]:
# NOTE(review): the SVD step that this message announces is commented out in the
# visible cells; the model below is trained on the TF-IDF matrix directly.
print("Starting Dimension Reduction...")

Starting Dimension Reduction...


In [30]:
#pipe = Pipeline(steps=[('svd', TruncatedSVD(10000)),
                       #('norm', Normalizer(copy=False))
                      # ])
#tweets_transform = pipe.fit_transform(tweets_tfidf)

In [31]:
print('Start model training...')
# Restart the timer so the later "Train" print covers fitting and scoring only.
start_time = time.time()
# Hold out 30% of the TF-IDF rows for testing; seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(tweets_tfidf, y, test_size=0.3, random_state=10)

Start model training...


In [32]:
# Full model: many shallow trees with a small learning rate.
xgb_model = XGBClassifier(
    # boosting schedule
    n_estimators=5000,
    learning_rate=0.01,
    # tree shape / split constraints
    max_depth=5,
    min_child_weight=5,
    gamma=0.1,
    # row and column sub-sampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # class weighting, reproducibility, parallelism
    scale_pos_weight=1,
    random_state=10,
    n_jobs=-1,
)

In [33]:
# Train the full model on the training split of the TF-IDF matrix.
# Long-running cell: the recorded run reports ~8.5k seconds for fit plus scoring.
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.1, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=nan, monotone_constraints=None,
              n_estimators=5000, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=10, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.8, tree_method=None,
              validate_parameters=False, verbosity=None)

In [34]:
# Score the fitted model on the held-out split, then report the time elapsed
# since the timer was reset before training (so it includes the scoring call above).
print("Test Set Score: " + str(xgb_model.score(X_test, y_test)))
print("Train - Execution time: %s seconds ---" % (time.time() - start_time))

Test Set Score: 0.7331083333333334
Train - Execution time: 8509.643510341644 seconds ---


In [48]:
from sklearn.metrics import confusion_matrix, classification_report
from keras.models import Model
from keras.layers import Input, Dense
from keras.models import Sequential

# xgb_model.predict already returns one class label per test row, and y_test is
# a 1-D label vector. Applying np.argmax to either would collapse it to a single
# row index, so both are kept as label arrays of equal length.
pred = xgb_model.predict(X_test)
y_true = np.asarray(y_test)

In [52]:

# Predicted class labels for the held-out rows.
y_pred = xgb_model.predict(X_test)

# Confusion matrix of true labels (first argument) against predictions.
print(confusion_matrix(y_test, y_pred))

[[76030 44399]
 [19655 99916]]


In [57]:
# Optional confusion-matrix plot (disabled; requires matplotlib and mlxtend):
#   CM = confusion_matrix(y_test, y_pred)
#   import matplotlib.pyplot as plt
#   from mlxtend.plotting import plot_confusion_matrix
#   fig, ax = plot_confusion_matrix(conf_mat=CM, figsize=(5, 5))
#   plt.show()

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.63      0.70    120429
           4       0.69      0.84      0.76    119571

    accuracy                           0.73    240000
   macro avg       0.74      0.73      0.73    240000
weighted avg       0.74      0.73      0.73    240000



0.7331083333333334