# Classification Models for Tweets

### Importing Libraries

In [2]:
import numpy as np
import _pickle as pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import GridSearchCV

### Loading in the Data

In [4]:
# NOTE: pickle can execute arbitrary code while loading; only open pickle files from a trusted source.

# Main DataFrame (provides the City label column used as the target later on)
with open("main_df.pkl", "rb") as df_file:
    main_df = pickle.load(df_file)

# Cleaned tweet data (used as the model features later on)
with open("clean_tweets.pkl", "rb") as tweets_file:
    data = pickle.load(tweets_file)

## Preparing the Data
### Train/Test Split

In [5]:
# Features are the cleaned tweets; the target is the City column of the main DataFrame.
X, y = data, main_df.City

# Hold out 20% of the tweets for testing. A fixed random_state makes the split
# (and therefore every score reported below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training tweets only; the test tweets are
# transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)

## Classifier Models
* Dummy Classifier - baseline
* Naive Bayes
* Random Forest
* Logistic Regression

### Dummy Classifier - Baseline Model

In [7]:
from sklearn.dummy import DummyClassifier

# Baseline model: its accuracy is the reference point for the real classifiers below.
dm_class = DummyClassifier()
dm_class.fit(tf_idf_data_train, y_train)

# Accuracy on the training split
dm_train_preds = dm_class.predict(tf_idf_data_train)
# Accuracy on the held-out test split
dm_test_preds = dm_class.predict(tf_idf_data_test)

dm_train_score, dm_test_score = (
    accuracy_score(y_train, dm_train_preds),
    accuracy_score(y_test, dm_test_preds),
)

print('Dummy Classifier')
print(f"Training Accuracy: {dm_train_score} \t\t Testing Accuracy: {dm_test_score}")

Dummy Classifier
Training Accuracy: 0.49650262303272547 		 Testing Accuracy: 0.4905094905094905


### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, trained on the TF-IDF features.
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(tf_idf_data_train, y_train)

# Predict on both splits so the train/test accuracy gap is visible.
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

rf_train_score, rf_test_score = (
    accuracy_score(y_train, rf_train_preds),
    accuracy_score(y_test, rf_test_preds),
)

print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Random Forest
Training Accuracy: 0.9728 		 Testing Accuracy: 0.6169


#### Random Forest with Grid Search

In [9]:
# Hyperparameter values the random forest grid search will try
# (4 * 2 * 4 * 2 * 3 = 192 combinations).
rf_param_grid = dict(
    n_estimators=[10, 30, 60, 100],
    criterion=['gini', 'entropy'],
    max_depth=[None, 2, 5, 10],
    min_samples_split=[5, 10],
    min_samples_leaf=[1, 2, 5],
)

In [None]:
# Exhaustive search over rf_param_grid with 3-fold cross-validation.
# Training scores are recorded as well, and verbose=2 logs progress per fit.
rf_grid_search = GridSearchCV(
    rf_classifier,
    rf_param_grid,
    cv=3,
    return_train_score=True,
    verbose=2,
)
rf_grid_search.fit(tf_idf_data_train, y_train)

In [None]:
# Report the outcome of the random forest grid search fitted in the previous cell
# instead of re-running the same expensive search a second time.
# best_score_ is the mean cross-validated score of the best parameter combination.
print(f"Mean Cross-Validated Accuracy: {rf_grid_search.best_score_*100}")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

In [None]:
# Random forest with explicitly chosen hyperparameters
# (presumably the best combination found by the grid search -- confirm against rf_grid_search.best_params_).
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=10,
)
rf_classifier.fit(tf_idf_data_train, y_train)

# Predict on both splits so the train/test accuracy gap is visible.
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

rf_train_score, rf_test_score = (
    accuracy_score(y_train, rf_train_preds),
    accuracy_score(y_test, rf_test_preds),
)

print('Random Forest with GridSearch')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

### Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
nb_classifier = MultinomialNB()
nb_classifier.fit(tf_idf_data_train, y_train)

# Predict on both splits so the train/test accuracy gap is visible.
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

nb_train_score, nb_test_score = (
    accuracy_score(y_train, nb_train_preds),
    accuracy_score(y_test, nb_test_preds),
)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.823 		 Testing Accuracy: 0.6049


### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF features.
lr_class = LogisticRegression()
lr_class.fit(tf_idf_data_train, y_train)

# Predict on both splits so the train/test accuracy gap is visible.
lr_train_preds = lr_class.predict(tf_idf_data_train)
lr_test_preds = lr_class.predict(tf_idf_data_test)

lr_train_score, lr_test_score = (
    accuracy_score(y_train, lr_train_preds),
    accuracy_score(y_test, lr_test_preds),
)

print('Logistic Regression')
print(f"Training Accuracy: {lr_train_score} \t\t Testing Accuracy: {lr_test_score}")

Logistic Regression
Training Accuracy: 0.8202598051461404 		 Testing Accuracy: 0.6123876123876124




#### Logistic Regression with Grid Search

In [12]:
# Hyperparameter values the logistic regression grid search will try.
# C takes ten values, 1.5**0, 1.5**2, ..., 1.5**18.
log_param_grid = dict(
    C=[1.5**n for n in range(0, 20, 2)],
    fit_intercept=[True, False],
    intercept_scaling=[1, 5, 10, 25, 50, 100],
    solver=['liblinear', 'saga'],
)

In [None]:
# Exhaustive search over log_param_grid with 3-fold cross-validation.
# Training scores are recorded as well, and verbose=3 logs progress per fit.
log_grid_search = GridSearchCV(
    lr_class,
    log_param_grid,
    cv=3,
    return_train_score=True,
    verbose=3,
)
log_grid_search.fit(tf_idf_data_train, y_train)

In [None]:
# best_score_ is the mean cross-validated score of the best parameter combination,
# computed on folds of the training data -- it is not an accuracy on the held-out test set.
print(f"Mean Cross-Validated Accuracy: {log_grid_search.best_score_*100}")
print(f"Optimal Parameters: {log_grid_search.best_params_}")

In [None]:
# The grid search was created with the default refit=True, so best_estimator_ has
# already been refitted on the full training data with the best parameters.
# Calling fit() on it again would only repeat that work.
lr_class = log_grid_search.best_estimator_

# Predict on both splits so the train/test accuracy gap is visible.
lr_train_preds = lr_class.predict(tf_idf_data_train)
lr_test_preds = lr_class.predict(tf_idf_data_test)

lr_train_score = accuracy_score(y_train, lr_train_preds)
lr_test_score = accuracy_score(y_test, lr_test_preds)

print('Logistic Regression with GridSearch')
print(f"Training Accuracy: {lr_train_score} \t\t Testing Accuracy: {lr_test_score}")

## Deep Learning and Keras

In [13]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

Using TensorFlow backend.


In [15]:
# One-hot encode the City labels for the Keras model.
# Encode from the source column rather than from `y` itself, so that re-running
# this cell does not try to one-hot encode an already encoded array.
y = pd.get_dummies(main_df.City).values

In [None]:
# Disabled earlier variant (kept for reference): tokenizes main_df.tweet with a 20000-word vocabulary.
# tokenizer = text.Tokenizer(num_words=20000)
# tokenizer.fit_on_texts(list(main_df.tweet))
# list_tokenized_headlines = tokenizer.texts_to_sequences(main_df.tweet)
# X_t = sequence.pad_sequences(list_tokenized_headlines, maxlen=100)

In [33]:
# Build the word index from the cleaned tweets, capping the vocabulary at num_words=1000.
tokenizer = text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X)

# Convert each tweet to a sequence of word indices, then pad/truncate every sequence to length 100.
list_tokenized_tweets = tokenizer.texts_to_sequences(X)
X_t = pad_sequences(list_tokenized_tweets, maxlen=100)

In [40]:
embedding_size = 500

# Each sample is a sequence of 100 token ids (the maxlen used when padding the tweets).
input_ = Input(shape=(100,))
# Vocabulary size of 1000 matches the num_words given to the tokenizer.
x = Embedding(1000, embedding_size)(input_)
# return_sequences=True keeps the LSTM output at every time step: (None, 100, 50).
x = LSTM(50, return_sequences=True)(x)
# Max over the time dimension reduces the sequence output to (None, 50).
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
# There are 2 mutually exclusive classes (one-hot targets), so the output layer has
# 2 neurons with softmax, which makes the two outputs a probability distribution.
# Independent sigmoids would let both classes be predicted (or rejected) at once.
x = Dense(2, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [41]:
# The targets are one-hot vectors over mutually exclusive classes, so use categorical
# cross-entropy. With this loss Keras also resolves 'accuracy' to categorical accuracy
# (argmax match per sample) rather than a per-output binary accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 100, 500)          500000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100, 50)           110200    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 50)                0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_10 (Dropout)         (None, 50)                0   

In [43]:
# Train for 10 epochs in batches of 2000 samples; validation_split=0.2 holds out
# 20% of the samples for validation (8006 train / 2002 validation in the recorded run).
model.fit(X_t, y, epochs=10, batch_size=2000, validation_split=0.2)

Train on 8006 samples, validate on 2002 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a3d9eeac8>