In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
twitter = pd.read_csv("preprocessed_without_emoji.csv")   

Wall time: 2.88 s


In [3]:
twitter[:10]

Unnamed: 0,_score,tweet_id,text,identification,emotion,Category
0,391,0x376b20,peopl post add snapchat must dehydr cuz man,train,anticipation,4
1,433,0x2d5350,brianklaa see trump danger freepress around wo...,train,sadness,8
2,376,0x1cd5b0,issa stalk tasha,train,fear,1
3,120,0x1d755c,riskshow thekevinallison thx best time tonight...,train,joy,7
4,1021,0x2c91a8,still wait suppli liscu,train,anticipation,4
5,481,0x368e95,love know gender,train,joy,7
6,827,0x249c0c,dstvngcare dstvng highlight shown actual sport...,train,sadness,8
7,631,0x359db9,ssm debat manufactur fantasi use distract igno...,train,anticipation,4
8,839,0x23b037,love suffer love valium doe noth help love doc...,train,joy,7
9,560,0x1fde89,someon tell whi feed scroll back tweet saw min...,train,anger,3


In [4]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(twitter.text, twitter.emotion,
                                                test_size=0.2, random_state = 42)

In [5]:
x_train.shape

(1161149,)

In [6]:
y_train.shape

(1161149,)

In [7]:
%%time
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

BOW = TfidfVectorizer(tokenizer=nltk.word_tokenize, stop_words='english', max_df = 0.7, max_features = 19500)

x_train = BOW.fit_transform(x_train.astype('U'))
x_test = BOW.transform(x_test.astype('U'))

term_weight = np.asarray(x_train.mean(axis=0)).ravel().tolist()
term_array = pd.DataFrame({'term': BOW.get_feature_names(), 'weight': term_weight})
term_array.sort_values(by='weight', ascending=False, inplace=True)
term_array

Wall time: 2min 16s


Unnamed: 0,term,weight
16997,thi,0.016192
9919,love,0.016149
9605,life,0.014369
16804,thank,0.013688
3976,day,0.013204
...,...,...
9086,khooni,0.000006
11699,nofalto,0.000006
161,addz,0.000006
7373,heartuch,0.000006


In [8]:
## deal with label (string -> one-hot)
import keras
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print('check label: ', label_encoder.classes_)
print('\n## Before convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

def label_encode(le, labels):
    enc = le.transform(labels)
    #return keras.utils.to_categorical(enc)
    return keras.utils.np_utils.to_categorical(enc)   #Allison/Moo said so, because of version

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_test = label_encode(label_encoder, y_test)

print('\n\n## After convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']

## Before convert
y_train[0:4]:
 382887         joy
235898     sadness
77240         fear
1427061        joy
Name: emotion, dtype: object

y_train.shape:  (1161149,)
y_test.shape:  (290288,)


## After convert
y_train[0:4]:
 [[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]]

y_train.shape:  (1161149, 8)
y_test.shape:  (290288, 8)


In [9]:
# I/O check
input_shape = x_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  19500
output_shape:  8


In [10]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer
model_input = Input(shape=(input_shape, )) 
X = model_input

# 1st hidden layer
X_W1 = Dense(units=64)(X)  # 64,32
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=64)(H1)  # 64,32
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 4
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# show model construction
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 19500)]           0         
                                                                 
 dense (Dense)               (None, 64)                1248064   
                                                                 
 re_lu (ReLU)                (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 re_lu_1 (ReLU)              (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 8)                 520       
                                                                 
 softmax (Softmax)           (None, 8)                 0     

In [11]:
x_train.sort_indices()
x_test.sort_indices()

In [12]:
from keras.callbacks import CSVLogger
import tensorflow as tf
csv_logger = CSVLogger('logs/training_log.csv')

# training setting
epochs = 10
batch_size = 64

es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max')
callbacks = [es] #Early Stopping

# training!
history = model.fit(x_train, y_train, 
                    epochs=epochs, 
                    batch_size=batch_size, 
                    callbacks=callbacks,
                    validation_data = (x_test, y_test))
print('training finish')

Epoch 1/10




Epoch 2/10
training finish


In [13]:
pred_result = model.predict(x_test, batch_size=128) #128
pred_result[:5]

array([[6.45819530e-02, 6.70043230e-02, 3.50421429e-01, 2.44890694e-02,
        2.19452947e-01, 1.38298810e-01, 9.78399962e-02, 3.79115753e-02],
       [3.67980683e-04, 2.09558737e-02, 1.64103473e-03, 5.28627075e-02,
        7.97110319e-01, 2.63718255e-02, 5.49151516e-03, 9.51987356e-02],
       [1.48562891e-02, 7.38052577e-02, 1.64714679e-01, 2.17323732e-02,
        4.49528307e-01, 9.54440683e-02, 5.11007905e-02, 1.28818303e-01],
       [1.33473016e-02, 1.59267351e-01, 1.17980704e-01, 3.53553146e-02,
        3.45056772e-01, 2.16153756e-01, 1.91497263e-02, 9.36890692e-02],
       [2.79260576e-02, 1.99798599e-01, 1.06754467e-01, 2.61157081e-02,
        3.89471978e-01, 1.23301186e-01, 3.48189883e-02, 9.18130204e-02]],
      dtype=float32)

In [14]:
pred_result = label_decode(label_encoder, pred_result)
pred_result[:5]

array(['disgust', 'joy', 'joy', 'joy', 'joy'], dtype=object)

In [15]:
from sklearn.metrics import accuracy_score

print('testing accuracy: {}'.format(round(accuracy_score(label_decode(label_encoder, y_test), pred_result), 2)))

testing accuracy: 0.54


In [16]:
#Let's take a look at the training log
training_log = pd.DataFrame()
training_log = pd.read_csv("logs/training_log.csv")
training_log

Unnamed: 0,epoch,accuracy,loss,val_accuracy,val_loss
0,0,0.594474,1.132769,0.501264,1.441656
1,1,0.596342,1.129073,0.499828,1.441165
2,2,0.598299,1.124981,0.497427,1.445517
3,3,0.599833,1.121017,0.495253,1.455589
4,4,0.60157,1.117118,0.496145,1.455027
5,5,0.603274,1.113266,0.494915,1.461066
6,6,0.604869,1.109313,0.496397,1.469685
7,7,0.606178,1.105807,0.495005,1.472489
8,8,0.607553,1.102619,0.492545,1.475671
9,9,0.60931,1.099336,0.491946,1.483884


In [17]:
twitter_test_data = pd.read_csv("twitter_test_data.csv")
twitter_test_data


Unnamed: 0,_score,tweet_id,text,identification
0,232,0x28b412,"Confident of your obedience, I write to you, k...",test
1,989,0x2de201,"""Trust is not the same as faith. A friend is s...",test
2,66,0x218443,When do you have enough ? When are you satisfi...,test
3,104,0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test
4,310,0x26289a,"In these tough times, who do YOU turn to as yo...",test
...,...,...,...,...
411967,602,0x2913b4,"""For this is the message that ye heard from th...",test
411968,598,0x2a980e,"""There is a lad here, which hath five barley l...",test
411969,827,0x316b80,When you buy the last 2 tickets remaining for ...,test
411970,368,0x29d0cb,I swear all this hard work gone pay off one da...,test


In [18]:
%%time
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import string
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def stemming(sentence):
    ps = PorterStemmer()
    words = word_tokenize(sentence)
    res = []
    for w in words:
        res.append(ps.stem(w))
    return ' '.join(res)


def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)


def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r"", string)

def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)



def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(text)

twitter_test_data["text"] = twitter_test_data.text.apply(stemming)
twitter_test_data["text"] = twitter_test_data.text.apply(remove_URL)
twitter_test_data["text"] = twitter_test_data.text.apply(remove_html)
twitter_test_data["text"] = twitter_test_data.text.apply(remove_emoji)
twitter_test_data["text"] = twitter_test_data.text.apply(remove_punct)
twitter_test_data["text"] = twitter_test_data.text.apply(remove_stopwords)

twitter_test_data


Wall time: 2min 49s


Unnamed: 0,_score,tweet_id,text,identification
0,232,0x28b412,confid obedi write know even ask philemon 121 ...,test
1,989,0x2de201,trust faith friend someon trust put faith anyo...,test
2,66,0x218443,enough satisfi goal realli money materi money ...,test
3,104,0x2939d5,god woke chase day godsplan godswork,test
4,310,0x26289a,tough time turn symbol hope,test
...,...,...,...,...
411967,602,0x2913b4,thi messag ye heard begin love one anoth john ...,test
411968,598,0x2a980e,lad hath five barley loav two small fish among...,test
411969,827,0x316b80,buy last 2 ticket remain show sell mixedfeel b...,test
411970,368,0x29d0cb,swear thi hard work gone pay one day,test


In [19]:
%%time
x_test_twitter = BOW.transform(twitter_test_data['text'])
x_test_twitter.sort_indices()

pred_result_test_data = model.predict(x_test_twitter, batch_size=64)

print('x_test.shape: ', x_test_twitter.shape)

x_test.shape:  (411972, 19500)
Wall time: 39.3 s


In [20]:
pred_result_test_data = label_decode(label_encoder, pred_result_test_data)
pred_result_test_data[:5]

array(['trust', 'anticipation', 'joy', 'anticipation', 'trust'],
      dtype=object)

In [21]:
upload_df = pd.DataFrame()
upload_df["id"] = twitter_test_data["tweet_id"]
upload_df["emotion"] = pred_result_test_data
upload_df

Unnamed: 0,id,emotion
0,0x28b412,trust
1,0x2de201,anticipation
2,0x218443,joy
3,0x2939d5,anticipation
4,0x26289a,trust
...,...,...
411967,0x2913b4,joy
411968,0x2a980e,anticipation
411969,0x316b80,sadness
411970,0x29d0cb,joy


In [24]:
upload_df.emotion.value_counts()

joy             199011
sadness          76747
anticipation     52466
disgust          42561
trust            24090
fear              8745
anger             4274
surprise          4078
Name: emotion, dtype: int64

In [23]:
upload_df.to_csv("predictions/BOWKeras3.csv",index=False)