## load data and preprocess

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json

In [5]:
dataset_dir = 'dataset/'

loading data

In [6]:
all_text_df = pd.read_json(dataset_dir + 'tweets_DM.json', lines=True)
all_text_df

Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets
...,...,...,...,...,...
1867530,827,hashtag_tweets,"{'tweet': {'hashtags': ['mixedfeeling', 'butim...",2015-05-12 12:51:52,tweets
1867531,368,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x29d0...",2017-10-02 17:54:04,tweets
1867532,498,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2a6a...",2016-10-10 11:04:32,tweets
1867533,840,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x24fa...",2016-09-02 14:25:06,tweets


In [7]:
del all_text_df['_index']
del all_text_df['_type']

In [8]:
def scraptext(source):
    return source['tweet']['text']

def scrapid(source):
    return source['tweet']['tweet_id']

all_text_df['text'] = all_text_df['_source'].apply(scraptext)
all_text_df['id'] = all_text_df['_source'].apply(scrapid)
del all_text_df['_source']

loading the train/test split

In [9]:
split = pd.read_csv(dataset_dir + 'data_identification.csv').set_index('tweet_id')['identification'].to_dict()
split['0x28cc61']

'test'

In [10]:
all_text_df['split'] = all_text_df['id'].map(split)
all_text_df

Unnamed: 0,_score,_crawldate,text,id,split
0,391,2015-05-23 11:42:47,"People who post ""add me on #Snapchat"" must be ...",0x376b20,train
1,433,2016-01-28 04:52:09,"@brianklaas As we see, Trump is dangerous to #...",0x2d5350,train
2,232,2017-12-25 04:39:20,"Confident of your obedience, I write to you, k...",0x28b412,test
3,376,2016-01-24 23:53:05,Now ISSA is stalking Tasha 😂😂😂 <LH>,0x1cd5b0,train
4,989,2016-01-08 17:18:59,"""Trust is not the same as faith. A friend is s...",0x2de201,test
...,...,...,...,...,...
1867530,827,2015-05-12 12:51:52,When you buy the last 2 tickets remaining for ...,0x316b80,test
1867531,368,2017-10-02 17:54:04,I swear all this hard work gone pay off one da...,0x29d0cb,test
1867532,498,2016-10-10 11:04:32,@Parcel2Go no card left when I wasn't in so I ...,0x2a6a4f,test
1867533,840,2016-09-02 14:25:06,"Ah, corporate life, where you can date <LH> us...",0x24faed,train


In [11]:
train_df = all_text_df[all_text_df['split'] == 'train']
test_df = all_text_df[all_text_df['split'] == 'test']
del all_text_df
del train_df['split']
del test_df['split']
print(f'training samples: {len(train_df)}')
print(f'testing samples: {len(test_df)}')

training samples: 1455563
testing samples: 411972


then we append the label to the train df

In [12]:
labels = pd.read_csv(dataset_dir + 'emotion.csv').set_index('tweet_id')['emotion'].to_dict()
train_df['emotion'] = train_df['id'].map(labels)
train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['emotion'] = train_df['id'].map(labels)


Unnamed: 0,_score,_crawldate,text,id,emotion
0,391,2015-05-23 11:42:47,"People who post ""add me on #Snapchat"" must be ...",0x376b20,anticipation
1,433,2016-01-28 04:52:09,"@brianklaas As we see, Trump is dangerous to #...",0x2d5350,sadness
3,376,2016-01-24 23:53:05,Now ISSA is stalking Tasha 😂😂😂 <LH>,0x1cd5b0,fear
5,120,2015-06-11 04:44:05,@RISKshow @TheKevinAllison Thx for the BEST TI...,0x1d755c,joy
6,1021,2015-08-18 02:30:07,Still waiting on those supplies Liscus. <LH>,0x2c91a8,anticipation
...,...,...,...,...,...
1867526,94,2016-12-26 02:44:07,I'm SO HAPPY!!! #NoWonder the name of this sho...,0x321566,joy
1867527,627,2015-04-01 08:14:56,In every circumtance I'd like to be thankful t...,0x38959e,joy
1867528,274,2016-11-17 23:46:22,there's currently two girls walking around the...,0x2cbca6,joy
1867533,840,2016-09-02 14:25:06,"Ah, corporate life, where you can date <LH> us...",0x24faed,joy


## classification

w2v + lstm

In [20]:
import ollama

# we use llama3.2:1b
def generate_embeddings(row, text_column_name='text'):
    embeddings = ollama.embeddings(
        model='llama3.2:1b',
        prompt=row[text_column_name],
    )
    return embeddings["embedding"]

In [21]:
# We use the text column
column_name = 'text'

# Apply the function to the specified column and store the result in a new column 'embeddings'
train_df['embeddings'] = train_df.apply(lambda row: generate_embeddings(row, column_name), axis=1)
test_df['embeddings'] = test_df.apply(lambda row: generate_embeddings(row, column_name), axis=1)

KeyboardInterrupt: 

In [None]:
from sklearn.preprocessing import LabelEncoder

y_train = train_df['emotion']

le = LabelEncoder()
le.fit(y_train)

X_llama_train = np.vstack(df_train["embeddings"].to_numpy())
X_llama_test = np.vstack(df_test["embeddings"].to_numpy())
y_train_llama = label_encode(le, y_train)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

model_input = Input(shape=(len(X_llama_train[0]), ))
X = model_input

# 1st hidden layer
X_W1 = Dense(units=512)(X)  # 64
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=64)(H1)  # 64
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 4
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# show model construction
model.summary() #We can notice that our parameter numbers went up because of the increase in the dimension of our input

In [None]:
from keras.callbacks import CSVLogger

csv_logger = CSVLogger('training_log.csv')

# training setting
epochs = 25
batch_size = 32

# training!
history = model.fit(X_llama_train, y_train_llama, 
                    epochs=epochs, 
                    batch_size=batch_size, 
                    callbacks=[csv_logger],
                    validation_data = (X_llama_test, y_test_llama))
print('training finish')

In [None]:
pred_result = model.predict(X_llama_test, batch_size=128)
pred_result[:5]