# MultiClass Text Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import spacy

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

In [None]:
# importing data
data = pd.read_csv('/content/drive/MyDrive/Sentiment_multi_class_analysis/cleaned_hm.csv',index_col=False)
data = data.dropna()
data.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
5,27678,45,24h,I meditated last night.,I meditated last night.,True,1,leisure,leisure
24,27697,498,24h,My grandmother start to walk from the bed afte...,My grandmother start to walk from the bed afte...,True,1,affection,affection
32,27705,5732,24h,I picked my daughter up from the airport and w...,I picked my daughter up from the airport and w...,True,1,bonding,affection
42,27715,2272,24h,when i received flowers from my best friend,when i received flowers from my best friend,True,1,bonding,bonding


In [None]:
# renaming the columns
data.rename(columns={'cleaned_hm':'moment','ground_truth_category':'target'},inplace=True)
data.head(2)

Unnamed: 0,hmid,wid,reflection_period,original_hm,moment,modified,num_sentence,target,predicted_category
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
5,27678,45,24h,I meditated last night.,I meditated last night.,True,1,leisure,leisure


In [None]:
# splitting the data and looking at our target categories
x_train,x_test,y_train,y_test = train_test_split(data['moment'].to_numpy(),data['target'].to_numpy(),test_size=0.2,random_state=123)
data['target'].value_counts(),x_train.shape

(target
 affection           4810
 achievement         4276
 bonding             1750
 enjoy_the_moment    1514
 leisure             1306
 nature               252
 exercise             217
 Name: count, dtype: int64,
 (11300,))

In [None]:
# creating simple model with tfidfvectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
pipe_lr = make_pipeline(TfidfVectorizer(),LogisticRegression())
pipe_lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
pipe_lr.score(x_train,y_train)

0.891504424778761

In [None]:
pipe_lr.score(x_test,y_test)

0.8187610619469027

In [None]:
# creating a function which does preprocessing on text data and returns lemma tokens in a dataframe
from sklearn.base import TransformerMixin

def c_i_v(df):
    vectors = []
    lemma_words = []
    for text in df:
        w_v = []
        doc = nlp(text)
        for word in doc:
            if not word.is_space and not word.is_punct and not word.is_stop:
                lemma = word.lemma_
                w_v.append(lemma)
        lemma_words.append(" ".join(w_v))
    lemma_data = pd.DataFrame(data=lemma_words)
    return lemma_data

x_train_tran = c_i_v(x_train)
x_test_tran = c_i_v(x_test)

In [None]:
x_train.head()

71328                              Helping my friend move.
92466                                I WENT TO FRIEND HOME
33023    I broke a score of 80 while playing golf at a ...
27528    When I woke up this morning, my dog was nestle...
65038    We completed and have maintained our first sal...
Name: moment, dtype: object

In [None]:
x_train_tran.head()

Unnamed: 0,0
0,help friend
1,go FRIEND HOME
2,break score 80 play golf course recently start...
3,wake morning dog nestle girlfriend look peaceful
4,complete maintain saltwater tank


In [None]:
x_train_tran.squeeze()

0                                              help friend
1                                           go FRIEND HOME
2        break score 80 play golf course recently start...
3         wake morning dog nestle girlfriend look peaceful
4                         complete maintain saltwater tank
                               ...                        
11295                   son start singe fun preschool song
11296    extremely happy receive vacation approval work...
11297                         successfully bake bread able
11298                                             graduate
11299            hear Bannon get kick crotch kind like lot
Name: 0, Length: 11300, dtype: object

 **squeezing the dataframe because pipe only takes series or numpy array here because of tfidfvector (generally pipe will take a dataframe if it is not dealing with strs values)**

In [None]:
lemma_lgr = make_pipeline(TfidfVectorizer(),LogisticRegression())
lemma_lgr.fit(x_train_tran.squeeze(),y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# score of model which has trained on preprocessed text data.
lemma_lgr.score(x_train_tran.squeeze(),y_train)

0.8893805309734514

In [None]:
# score of model which has trained on preprocessed text data.
lemma_lgr.score(x_test_tran.squeeze(),y_test)

0.8148672566371682

#  Creating Model of Feedforward neural network with an embedding layer (Sentiment Analysis on twitter data)



# In sentiment analysis target values must be in 'int'

In [2]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
import zipfile
zip_folder = zipfile.ZipFile("nlp_getting_started.zip","r")
zip_folder.extractall()
zip_folder.close()

--2024-04-15 13:25:51--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.211.207, 173.194.212.207, 173.194.213.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.211.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-04-15 13:25:51 (48.6 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [3]:
# importing data
train_df = pd.read_csv('/content/train.csv',index_col=False)
train_df = train_df.dropna()
train_df.head(3)

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1


In [4]:
test_df = pd.read_csv('/content/test.csv',index_col=False)
test_df = test_df.dropna()
test_df.head(2)

Unnamed: 0,id,keyword,location,text
15,46,ablaze,London,Birmingham Wholesale Market is ablaze BBC News...
16,47,ablaze,Niall's place | SAF 12 SQUAD |,@sunkxssedharry will you wear shorts for race ...


In [5]:
train_sents,val_sents,train_label,val_label = train_test_split(train_df['text'].to_numpy(),train_df['target'].to_numpy(),test_size=0.2,random_state=123)

In [6]:
# Find average number of tokens (words) in training Tweets
max_length = round(sum([len(i.split()) for i in train_sents])/len(train_sents))
max_length

15

In [None]:
# Setup text vectorization with custom variables
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
text_vectorizer = TextVectorization(max_tokens = 10000,
                                    output_mode = 'int',
                                    output_sequence_length = max_length)
text_vectorizer.adapt(train_sents)

In [None]:
# Creating an Embedding using an Embedding Layer
embedding = layers.Embedding(input_dim = 10000,
                             output_dim = 128,
                             embeddings_initializer = 'uniform',
                            input_length = max_length)

In [None]:
# Setup text vectorization with custom variables
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
text_vectorizer = TextVectorization(max_tokens = 10000,
                                    output_mode = 'int',
                                    output_sequence_length = max_length)
text_vectorizer.adapt(train_sents)

# Creating an Embedding using an Embedding Layer
embedding = layers.Embedding(input_dim = 10000,
                             output_dim = 128,
                             embeddings_initializer = 'uniform',
                            input_length = max_length)

# creating simple feedforwarding model noot "The RNN" model
from tensorflow import keras
inputs = layers.Input(shape = (1,), dtype = 'string')
x = text_vectorizer(inputs)
x = embedding(x)
# x = layers.GlobalAveragePooling1D()(x)
x = layers.SimpleRNN(64)(x)
outputs = layers.Dense(1,activation = 'sigmoid')(x)
model = keras.Model(inputs,outputs)

# compiling the model
model.compile(loss = 'binary_crossentropy',
              optimizer = keras.optimizers.Adam(),
              metrics = ['accuracy'])

# fitting the model
model_rnn = model.fit(train_sents,
          train_label,
          epochs = 5,
          validation_data = (val_sents,val_label))

In [None]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_2 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                12352     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1292417 (4.93 MB)
Trainable params: 1292417 (4.93 MB)
Non-trainable params: 0 (0.00 Byte)
_______________________

In [None]:
# creating model of lstm
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
from tensorflow import keras
tf.random.set_seed(42)

txt_vect = TextVectorization(max_tokens = 10000,
                             output_mode = 'int',
                             output_sequence_length = max_length)
txt_vect.adapt(train_sents)

embedding = layers.Embedding(input_dim = 10000,
                             output_dim = 128,
                             embeddings_initializer = 'uniform',
                             input_length = max_length)

inputs = layers.Input(shape = (1,), dtype = 'string')
x = txt_vect(inputs)
x = embedding(x)
# x = layers.GlobalAveragePooling1D()(x)
x = layers.LSTM(64)(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x)
lstm_model = keras.Model(inputs,outputs)

lstm_model.compile(loss = 'binary_crossentropy',
                  optimizer = tf.keras.optimizers.Adam(),
                  metrics = ['accuracy'])

lstm_model.fit(train_sents,
               train_label,
               validation_data = (val_sents,val_label),
               epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7847acf69e70>

# model2 Using united sentence encoder as embedding and simple layers of neural networks.