## Applying Conditional Random Fields for POS tagging

### Notebook content

1. Reading the data
2. Creating POS using NLTK library (labelled dataset)
3. Preparing the data for training
4. Training the Conditional Random Field (CRF)
5. Applying POS predictions with CRF model

### 1. Reading the data

In [14]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

import pycrfsuite
from sklearn_crfsuite import metrics

import pandas as pd
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

In [15]:
df = pd.read_csv('data/emotion_data_merged_4.csv')

# Check if every emotion category has at least 20 sentences
min_count = df['emotion'].value_counts().min()
sample_size = min(5000, min_count)

# Sample 5000 sentences for each emotion
df_sample = df.groupby('emotion').apply(lambda x: x.sample(n=sample_size, random_state=1)).reset_index(drop=True)

df_sample

  df_sample = df.groupby('emotion').apply(lambda x: x.sample(n=sample_size, random_state=1)).reset_index(drop=True)


Unnamed: 0,sentence,emotion
0,i feel that im a selfish person never give you...,anger
1,i realized as long as i feel this way i will n...,anger
2,i feel grouchy and i feel heavy,anger
3,i feel like i am slowly drowning and there is ...,anger
4,i was just having to decide on go and enjoy ti...,anger
...,...,...
34995,"Against all odds, a forgotten melody managed t...",surprise
34996,Just when the lottery ticket thought find a tr...,surprise
34997,No one could foresee that the famous actor wou...,surprise
34998,"Against all odds, the long-lost sibling manage...",surprise


### 2. Creating POS using NLTK library (labelled dataset)

In [16]:
# Function to tokenize a sentence and tag it with POS
def extract_pos(sentence):
    tokens = word_tokenize(sentence)
    return pos_tag(tokens)


In [17]:
# Apply the function to each sentence in the DataFrame
df_sample['POS'] = df_sample['sentence'].progress_apply(extract_pos)

progress-bar: 100%|██████████| 35000/35000 [00:53<00:00, 648.85it/s] 


In [18]:
df_sample

Unnamed: 0,sentence,emotion,POS
0,i feel that im a selfish person never give you...,anger,"[(i, NN), (feel, VBP), (that, IN), (im, VBZ), ..."
1,i realized as long as i feel this way i will n...,anger,"[(i, RB), (realized, VBN), (as, RB), (long, RB..."
2,i feel grouchy and i feel heavy,anger,"[(i, NN), (feel, VBP), (grouchy, NN), (and, CC..."
3,i feel like i am slowly drowning and there is ...,anger,"[(i, JJ), (feel, VBP), (like, IN), (i, NN), (a..."
4,i was just having to decide on go and enjoy ti...,anger,"[(i, NN), (was, VBD), (just, RB), (having, VBG..."
...,...,...,...
34995,"Against all odds, a forgotten melody managed t...",surprise,"[(Against, IN), (all, DT), (odds, NNS), (,, ,)..."
34996,Just when the lottery ticket thought find a tr...,surprise,"[(Just, RB), (when, WRB), (the, DT), (lottery,..."
34997,No one could foresee that the famous actor wou...,surprise,"[(No, DT), (one, NN), (could, MD), (foresee, V..."
34998,"Against all odds, the long-lost sibling manage...",surprise,"[(Against, IN), (all, DT), (odds, NNS), (,, ,)..."


In [19]:
pos_lists = df_sample['POS'].tolist()

print(pos_lists[0])

[('i', 'NN'), ('feel', 'VBP'), ('that', 'IN'), ('im', 'VBZ'), ('a', 'DT'), ('selfish', 'JJ'), ('person', 'NN'), ('never', 'RB'), ('give', 'VBP'), ('you', 'PRP'), ('space', 'NN'), ('at', 'IN'), ('all', 'DT')]


### 3. Preparing the data for training

In [20]:
# Define a function to extract features for each word in a sentence
def word_features(sentence, i):
	word = sentence[i][0]
	features = {

		'word': word,
		'is_first': i == 0, # if the word is a first word
		'is_last': i == len(sentence) - 1, # if the word is a last word
		'is_capitalized': word[0].upper() == word[0],
		'is_all_caps': word.upper() == word,	 # word is in uppercase
		'is_all_lower': word.lower() == word,	 # word is in lowercase

		# Prefix of the word
		'prefix-1': word[0], 
		'prefix-2': word[:2],
		'prefix-3': word[:3],

		# Suffix of the word
		'suffix-1': word[-1],
		'suffix-2': word[-2:],
		'suffix-3': word[-3:],

		# Extracting previous word
		'prev_word': '' if i == 0 else sentence[i-1][0],

		# Extracting next word
		'next_word': '' if i == len(sentence)-1 else sentence[i+1][0],
		'has_hyphen': '-' in word, # if word has hypen
		'is_numeric': word.isdigit(), # if word is in numeric
		'capitals_inside': word[1:].lower() != word[1:]
	}

	return features


In [21]:
# Empty lists to store the X and y values
X = []
y = []

In [22]:
# Loop through each sentence and extract features
for sentence in pos_lists:
	X_sentence = []
	y_sentence = []
	for i in range(len(sentence)):
		X_sentence.append(word_features(sentence, i))
		y_sentence.append(sentence[i][1])
	X.append(X_sentence)
	y.append(y_sentence)

In [23]:
# Split the data into training and testing sets
split = int(0.8 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]


### 4. Training the Conditional Random Field (CRF)

In [24]:
# Train a CRF model suing pysrfsuite
trainer = pycrfsuite.Trainer(verbose=False)
for x, y in zip(X_train, y_train):
	trainer.append(x, y)
trainer.set_params({
	'c1': 1.0,
	'c2': 1e-3,
	'max_iterations': 50,
	'feature.possible_transitions': True
})

trainer.train('pos.crfsuite')

In [25]:
tagger = pycrfsuite.Tagger()
tagger.open('pos.crfsuite')

# Predicting the tags for each sentence in the test set
y_pred = []
for xseq in X_test:
    y_pred.append(tagger.tag(xseq))

# You can calculate overall accuracy like this:
accuracy = metrics.flat_accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9384892059488542


In [28]:
# Tag a new sentence
sentence = 'You are a beautiful person'.split()
features = [word_features(sentence, i) for i in range(len(sentence))]
tags = tagger.tag(features)
print(list(zip(sentence, tags)))

[('You', 'CC'), ('are', 'DT'), ('a', 'DT'), ('beautiful', 'NN'), ('person', 'NN')]


### 5. Applying the CRF model POS to my original dataset

In [29]:
# Function to preprocess and predict POS for a single sentence
def predict_pos(sentence):
    # Tokenize the sentence - you might need nltk.word_tokenize or a similar function
    tokens = sentence.split()  # Adjust this if you have a more complex tokenization step
    
    # Extract features for each token in the sentence
    features = [word_features(tokens, i) for i in range(len(tokens))]
    
    # Use the CRF model to predict POS tags
    tags = tagger.tag(features)
    
    # Return the list of (token, POS) tuples
    return list(zip(tokens, tags))

In [33]:
df.dropna(inplace=True)

df.drop(df[df['emotion'] == 'neutral'].index, inplace=True)

df

Unnamed: 0,sentence,emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,happiness
4,i was feeling a little vain when i did this one,sadness
...,...,...
662394,Witnessing the destruction of a natural habita...,disgust
662395,The vulgar display of wealth amidst poverty wa...,disgust
662396,The disregard for personal space and boundarie...,disgust
662397,Their manipulation of others' emotions for per...,disgust


In [34]:
# Apply the function to the 'sentence' column of df_sample
df['POS_crf'] = df['sentence'].progress_apply(predict_pos)

progress-bar: 100%|██████████| 479140/479140 [01:14<00:00, 6414.92it/s]


In [35]:
df

Unnamed: 0,sentence,emotion,POS_crf
0,im feeling rather rotten so im not very ambiti...,sadness,"[(im, NN), (feeling, VBP), (rather, NN), (rott..."
1,im updating my blog because i feel shitty,sadness,"[(im, NN), (updating, VBP), (my, JJ), (blog, N..."
2,i never make her separate from me because i do...,sadness,"[(i, NN), (never, VBP), (make, JJ), (her, NN),..."
3,i left with my bouquet of red and yellow tulip...,happiness,"[(i, NN), (left, VBP), (with, JJ), (my, NN), (..."
4,i was feeling a little vain when i did this one,sadness,"[(i, NN), (was, VBP), (feeling, IN), (a, DT), ..."
...,...,...,...
662394,Witnessing the destruction of a natural habita...,disgust,"[(Witnessing, NNP), (the, NN), (destruction, N..."
662395,The vulgar display of wealth amidst poverty wa...,disgust,"[(The, DT), (vulgar, NN), (display, NN), (of, ..."
662396,The disregard for personal space and boundarie...,disgust,"[(The, DT), (disregard, NN), (for, NN), (perso..."
662397,Their manipulation of others' emotions for per...,disgust,"[(Their, DT), (manipulation, JJ), (of, NN), (o..."


In [36]:
df.to_csv('data/emotion_data_merged_4_POS_crf.csv', index=False)

In [40]:
kaggle_base = pd.read_csv("data/kaggle_data.csv", sep="\t")

# Apply the function to the 'sentence' column of df_sample
kaggle_base['POS_crf'] = kaggle_base['sentence'].progress_apply(predict_pos)

progress-bar: 100%|██████████| 1436/1436 [00:00<00:00, 12271.64it/s]


In [42]:
kaggle_base.to_csv('data/kaggle_data_POS_crf.csv', index=False)