In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [21]:
# Load the raw dataset.
# NOTE: unpickling can execute arbitrary code, so only load "data.pkl" from a trusted source.
df = pd.read_pickle("data.pkl")

# Keep a random 10% of the rows so the rest of the notebook runs quickly.
# The seed is fixed so that the same rows (and therefore the same outputs)
# are produced on every Restart & Run All.
SAMPLE_SEED = 42
df = df.sample(frac=0.1, random_state=SAMPLE_SEED)

In [22]:
# Basic text cleaning.
#
# `regex=` is passed explicitly on every replacement: the default of
# Series.str.replace changed from regex=True to regex=False in pandas 2.0, so
# without it the pattern-based steps below would be treated as literal strings
# and silently do nothing.
df['text'] = (
    df['text']
    # Strip the literal link markers "href", "http" and "www"
    # (only these substrings are removed, not the rest of the URL).
    .str.replace('href', '', regex=False)
    .str.replace('http', '', regex=False)
    .str.replace('www', '', regex=False)
    # TODO: check whether capital letters are used frequently to express emotion
    # before discarding case.
    .str.lower()
    # Drop everything that is neither a word character nor whitespace (punctuation).
    .str.replace(r'[^\w\s]', '', regex=True)
    # Drop digits.
    .str.replace(r'\d+', '', regex=True)
    # Collapse every whitespace run (newlines included) into a single space.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)


In [23]:
from nltk.corpus import stopwords

# Start from NLTK's English stop-word list.
stop_words = set(stopwords.words('english'))

# For every stop word that contains an apostrophe, also register two variants:
# one with the apostrophe replaced by a space and one with it removed.
# NOTE(review): the filtering below compares whitespace-split tokens, so the
# variants that contain a space can never match a token.
apostrophe_stop_words = [entry for entry in stop_words if "'" in entry]
more_stop_words = {entry.replace("'", " ") for entry in apostrophe_stop_words}
more_stop_words |= {entry.replace("'", "") for entry in apostrophe_stop_words}
stop_words = stop_words | more_stop_words

print(stop_words)


# Negation words must survive stop-word removal, so they are collected
# separately and subtracted from the stop-word set.
neg_stop_words = {
    'no', 'nor', 'not', 'ain',
    'aren', "aren't",
    'couldn', "couldn't",
    'didn', "didn't",
    'doesn', "doesn't",
    'hadn', "hadn't",
    'hasn', "hasn't",
    'haven', "haven't",
    'isn', "isn't",
    'ma',
    'mightn', "mightn't",
    'mustn', "mustn't",
    'needn', "needn't",
    'shan', "shan't",
    'shouldn', "shouldn't",
    'wasn', "wasn't",
    'weren', "weren't",
    'won', "won't",
    'wouldn', "wouldn't",
    "can't", "cant",
}

# Same apostrophe expansion as above, applied to the negation words.
apostrophe_neg_words = [entry for entry in neg_stop_words if "'" in entry]
more_neg_words = {entry.replace("'", " ") for entry in apostrophe_neg_words}
more_neg_words |= {entry.replace("'", "") for entry in apostrophe_neg_words}
neg_stop_words = neg_stop_words | more_neg_words

print(neg_stop_words)

stop_words = stop_words.difference(neg_stop_words)


def _drop_stop_words(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`."""
    return ' '.join(token for token in text.split() if token not in stop_words)


df['text'] = df['text'].apply(_drop_stop_words)

# if a word is in neg_stop_words, we will add a prefix "not_" to the word that follows it and remove the negative word

def add_not_prefix(text, negations=None):
    """Fold each negation word into the token that follows it.

    The text is split on whitespace. Whenever a token is a negation word, it
    is dropped and the next token is emitted with a ``not_`` prefix instead
    (``"not happy"`` -> ``"not_happy"``). A negation word that is the last
    token has nothing to attach to and is emitted as the plain token ``not``.

    Parameters
    ----------
    text : str
        Whitespace-separated text to transform.
    negations : collection of str, optional
        Tokens treated as negation words. Defaults to the notebook-level
        ``neg_stop_words`` set.

    Returns
    -------
    str
        The transformed tokens joined by single spaces.
    """
    if negations is None:
        negations = neg_stop_words

    words = text.split()
    new_words = []
    i = 0
    while i < len(words):
        if words[i] in negations:
            # Explicit bounds check instead of catching IndexError with a bare except.
            if i + 1 < len(words):
                new_words.append('not_' + words[i + 1])
            else:
                new_words.append('not')
            # Skip both the negation word and the token it was merged into.
            i += 2
        else:
            new_words.append(words[i])
            i += 1
    return ' '.join(new_words)

# Merge every negation word into the token that follows it, row by row.
df['text'] = df['text'].map(add_not_prefix)




{"hasn't", 'each', 'if', "don't", 'should', 'mustn', 'theirs', 'been', 'hasn t', "wasn't", 'they', 'in', 'we', 'youd', "you've", "didn't", 'the', 'aren', 'until', 'didn', 'my', 'during', "doesn't", 'youve', 'to', 'is', 'have', 'hasn', 'dont', 'me', 'her', 'further', 'here', 'no', "shouldn't", 'just', 'so', 'being', 'yourself', 'off', 'our', 'can', 'ain', 'o', 'hadn', 'themselves', 'most', 'as', 'into', 'y', 'and', 'he', 'youll', 'aren t', 'same', 'that ll', 'hasnt', 'ourselves', 'shouldve', 'are', "you're", 'she', 'your', 'on', 'werent', 'having', 'down', 'should ve', 'be', 'when', 't', 'their', 'that', 'doing', "won't", 'hadnt', 'mustn t', 'shant', 'its', 'shouldnt', 'hadn t', 'but', 'because', 'by', 'itself', 'at', 'will', 'll', "wouldn't", "aren't", 'not', 'had', 'shouldn t', 'too', 'below', 'after', 'isn t', 'wasn', 'you', 'his', 'whom', 'an', 'before', 'which', 'while', 'i', 'it s', 'ours', 'such', 'wouldnt', "should've", 'you re', 'mightn', 'of', 'didnt', 'herself', 'shes', 'migh

In [24]:
def _keep_long_tokens(text):
    """Return `text` keeping only whitespace-separated tokens of 3 or more characters."""
    return ' '.join(token for token in text.split() if len(token) >= 3)


# Drop every token shorter than 3 characters.
df['text'] = df['text'].apply(_keep_long_tokens)


In [25]:
# Stemming and lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def lemmatize(text):
    """Return `text` with every whitespace-separated token run through `lemmatizer`."""
    lemmas = (lemmatizer.lemmatize(token) for token in text.split())
    return " ".join(lemmas)


def stem(text):
    """Return `text` with every whitespace-separated token run through `stemmer`."""
    stems = (stemmer.stem(token) for token in text.split())
    return " ".join(stems)


# Lemmatize first, then stem the lemmatized text.
df["text"] = df["text"].apply(lemmatize).apply(stem)


In [26]:
# Show the first 10 rows whose emotion label is "joy"
joy_rows = df.loc[df['emotions'] == 'joy']
print(joy_rows.head(10))


                                                     text emotions
27784   face return longwood not_sur feel one hand eag...      joy
24235   feel love look sun feel warmth insid feel joy ...      joy
66171                   feel terrif terribl weak shall pa      joy
5267                                      feel lot better      joy
122434                    uncomfort start feel relax zone      joy
130128  feel clever say like make brand new mistak eve...      joy
118549  feel bit adventur decid tri loreal feria wild ...      joy
23863   not_want hurt necessarili feel strong physic a...      joy
11176             sit feel like put time machin not_pleas      joy
137535  happi bless grate feel amaz level content peac...      joy


In [27]:
# Persist the cleaned DataFrame to a pickle file.
cleaned_pickle_path = "df_v1.pkl"
df.to_pickle(cleaned_pickle_path)


In [28]:
from collections import defaultdict
from nltk import ngrams

def generate_ngrams(sentence, n):
    """Return all word-level n-grams of `sentence`, in order of appearance.

    Parameters
    ----------
    sentence : str
        Text to split on whitespace.
    n : int
        Number of consecutive words per n-gram; must be at least 1.

    Returns
    -------
    list of str
        Each n-gram as its words joined by single spaces. The list is empty
        when the sentence has fewer than `n` words.

    Raises
    ------
    ValueError
        If `n` is smaller than 1 (the slicing below would otherwise yield
        empty or meaningless n-grams instead of failing).
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    words = sentence.split()
    # The result is built without a local called `ngrams`, which would shadow
    # the `ngrams` name imported from nltk in this cell.
    return [' '.join(words[start:start + n]) for start in range(len(words) - n + 1)]




# Count how often each trigram occurs across all cleaned texts.
n = 3
ngram_counts = defaultdict(int)
for sentence in df['text']:
    for gram in generate_ngrams(sentence, n):
        ngram_counts[gram] = ngram_counts[gram] + 1

# Rank by count, most frequent first: sort ascending, then reverse the list.
ranked_ngrams = sorted(ngram_counts.items(), key=lambda item: item[1])
ranked_ngrams.reverse()
ngram_df = pd.DataFrame(ranked_ngrams, columns=['ngram', 'count'])
print(ngram_df.head(10))

# Persist the ranked n-gram table to a pickle file.
ngram_df.to_pickle("ngram_v1.pkl")



              ngram  count
0    feel littl bit    120
1     feel like ive    102
2    make feel like     73
3    feel like need     63
4     feel like get     59
5   still feel like     54
6  feel like punish     48
7   feel like peopl     47
8   alway feel like     45
9  feel like someth     43


In [29]:
# Preview the first 10 rows of the cleaned DataFrame.
df.head(n=10)


Unnamed: 0,text,emotions
27784,face return longwood not_sur feel one hand eag...,joy
13895,not_help feel ceiliuradh miss opportun acknowl...,sadness
11451,feel like retreat back mother place support en...,love
765,almost back track beauti day feel bless,love
24235,feel love look sun feel warmth insid feel joy ...,joy
5888,not_feel graciou magnanim feel like curl fetal...,love
66171,feel terrif terribl weak shall pa,joy
34584,not_feel total heartless monster may alway tol...,anger
82926,need not_feel worthless,sadness
113501,feel remors,sadness


In [30]:
# Build a binary bag-of-words matrix: one column per vocabulary term, with a
# non-zero entry when the term occurs in the row's text.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)
term_matrix = vectorizer.fit_transform(df['text'])
# NOTE(review): .toarray() materialises the whole matrix as a dense array;
# consider keeping `term_matrix` as-is if downstream code accepts it.
X = term_matrix.toarray()
print(X.shape)

# Wrap the matrix in a DataFrame whose columns are the vocabulary terms.
vocabulary = vectorizer.get_feature_names_out()
df_words = pd.DataFrame(X, columns=vocabulary)

print(df.head(1))

# List the vocabulary terms that are non-zero in the first row of df_words,
# to compare against the text printed by df.head(1) above.
first_row = df_words.iloc[0]
print(df_words.columns[first_row != 0])

# Saving df_words is disabled: the resulting file is too big.
#df_words.to_pickle("df_words_v1.pkl")




(41681, 17972)
                                                    text emotions
27784  face return longwood not_sur feel one hand eag...      joy
Index(['back', 'colleg', 'eager', 'face', 'feel', 'freedom', 'friend', 'hand',
       'longwood', 'not_sur', 'one', 'return', 'see'],
      dtype='object')


In [31]:
from sentence_transformers import SentenceTransformer

def convert_text_to_embeddings(df, text_column, model_name='all-MiniLM-L6-v2'):
    """Encode one text column with a SentenceTransformer model.

    Side effect: `df` is modified in place. A new column named 'embeddings'
    is written, holding `embeddings.tolist()` (one entry per row).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the texts; it receives the 'embeddings' column.
    text_column : str
        Name of the column in `df` whose values are encoded.
    model_name : str, optional
        Name passed to `SentenceTransformer(...)`.

    Returns
    -------
    The object returned by `model.encode(...)`
        (presumably a 2-D numpy array, since `.tolist()` is called on it; confirm).
    """
    encoder = SentenceTransformer(model_name)

    # The model is fed a plain Python list of the column's values.
    sentences = df[text_column].tolist()
    embeddings = encoder.encode(sentences)

    # Store the embeddings next to their source text, as a single column.
    df['embeddings'] = embeddings.tolist()

    return embeddings

# Encode the cleaned 'text' column of df.
# The call also writes an 'embeddings' column into df as a side effect.
embeddings = convert_text_to_embeddings(
    df,
    text_column='text',
    model_name='all-MiniLM-L6-v2',
)





    

In [32]:
# Print the embedding of the first text as a sanity check.
first_embedding = embeddings[0]
print(first_embedding)


[-5.81458732e-02 -5.60483709e-03  8.68722498e-02  4.17182781e-02
  1.92652512e-02  3.85539085e-02  6.65507019e-02 -6.02931455e-02
 -8.92053638e-03 -2.67916750e-02  4.97117154e-02 -4.83248979e-02
 -3.81650962e-02 -4.73941900e-02  6.84752455e-03 -5.11829481e-02
 -5.13229333e-03 -3.70376334e-02 -2.56631244e-02  1.07601456e-01
  6.60977606e-03  1.83822692e-03 -6.51664510e-02 -3.26131284e-02
 -2.13531661e-03 -5.26386537e-02 -3.27641070e-02 -3.78051512e-02
  6.38306439e-02 -9.13201123e-02  5.57275452e-02  1.32038862e-01
 -3.58072631e-02  3.95253068e-03 -4.15708199e-02  1.30974427e-01
 -3.78422923e-02 -5.21705188e-02 -1.13910604e-02  5.93322851e-02
 -1.29474699e-01  7.89017358e-04  1.51101360e-02  4.78607789e-02
 -4.01755702e-03 -4.56100218e-02  2.49709561e-02  1.00279246e-02
  4.62591685e-02 -7.68105462e-02  1.22880042e-02 -1.31569011e-02
 -6.09980784e-02 -2.93911854e-03  1.77230891e-02  2.48986315e-02
 -8.93952232e-03 -8.63799676e-02 -1.81567650e-02  3.69794816e-02
 -1.28638847e-02 -7.54874

In [33]:
# Pickle the DataFrame together with its embeddings column.
# Disabled: the resulting file is too large.
#df.to_pickle("df_embeddings_v1.pkl")


# TODO List

- Stopwords
- Stemming
- Lemmatization
- N-grams
- TF-IDF
- Word Embeddings
