In [2]:
import spacy
import pandas as pd
from spacy_ngram import NgramComponent
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from spacytextblob.spacytextblob import SpacyTextBlob

In [3]:
nlp= spacy.load('en_core_web_md')
nlp.add_pipe('spacy-ngram', config={
    'sentence_level': False,  # initialize sentence-level ngrams
    'doc_level': True,  # skip processing at document-level
    'ngrams': (2, 3)
})

<spacy_ngram.ngram_component.NgramComponent at 0x1eb97982890>

In [4]:
# Initialize SpacyTextBlob with the nlp object
@spacy.Language.factory("spacy_textblob")
def create_spacy_textblob(nlp, name):
    return SpacyTextBlob(nlp)

nlp.add_pipe("spacy_textblob")

<spacytextblob.spacytextblob.SpacyTextBlob at 0x1eb981f7550>

In [4]:
df = pd.read_csv("./Datasets/simplified_emotions.csv")
df

Unnamed: 0,sentence,emotion
0,That game hurt.,other
1,"You do right, if you don't care then fuck 'em!",other
2,Man I love reddit.,happiness
3,"[NAME] was nowhere near them, he was by the Fa...",other
4,Right? Considering it’s such an important docu...,happiness
...,...,...
797512,that was what i felt when i was finally accept...,happiness
797513,i take every day as it comes i m just focussin...,other
797514,i just suddenly feel that everything was fake,other
797515,im feeling more eager than ever to claw back w...,happiness


In [5]:
df.shape

(797517, 2)

In [6]:
sent_list = df['sentence'].to_list()
doc_list = list(nlp.pipe(sent_list))
doc_list

[That game hurt.,
 You do right, if you don't care then fuck 'em!,
 Man I love reddit.,
 [NAME] was nowhere near them, he was by the Falcon. ,
 Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!,
 He isn't as big, but he's still quite popular. I've heard the same thing about his content. Never watched him much.,
 That's crazy; I went to a super [RELIGION] high school and I think I can remember 2 girls the entire 4 years that became teen moms.,
 that's adorable asf,
 "Sponge Blurb Pubs Quaw Haha GURR ha AAa!" finale is too real,
 I have, and now that you mention it, I think that's what triggered my nostalgia. ,
 I wanted to downvote this, but it's not your fault homie.,
 BUT IT'S HER TURN! /s,
 That is odd.,
 That is odd.,
 Build a wall? /jk,
 I appreciate it, that's good to know. I hope I'll have to apply that knowledge one day,
 I appreciate it, that's good to know. I hope I'll have to apply that know

In [7]:
features = {
    'doc_entities': [],
    'doc_noun_chunks': [],
    'doc_2_grams': [],
    'doc_3_grams': [],
    'token_part_of_speech': [],
    'token_lemmatized': [],
    'token_normalized': [],
    'token_dependancy': [],
    'token_sentiment': []
}

for doc in doc_list:
    features['doc_entities'].append(doc.ents)
    features['doc_noun_chunks'].append(list(doc.noun_chunks))
    features['doc_2_grams'].append(doc._.ngram_2)
    features['doc_3_grams'].append(doc._.ngram_3)
    features['token_part_of_speech'].append(list(map(lambda x: x.pos_, doc)))
    features['token_lemmatized'].append(list(map(lambda x: x.lemma_, doc)))
    features['token_normalized'].append(list(map(lambda x: x.norm_, doc)))
    features['token_dependancy'].append(list(map(lambda x: x.dep_, doc)))
    features['token_sentiment'].append(list(map(lambda x: x._.blob.polarity, doc)))

In [None]:
# Create DataFrame from features dictionary
feature_df = pd.DataFrame(features)
feature_df.head()

In [None]:
# Load the datasets
df_1 = pd.read_csv("./Datasets/simplified_emotions.csv")

# Concatenate the datasets vertically
merged_df = pd.concat([df_1, feature_df], axis=1)
merged_df.head()

In [10]:
# Assuming df is your DataFrame
merged_df.drop_duplicates(subset=['sentence'], keep='first', inplace=True)

In [12]:
# Iterate over each column and try to pickle it
problematic_columns = []
for col in merged_df.columns:
    try:
        merged_df[col].to_pickle(f'test_{col}.pkl')
    except Exception as e:
        print(f"Error occurred when pickling column '{col}': {e}")
        problematic_columns.append(col)

print("Columns causing the problem:", problematic_columns)

Error occurred when pickling column 'doc_entities': [E112] Pickling a span is not supported, because spans are only views of the parent Doc and can't exist on their own. A pickled span would always have to include its Doc and Vocab, which has practically no advantage over pickling the parent Doc directly. So instead of pickling the span, pickle the Doc it belongs to or use Span.as_doc to convert the span to a standalone Doc object.
Error occurred when pickling column 'doc_noun_chunks': [E112] Pickling a span is not supported, because spans are only views of the parent Doc and can't exist on their own. A pickled span would always have to include its Doc and Vocab, which has practically no advantage over pickling the parent Doc directly. So instead of pickling the span, pickle the Doc it belongs to or use Span.as_doc to convert the span to a standalone Doc object.
Columns causing the problem: ['doc_entities', 'doc_noun_chunks']


In [17]:
# Drop the problematic columns from the DataFrame
#merged_df.drop(columns=['doc_entities', 'doc_noun_chunks'], inplace=True)

# Save the DataFrame as a pickle file
merged_df.to_pickle('./Datasets/feature_extraction.pkl')

print("DataFrame has been saved as feature_extraction.pkl")

DataFrame has been saved as feature_extraction.pkl


In [14]:
# Save the DataFrame as a CSV file
merged_df.to_csv("./Datasets/feature_extraction.csv", index=False)

print("DataFrame has been saved as feature_extraction.csv")

DataFrame has been saved as feature_extraction.csv


---
**Feature Extraction on the original merged dataset**

In [5]:
df_2 = pd.read_csv("./Datasets/emotion_data_merged.csv")
df_2

Unnamed: 0,sentence,emotion
0,That game hurt.,sadness
1,"You do right, if you don't care then fuck 'em!",neutral
2,Man I love reddit.,happiness
3,"[NAME] was nowhere near them, he was by the Fa...",neutral
4,Right? Considering it’s such an important docu...,happiness
...,...,...
797512,that was what i felt when i was finally accept...,happiness
797513,i take every day as it comes i m just focussin...,fear
797514,i just suddenly feel that everything was fake,sadness
797515,im feeling more eager than ever to claw back w...,happiness


In [6]:
sent_list_2 = df_2['sentence'].to_list()
doc_list_2 = list(nlp.pipe(sent_list_2))
doc_list_2

[That game hurt.,
 You do right, if you don't care then fuck 'em!,
 Man I love reddit.,
 [NAME] was nowhere near them, he was by the Falcon. ,
 Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!,
 He isn't as big, but he's still quite popular. I've heard the same thing about his content. Never watched him much.,
 That's crazy; I went to a super [RELIGION] high school and I think I can remember 2 girls the entire 4 years that became teen moms.,
 that's adorable asf,
 "Sponge Blurb Pubs Quaw Haha GURR ha AAa!" finale is too real,
 I have, and now that you mention it, I think that's what triggered my nostalgia. ,
 I wanted to downvote this, but it's not your fault homie.,
 BUT IT'S HER TURN! /s,
 That is odd.,
 That is odd.,
 Build a wall? /jk,
 I appreciate it, that's good to know. I hope I'll have to apply that knowledge one day,
 I appreciate it, that's good to know. I hope I'll have to apply that know

In [7]:
features_2 = {
    'doc_entities': [],
    'doc_noun_chunks': [],
    'doc_2_grams': [],
    'doc_3_grams': [],
    'token_part_of_speech': [],
    'token_lemmatized': [],
    'token_normalized': [],
    'token_dependancy': [],
    'token_sentiment': []
}

for doc in doc_list_2:
    features_2['doc_entities'].append(doc.ents)
    features_2['doc_noun_chunks'].append(list(doc.noun_chunks))
    features_2['doc_2_grams'].append(doc._.ngram_2)
    features_2['doc_3_grams'].append(doc._.ngram_3)
    features_2['token_part_of_speech'].append(list(map(lambda x: x.pos_, doc)))
    features_2['token_lemmatized'].append(list(map(lambda x: x.lemma_, doc)))
    features_2['token_normalized'].append(list(map(lambda x: x.norm_, doc)))
    features_2['token_dependancy'].append(list(map(lambda x: x.dep_, doc)))
    features_2['token_sentiment'].append(list(map(lambda x: x._.blob.polarity , doc)))

In [None]:
# Create DataFrame from features dictionary
feature_df_2 = pd.DataFrame(features_2)
feature_df_2.head()

In [None]:
# Load the datasets
df_1 = pd.read_csv("./Datasets/emotion_data_merged.csv") 

# Concatenate the datasets vertically
merged_df_2 = pd.concat([df_1, feature_df_2], axis=1)
merged_df_2.head()

In [10]:
# Assuming df is your DataFrame
merged_df_2.drop_duplicates(subset=['sentence'], keep='first', inplace=True)

In [11]:
# Drop the problematic columns from the DataFrame
merged_df_2.drop(columns=['doc_entities', 'doc_noun_chunks'], inplace=True)

# Save the DataFrame as a pickle file
merged_df_2.to_pickle('./Datasets/feature_extraction_full.pkl')

print("DataFrame has been saved as feature_extraction_full.pkl")

DataFrame has been saved as feature_extraction_full.pkl


In [12]:
# Save the DataFrame as a CSV file
merged_df_2.to_csv("./Datasets/feature_extraction_full.csv", index=False)

print("DataFrame has been saved as feature_extraction_full.csv")

DataFrame has been saved as feature_extraction_full.csv
