In [3]:
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
import numpy as np
import transformers

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Your inputs
genz_meaning = "charisma, ability to attract someone romantically"
corpus = ["He has a lot of charm.", "She knows how to talk to people.", "He's got game.", "Very confident speaker."]

# Get embedding of GenZ meaning
genz_embedding = model.encode(genz_meaning, convert_to_tensor=True)
print(genz_embedding.shape)

# Create list of candidate phrases from corpus
# You can improve this using n-grams, noun phrases, etc.
phrases = []
for sentence in corpus:
    tokens = word_tokenize(sentence)
    phrases.extend(tokens)
    phrases.extend([' '.join(gram) for n in range(2, 4) for gram in ngrams(tokens, n)])

# Remove duplicates
phrases = list(set(phrases))
print(len(phrases))

# Get embeddings
phrase_embeddings = model.encode(phrases, convert_to_tensor=True)
print(phrase_embeddings.shape)

# Compute cosine similarity
cos_scores = util.pytorch_cos_sim(genz_embedding, phrase_embeddings)[0]

# Sort
top_results = np.argsort(-cos_scores)[:10]

# Print top similar phrases
print("Top similar phrases:")
for idx in top_results:
    print(f"{phrases[idx]}: {cos_scores[idx].item():.4f}")


torch.Size([384])
55
torch.Size([55, 384])
Top similar phrases:
lot of charm: 0.3621
confident speaker .: 0.3093
Very confident speaker: 0.3040
of charm: 0.3031
confident speaker: 0.3022
Very confident: 0.3010
confident: 0.2991
of charm .: 0.2770
She: 0.2636
charm: 0.2554


In [None]:

import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_phrases_spacy(text):
    doc = nlp(text)
    phrases = set()

    # Add noun chunks
    for chunk in doc.noun_chunks:
        phrases.add(chunk.text)

    # Add named entities
    for ent in doc.ents:
        phrases.add(ent.text)

    # Add unigrams and bigrams
    tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
    for i in range(len(tokens)):
        phrases.add(tokens[i])
        if i < len(tokens) - 1:
            phrases.add(f"{tokens[i]} {tokens[i+1]}")
    return list(phrases)



def similar_phrases_each_content(genz_embedding, genz_word, content):
    # Return if content is not a string
    if not isinstance(content, str):
        return []

    content = content.replace("\n", "")
    content = content.replace("\r", "")

    phrases = []
    phrases = extract_phrases_spacy(content)
    phrases = list(set(phrases))

    if phrases == []:
        return []
    
    phrase_embeddings = model.encode(phrases, convert_to_tensor=True)

    cos_scores = util.pytorch_cos_sim(genz_embedding, phrase_embeddings)[0]

    top_results = np.argsort(-cos_scores)[:10]

    # Filter phrases with scores greater than 0.5
    filtered_phrases = [(phrases[idx], cos_scores[idx].item()) for idx in top_results if cos_scores[idx].item() > 0.5]

    return filtered_phrases


def similar_phrase_content(genz_embedding, genz_word, df):
    content = df['modified_content'].tolist()
    modified_list = df['Modified'].tolist()
    modified_contents = []
    modified = []

    for i in range(len(content)):
        phrases = similar_phrases_each_content(genz_embedding, genz_word, content[i])
        modified_content = content[i]
        modified_bool = modified_list[i]
        for (phrase, score) in phrases:
            modified_content = modified_content.replace(phrase, genz_word)
            modified_bool = True

        modified_contents.append(modified_content)
        modified.append(modified_bool)

    df['modified_content'] = modified_contents
    df['Modified'] = modified


df = pd.read_csv('datasets/emails_cleaned.csv')

df = df[['From', 'To', 'content', 'Subject']]

genz = pd.read_csv('datasets/genz_slang.csv')

model = SentenceTransformer('all-MiniLM-L6-v2')

# create a new column duplicate of content
df['modified_content'] = df['content']
df['Modified'] = False

for i in range(len(genz)):
    print(i)
    genz_embedding = model.encode(genz.iloc[i, 1], convert_to_tensor=True)
    similar_phrase_content(genz_embedding, genz.iloc[i, 0], df)

# genz_embedding = model.encode(genz.iloc[4, 1], convert_to_tensor=True)
# similar_phrase_content(genz_embedding, genz.iloc[4, 0], df)
# df.iloc[1, 4]

# Save the modified DataFrame to a new CSV file
df.to_csv('datasets/emails_cleaned_modified.csv', index=False)

# Display the first few rows of the modified DataFrame
print("Modified DataFrame:")
df.head()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
Modified DataFrame:


Unnamed: 0,From,To,content,Subject,modified_content,Modified
0,phillip.allen@enron.co,tim.belden@enron.co,Here is our forecast\n\n,,Here is our forecast\n\n,False
1,phillip.allen@enron.co,john.lavorato@enron.co,Traveling to have a business meeting takes the...,Re,Traveling to have a business meeting takes Fac...,True
2,phillip.allen@enron.co,leah.arsdall@enron.co,test successful. way to go!!!,Re: tes,test successful. way to G2G!!!,True
3,phillip.allen@enron.co,randall.gay@enron.co,"Randy,\n\n Can you send me a schedule of the s...",,"Randy,\n\n Can you send me a schedule of Facts...",True
4,phillip.allen@enron.co,greg.piper@enron.co,Let's shoot for Tuesday at 11:45.,Re: Hell,Let's shoot Ffs Tuesday at 11:45.,True


In [55]:
print(df.iloc[1, 4])
df.iloc[1, 2]


Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different Cheugy Cheugy working and Cheugy not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  

My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.



"Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  \n\nMy suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.\n"

In [None]:
import pandas as pd

df = pd.read_csv('datasets/emails_cleaned_modified.csv')

modified_rows = df[df['Modified'] == True]
# print(modified_rows['content'])
# print(modified_rows['modified_content'])

print(modified_rows)


                       From                                               To   
1    phillip.allen@enron.co                           john.lavorato@enron.co  \
2    phillip.allen@enron.co                            leah.arsdall@enron.co   
3    phillip.allen@enron.co                             randall.gay@enron.co   
4    phillip.allen@enron.co                              greg.piper@enron.co   
6    phillip.allen@enron.co  david.l.johnson@enron.com, john.shafer@enron.co   
..                      ...                                              ...   
493  phillip.allen@enron.co                               mary.gray@enron.co   
495  phillip.allen@enron.co                          patti.sullivan@enron.co   
496  phillip.allen@enron.co                         andrea.richards@enron.co   
497  phillip.allen@enron.co                            nick.politis@enron.co   
499  phillip.allen@enron.co                        stagecoachmama@hotmail.co   

                                       

In [None]:
import pandas as pd

df = pd.read_csv('datasets/emails_cleaned.csv')

content_list = df['content'].to_list()

print(content_list[:5])

print(content_list[0].replace("\n", ""))

['Here is our forecast\n\n ', "Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  \n\nMy suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.\n", 'test successful.  way to go!!!', 'Randy,\n\n Can you send me a schedule of the salary and level of everyone in the \nscheduling group.  Plus your thoughts on any changes that