In [11]:
import pandas as pd
import re

In [3]:
df = pd.read_csv('dreams.csv')

In [4]:
df.head()

Unnamed: 0,content,id
0,"\n#1 (1957)The one at the Meads's house, where...",alta
1,\n#2 (8/11/67)I'm at a family reunion in a lar...,alta
2,\n#3 (8/1/85)I watch a plane fly past and shor...,alta
3,\n#4 (1985?)Me pulling the green leaves and be...,alta
4,\n#5 (1985?)I'm in a room that reminds me of (...,alta


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
from textblob import TextBlob

def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words

custom_stopwords = ['did', 'don', 'didn', 'came', 'got', 'going', 'just'] + list(stop_words.ENGLISH_STOP_WORDS)

# Vectorize and save into a new dataframe
vec = TfidfVectorizer(stop_words=custom_stopwords,
                      #tokenizer=textblob_tokenizer,
                      max_df=0.95,
                      min_df=0.15,
                      max_features=1000,
                      use_idf=True)

# Fit from the 'text' column of our dataframe
matrix = vec.fit_transform(df['content'])

# Then turn it into a new dataframe
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())

In [6]:
results.head()

Unnamed: 0,dream,house,know,like,little,look,man,people,room,said,say,think,time,went
0,0.0,0.676375,0.0,0.169588,0.490429,0.0,0.218161,0.206112,0.427972,0.0,0.0,0.0,0.0,0.0
1,0.0,0.373968,0.353822,0.562594,0.406738,0.0,0.0,0.341879,0.0,0.0,0.0,0.368956,0.0,0.0
2,0.219787,0.425963,0.201508,0.160203,0.0,0.0,0.0,0.194706,0.202144,0.0,0.221,0.630381,0.423065,0.0
3,0.0,0.288699,0.136573,0.217157,0.627993,0.309472,0.0,0.131963,0.137004,0.0,0.0,0.569659,0.0,0.0
4,0.0,0.133798,0.506361,0.201284,0.0,0.0,0.0,0.244634,0.380969,0.403135,0.0,0.132005,0.531551,0.133073


In [7]:
from sklearn.cluster import KMeans

# How many clusters?
number_of_clusters=3
km = KMeans(n_clusters=number_of_clusters)

# Let's fit it!
km.fit(matrix)
km.fit

<bound method KMeans.fit of KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)>

In [8]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vec.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: room house man dream people went say time know think
Cluster 1: said went like know people room time little man house
Cluster 2: like people think man know say look room little time


In [9]:
df['category'] = km.labels_
df.head()

Unnamed: 0,content,id,category
0,"\n#1 (1957)The one at the Meads's house, where...",alta,0
1,\n#2 (8/11/67)I'm at a family reunion in a lar...,alta,2
2,\n#3 (8/1/85)I watch a plane fly past and shor...,alta,0
3,\n#4 (1985?)Me pulling the green leaves and be...,alta,0
4,\n#5 (1985?)I'm in a room that reminds me of (...,alta,1


In [61]:
words_to_exclude = ['was', 'have', 'can', 'don', 'just', 'am', 
                    'will', 'had', 'do', 'want', 'could', 'would', 
                    'never', 'ever', 'didn', 'did', 'got', 'get', 
                    'couldn', 'were', 'should', 'started']

df['I'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= I )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['I'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

think          4914
see            4628
say            3549
went           3527
go             3447
said           2730
saw            2674
thought        2474
feel           2102
remember       1895
know           1879
look           1775
told           1655
knew           1580
felt           1422
tell           1390
wanted         1355
walk           1295
ask            1293
woke           1223
asked          1214
looked         1208
try            1145
realize        1138
realized       1111
put             983
guess           889
found           866
find            827
are             817
               ... 
cashed            1
patched           1
sculptured        1
messing           1
sorrowfully       1
frill             1
backhand          1
dazedly           1
trumpet           1
hypothesize       1
thinkthe          1
smashed           1
grade             1
falsified         1
recite            1
dealt             1
skinned           1
dunked            1
deftly            1


In [47]:
df['his'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= his )[a-zA-Z]+', x['content'])],
    axis=1)

his_list = pd.Series([element for list_ in df['his'] for element in list_])
his_list.value_counts()

head           372
wife           356
hand           262
face           254
name           180
house          158
car            157
back           150
arm            138
mother         130
arms           118
friends        116
hands          107
family         105
father         104
eyes           102
mouth          101
friend         100
own             98
office          95
hair            91
room            89
way             79
girlfriend      78
penis           76
body            72
son             72
brother         72
chest           71
parents         63
              ... 
canister         1
quota            1
tasks            1
purchase         1
are              1
tendency         1
blond            1
spouse           1
pacemaker        1
figure           1
workout          1
mansion          1
hardening        1
usually          1
empty            1
specially        1
erected          1
loud             1
dangerous        1
ghetto           1
excitement       1
rock        

In [63]:
df['her'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= her )[a-zA-Z]+', x['content'])],
    axis=1)

his_list = pd.Series([element for list_ in df['her'] for element in list_])
his_list.value_counts()

and              1119
to                926
that              431
i                 337
in                280
if                246
head              222
face              219
husband           199
a                 198
up                188
about             187
back              178
mother            177
for               160
house             157
hand              155
name              135
hair              133
how               126
what              126
the               117
on                114
room              106
as                106
out               105
car               102
because            99
with               94
she                90
                 ... 
waterbed            1
bridal              1
bosses              1
interrupt           1
activlties          1
captor              1
church              1
rock                1
frailty             1
diamonds            1
japanese            1
disappearance       1
catch               1
overnight           1
skis      

In [68]:
words_to_exclude = ['the','be','get','go','me','a','do','another','my','take','have',
                    'him','her','make','find','leave','them','come','put','look',
                    'talk','tell','give','keep','his','walk','this','questions','it','use',
                    'myself', 'where', 'our', 'an', 'some', 'let', 'someone', 'one', 'us']

df['to'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= to )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['to'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

see             2772
help            1138
work             702
say              613
play             577
try              576
move             563
stop             562
stay             543
know             540
sleep            531
sit              508
eat              497
run              433
drive            422
call             410
buy              407
ask              393
show             390
pick             388
meet             385
turn             359
start            327
wait             325
figure           319
kill             312
catch            304
visit            302
pay              293
live             292
                ... 
singing            1
justice            1
walt               1
been               1
hands              1
logoff             1
confirmation       1
rack               1
zig                1
josie              1
butthead           1
riff               1
furiously          1
belmont            1
batteries          1
session            1
enfold       

In [73]:
words_to_exclude = ['was', 'have', 'can', 'don', 'just', 'am', 
                    'will', 'had', 'do', 'want', 'could', 'would', 
                    'never', 'ever', 'didn', 'did', 'got', 'get', 
                    'couldn', 'were', 'should', 'started', 'is', 'has',
                    'and', 'doesn', 'gets', 'wouldn',]

df['he'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= he )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['he'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

said            688
says            358
wanted          252
wants           195
came            193
does            149
wasn            147
told            140
went            136
knew            117
comes           103
looked          100
took             93
won              93
left             91
asked            90
goes             90
gave             89
looks            83
knows            80
put              79
must             79
thought          79
seemed           77
really           72
saw              69
might            65
walked           65
thinks           65
kept             63
               ... 
evidently         1
snatched          1
mind              1
pontificates      1
slips             1
offended          1
expect            1
interacted        1
cusses            1
speak             1
laments           1
think             1
yields            1
send              1
nuzzled           1
ground            1
recently          1
upset             1
admires           1


In [74]:
words_to_exclude = ['was', 'have', 'can', 'don', 'just', 'am', 
                    'will', 'had', 'do', 'want', 'could', 'would', 
                    'never', 'ever', 'didn', 'did', 'got', 'get', 
                    'couldn', 'were', 'should', 'started', 'is', 'has',
                    'and', 'doesn', 'gets', 'wouldn',]

df['she'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= she )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['she'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

said             589
says             286
wanted           202
wants            141
came             130
told             119
does             109
wasn             109
gave              87
went              86
thought           79
knew              76
took              73
left              64
comes             63
asked             62
looked            61
won               60
goes              60
knows             60
looks             57
saw               57
tells             56
might             55
seemed            54
needed            51
needs             50
thinks            49
must              47
sees              46
                ... 
hitched            1
assigned           1
studies            1
figure             1
mimics             1
glanced            1
grew               1
doddles            1
signed             1
strokes            1
sprays             1
storms             1
doubted            1
closes             1
dials              1
stand              1
shook        

In [80]:
words_to_exclude = ['i', 'we', 'that', 'he', 'they', 'it', 
                    'much', 'she', 'the', 'on', 'many', 'as', 
                    'you', 'there', 'when', 'no', 'my', 'then', 
                    'in', 'was', 'often', 'to', 'j', 'if',
                    'is', 'this', 'gets', 'wouldn',]

df['so'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= so )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['so'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

long            140
happy            93
close            92
far              90
hard             64
good             63
bad              63
well             61
big              60
and              53
fast             49
forth            43
upset            41
angry            37
tired            34
high             33
excited          32
people           31
small            31
nice             29
real             29
scared           29
late             28
glad             25
mad              23
loud             23
very             22
sad              22
cute             21
crowded          20
               ... 
left              1
naomi             1
angrily           1
love              1
sluggish          1
innocuous         1
rare              1
whoever           1
miles             1
away              1
person            1
slight            1
goodnight         1
breakfast         1
hopefully         1
typing            1
ashley            1
thankful          1
ms                1


In [81]:
words_to_exclude = ['i', 'we', 'that', 'he', 'they', 'it', 
                    'much', 'she', 'the', 'on', 'many', 'as', 
                    'you', 'there', 'when', 'no', 'my', 'then', 
                    'in', 'was', 'often', 'to', 'j', 'if',
                    'is', 'this', 'gets', 'wouldn',]

df['extremely'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= extremely )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['extremely'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

upset            13
large             9
happy             8
angry             8
nervous           7
anxious           6
steep             5
curious           4
sad               4
heavy             4
tired             4
cold              4
dangerous         3
attractive        3
surprised         3
good              3
pleasant          3
poor              3
uncomfortable     2
short             2
frightened        2
frustrated        2
long              2
hot               2
perplexed         2
agitated          2
distraught        2
tall              2
unpleasant        2
small             2
                 ..
seriously         1
deep              1
confident         1
helpful           1
useful            1
stern             1
icy               1
busy              1
strong            1
dark              1
dejected          1
deserted          1
rude              1
thick             1
interested        1
fed               1
valuable          1
weak              1
exciting          1


In [84]:
words_to_exclude = ['go', 'get', 'be', 'do', 'have']

df['wanted_to'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= wanted to )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['wanted_to'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

see           113
know           87
take           70
make           61
talk           48
buy            34
play           27
leave          26
keep           26
stay           25
help           23
tell           23
put            21
come           20
show           19
give           19
ask            18
say            18
try            17
stop           16
watch          16
hear           14
marry          14
find           14
use            13
call           13
look           12
sit            12
join           12
drive          11
             ... 
park            1
send            1
belong          1
shave           1
limber          1
compose         1
light           1
cheer           1
signal          1
search          1
rape            1
create          1
own             1
switch          1
repay           1
dress           1
abduct          1
defend          1
flirt           1
reassure        1
drown           1
devour          1
bet             1
cure            1
slip      

In [87]:
words_to_exclude = ['was', 'have', 'can', 'don', 'just', 'am', 
                    'will', 'had', 'do', 'want', 'could', 'would', 
                    'never', 'ever', 'didn', 'did', 'got', 'get', 
                    'couldn', 'were', 'should', 'started', 'is', 'has',
                    'and', 'doesn', 'gets', 'wouldn', 'are', 'all', 'weren',
                    'went', 'came', 'come', 'must', 'might']

df['they'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= they )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['they'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

said          195
wanted        139
thought        72
knew           70
won            69
seemed         68
left           65
looked         64
look           63
took           61
say            53
need           53
aren           52
go             52
told           52
put            52
needed         50
know           49
think          48
both           46
saw            46
really         45
gave           43
made           42
seem           39
may            35
used           33
called         33
leave          31
kept           30
             ... 
deserve         1
saddled         1
wake            1
screamed        1
cheat           1
expecting       1
graded          1
toyed           1
fan             1
oohed           1
warned          1
chasing         1
originally      1
page            1
panic           1
assign          1
cleaned         1
peer            1
protest         1
earn            1
face            1
ignore          1
respect         1
as              1
kinda     

In [91]:
words_to_exclude = []

df['like_a'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= like a )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['like_a'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

little         54
small          37
baby           37
big            36
very           31
large          30
movie          25
man            22
huge           21
hotel          20
school         17
maze           16
cat            16
child          15
giant          14
roller         13
cross          12
hospital       12
store          12
regular        12
good           12
normal         11
dog            11
carnival       11
city           11
bus            11
lot            11
black          11
real           11
white          10
               ..
slave           1
touring         1
cello           1
mature          1
sideways        1
stair           1
cane            1
pageboy         1
variation       1
decorative      1
hawk            1
mail            1
martin          1
mr              1
bridge          1
handle          1
sauna           1
haunted         1
cocoon          1
caftan          1
solo            1
staircase       1
buddha          1
cream           1
goat      

In [92]:
words_to_exclude = []

df['saw_a'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= saw a )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['saw_a'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

man            43
big            33
lot            30
woman          23
small          20
girl           18
sign           16
boy            16
huge           15
little         14
group          13
picture        13
large          13
young          12
car            11
very           10
red             9
couple          9
bunch           9
friend          7
beautiful       7
fellow          7
horse           6
lady            6
name            5
few             5
black           5
number          5
white           5
policeman       5
               ..
line            1
kid             1
norwegian       1
handicapped     1
typed           1
rather          1
string          1
series          1
pool            1
slip            1
way             1
sister          1
week            1
play            1
tail            1
stage           1
bright          1
waterfall       1
meteor          1
different       1
wagon           1
sheet           1
seat            1
hanger          1
spring    

In [94]:
words_to_exclude = []

df['to_the'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= to the )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['to_the'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

other          347
bathroom       344
door           285
right          257
house          255
left           226
front          226
top            218
car            178
back           178
ground         161
side           157
next           151
beach          115
end            107
floor          107
bottom         104
man             96
edge            93
hospital        93
kitchen         93
store           86
room            82
office          81
window          79
airport         71
basement        70
woman           70
street          63
place           58
              ... 
dungeon          1
bumper           1
xxx              1
brown            1
details          1
lateness         1
hills            1
wharves          1
immediate        1
theme            1
waking           1
receiver         1
peculiar         1
tracks           1
danger           1
gray             1
perfume          1
super            1
depressions      1
cobbler          1
drawer           1
formation   

In [95]:
words_to_exclude = []

df['in'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= in the )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['in'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

dream        827
back         656
middle       512
car          462
room         456
house        389
water        381
front        336
kitchen      297
same         240
air          234
morning      223
living       221
bathroom     199
basement     165
country      126
other        118
center       117
distance     113
store        102
next         102
hall         102
parking      100
hospital      98
way           95
first         94
sky           93
woods         92
door          90
corner        88
            ... 
unit           1
mermaid        1
english        1
cottage        1
research       1
backstage      1
blank          1
custom         1
financial      1
ticket         1
smelly         1
always         1
tray           1
batting        1
blanks         1
scope          1
costume        1
sunset         1
womb           1
blimp          1
violent        1
designer       1
answer         1
backwash       1
sore           1
sub            1
bancroft       1
waterfall     

In [96]:
words_to_exclude = []

df['in_a'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= in a )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['in_a'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

room            436
car             359
house           330
large           293
small           255
very            202
big             201
hospital        135
building        114
hurry           110
chair           108
wheelchair      105
place           103
long            102
different        97
hotel            95
row              85
little           84
while            84
huge             83
circle           83
store            83
bed              82
strange          80
group            69
few              69
corner           68
restaurant       68
play             64
dream            60
               ... 
sheaf             1
physics           1
tri               1
sr                1
message           1
hint              1
hollandaise       1
seashore          1
cover             1
meal              1
precise           1
grumpy            1
disagreeable      1
frozen            1
ravine            1
santa             1
toga              1
furniture         1
suppressed        1


In [97]:
words_to_exclude = []

df['strange'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= strange )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['strange'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

because         40
to              32
that            29
and             29
thing           26
house           20
place           19
dream           19
man             18
city            17
people          13
town            12
part            11
about           10
looking          9
things           8
sort             8
feeling          7
way              7
reason           7
lady             7
men              6
little           6
neighborhood     6
land             5
name             5
street           5
happened         5
one              5
noises           5
                ..
jungle           1
sight            1
questions        1
phenomena        1
using            1
collection       1
engine           1
interlude        1
weapons          1
guys             1
piano            1
state            1
curves           1
power            1
drunk            1
roller           1
bicycle          1
corridor         1
airport          1
growths          1
shadows          1
streets     

In [98]:
words_to_exclude = []

df['into'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= into a )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['into'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

room           107
small           76
building        40
large           36
store           33
car             29
big             29
house           28
bathroom        21
huge            20
little          19
new             18
bedroom         18
very            17
place           16
corner          16
parking         14
closet          14
sort            14
tiny            14
hole            13
restaurant      12
fight           12
wall            12
gas             12
box             11
man             11
woman            9
garage           9
door             9
              ... 
prayer           1
girls            1
chamber          1
den              1
slooshing        1
hippie           1
laundry          1
forward          1
bear             1
safely           1
land             1
stoplight        1
domestic         1
disturbance      1
dictaphone       1
cannon           1
round            1
crocheted        1
particular       1
master           1
pallet           1
trampoline  

In [99]:
words_to_exclude = []

df['was'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= was )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['was'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

a                 6673
in                3927
going             3295
very              1586
the               1516
at                1368
trying            1209
there             1195
on                1092
like              1068
just              1059
sitting            903
not                878
with               877
walking            762
so                 741
looking            721
talking            694
an                 657
really             655
about              646
supposed           609
doing              607
to                 596
getting            595
standing           590
thinking           561
working            555
kind               538
still              526
                  ... 
double               1
barry                1
drizzling            1
clothing             1
simultaneously       1
oblivious            1
block                1
camouflaged          1
alfred               1
mountain             1
enthralled           1
unwelcome            1
secretive  

In [103]:
words_to_exclude = []

df['a'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= a )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['a'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

little         2775
lot            1832
small          1752
very           1657
man            1645
big            1619
few            1589
large          1386
woman          1340
long           1276
bit            1202
couple          958
good            936
huge            916
car             890
room            883
house           810
group           737
while           703
place           674
dream           670
friend          655
girl            604
young           585
new             568
table           503
bunch           501
sort            478
baby            452
white           441
               ... 
puny              1
continental       1
therapy           1
scanty            1
bushing           1
lamacchia         1
holy              1
delinquent        1
knack             1
penetrating       1
tracked           1
putrefying        1
lad               1
requisition       1
martyred          1
kneeler           1
crawly            1
pugilist          1
rate              1


In [105]:
words_to_exclude = []

df['on_a'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= on a )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['on_a'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

couch          166
bus            107
trip            99
table           94
chair           77
bed             76
boat            73
bicycle         71
small           71
train           70
bench           60
street          55
big             51
high            47
hill            46
shelf           45
little          44
date            44
large           42
wall            41
long            40
very            40
ledge           37
piece           36
plane           36
beach           32
pair            30
road            30
side            30
motorcycle      30
              ... 
percentage       1
curvy            1
term             1
fork             1
training         1
crumpled         1
puzzle           1
tom              1
marble           1
rubber           1
bow              1
barstool         1
thursday         1
partial          1
wind             1
fake             1
die              1
rectangular      1
wave             1
pump             1
germany          1
folder      

In [110]:
words_to_exclude = []

df['my'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= my )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['my'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()[90:]

grandfather    77
knees          76
van            75
neighbor       74
side           74
grandmother    73
attention      73
wallet         72
surprise       71
birth          71
dress          70
housemate      68
computer       68
partner        68
books          67
job            66
door           66
grade          66
book           65
chest          65
shirt          65
money          65
eye            65
penis          64
girl           63
voice          63
camera         60
shoulders      58
birthday       58
dog            58
               ..
toolbox         1
attire          1
shenanigans     1
internal        1
amusing         1
texaco          1
shoelace        1
canna           1
pov             1
leopard         1
garment         1
supreme         1
droopy          1
sat             1
ciisc           1
foxhole         1
candy           1
lord            1
diaper          1
copilot         1
equilibrium     1
strap           1
stamina         1
tea             1
moby      

In [122]:
words_to_exclude = []

df['have'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?<= have )[a-zA-Z]+', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['have'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()[90:]

missed          15
breakfast       15
died            14
walked          14
too             14
your            14
with            14
room            14
arrived         14
changed         14
met             14
those           13
really          13
difficulty      13
used            13
sexual          13
wanted          13
long            13
good            13
returned        12
started         12
run             12
looked          12
us              12
broken          11
often           11
four            11
none            11
hit             11
plenty          11
                ..
retainers        1
pissed           1
blonde           1
marijuana        1
surprised        1
hotdogs          1
grandpa          1
decision         1
cats             1
volunteered      1
declared         1
physically       1
arrangements     1
price            1
neglected        1
legs             1
mis              1
uncovered        1
switched         1
but              1
important        1
snobbish    

In [117]:
words_to_exclude = []

df['creature'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?:\S+\s)?\S*creature\S*(?:\s\S+)?', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['creature'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

the creature is                     3
the creature. he                    2
this creature was                   2
cartoon creature out                2
a creature that                     2
odd creatures dancing               1
sea creature with                   1
about creatures.                    1
flatfish creature, like             1
another creature had                1
snake-like creature called          1
like creature. another              1
mechanical/animal creature under    1
this creature to                    1
these creatures outside.            1
like creature with                  1
its creature-hood, if               1
pony-like creature. i               1
crazed creature that                1
foreign creature which              1
amoeba creatures attacking          1
little creature, a                  1
a creature i                        1
dragon creature. i                  1
these creatures are                 1
slug-like creatures in              1
some creatur

In [118]:
words_to_exclude = []

df['monster'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?:\S+\s)?\S*monster\S*(?:\s\S+)?', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['monster'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

the monster comes                5
the monster was                  5
the monster. i                   4
the monster of                   3
the monster and                  3
the monster could                2
a monster from                   2
the monster, but                 2
see monsters, three              2
the monster woman                2
a monster. i                     2
a monster or                     2
more monsters trying             1
like monster's den,              1
about monsters and               1
the monster is                   1
things, monsters maybe.          1
man monster. we                  1
giant monsters deep              1
a monster, and                   1
horrible monster scared          1
a monster largemouth             1
the monster face.                1
of monster or                    1
are monsters to                  1
these monsters working           1
horrible monster like            1
the monster disappeared          1
like monsters in    

In [119]:
words_to_exclude = []

df['animal'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?:\S+\s)?\S*animal\S*(?:\s\S+)?', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['animal'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

stuffed animals and              6
stuffed animals. i               4
the animal was                   4
stuffed animals.                 3
of animals on                    3
an animal or                     3
other animal that                3
the animals were                 3
the animals and                  3
other animals, including         3
the animals i                    3
his animal friends               2
see animals: a                   2
the animal. the                  2
the animals are                  2
an animal with                   2
the animal is                    2
small animals in                 2
stuffed animals to               2
more animals and                 2
stuffed animals in               2
the animals off                  2
an animal on                     2
the animals. i                   2
stuffed animal. we               2
the animals to                   2
other animals were               2
small animals were               2
stuffed animals were

In [120]:
words_to_exclude = []

df['place'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?:\S+\s)?\S*place\S*(?:\s\S+)?', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['place'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

a place to                 182
a place where              133
took place in               98
the place where             94
a place that                88
the place was               59
the place is                55
the place. i                32
the place and               32
this place is               32
no place to                 26
a place with                25
a place like                25
the place that              25
takes place in              24
took place at               24
a place for                 23
a place in                  23
the place i                 22
this place where            20
the place we                20
this place that             18
one place to                18
taking place in             17
this place was              16
a place i                   15
good place to               14
in place of                 14
my place. i                 12
the place to                12
                          ... 
the places, it               1
the plac

In [121]:
words_to_exclude = []

df['to'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?:\S+\s)?\S*to\S*(?:\s\S+)?', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['to'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

to the                    995
seemed to be              983
going to be               796
back to the               734
answers to questions      663
go to the                 624
up to the                 623
trying to get             604
went to the               568
seems to be               554
supposed to be            524
on top of                 498
i told him                495
seem to be                487
had to go                 463
i told her                458
the top of                389
down to the               346
going to have             317
going to go               316
over to the               299
out to the                296
get to the                287
go into the               282
next to the               273
have to go                273
went into the             259
going to get              248
want to go                248
back into the             245
                         ... 
him to something.           1
go to pet                   1
preparatio

In [125]:
words_to_exclude = []

df['do'] = df.apply(
    lambda x: [val.lower() for val in re.findall('(?:\S+\s)?\S*do\S*(?:\s\S+)?', x['content'])],
    axis=1)

i_list = pd.Series([element for list_ in df['do'] for element in list_])
i_list[~i_list.isin(words_to_exclude)].value_counts()

i don't know                  1848
i don't remember               919
i don't want                   587
to do with                     466
the door and                   442
i don't think                  347
i don't have                   265
the window and                 229
to do it                       206
i do not                       202
the door. i                    188
i don't like                   171
to do something                169
walking down the               157
i don't know.                  156
to do the                      138
to do. i                       129
walking down a                 128
the door to                    126
i don't recall                 123
to do this                     122
to do it.                      120
walk down the                  118
go down the                    118
to do a                        116
to do that                     111
i don't really                 108
i don't know,                  106
to do that.         

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

custom_stopwords = ['did', 'don', 'didn', 'said', 'thee', 'ye', 'came', 'got',
                    'like', 'going', 'come', 'feel', 'getting', 'just', 'want',
                    'wanted'] + list(stop_words.ENGLISH_STOP_WORDS)


vec = TfidfVectorizer(
    use_idf=True,
    min_df=0.1,
    norm='l1',
    stop_words=custom_stopwords)
matrix = vec.fit_transform(df['content'].str.replace("\d", ""))
vocab = vec.get_feature_names()
wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)
wordcount_df.head()

Unnamed: 0,away,big,car,door,dream,friend,good,home,house,kind,...,thought,time,told,trying,walk,walking,way,went,woman,words
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094141,0.032429
1,0.0,0.0,0.265413,0.0,0.0,0.0,0.0,0.0,0.074691,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.081383,0.0,0.0,0.028092
2,0.041944,0.0,0.0,0.0,0.038309,0.0,0.0,0.042342,0.074287,0.0,...,0.0,0.073788,0.0,0.0,0.0,0.0,0.080943,0.0,0.121664,0.01397
3,0.082889,0.0,0.0,0.08604,0.0,0.0,0.0,0.0,0.073401,0.087283,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013804
4,0.027386,0.0,0.028726,0.0,0.0,0.027996,0.0,0.027646,0.024252,0.028838,...,0.028771,0.096355,0.0,0.0,0.0,0.0,0.105698,0.024124,0.0,0.009121
