In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk

In [30]:
verbs = pd.read_csv('../data/agency_power.csv')

In [31]:
verbs.head()

Unnamed: 0,verb,agency,power
0,abandons,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuses,agency_pos,power_agent
4,accelerates,agency_pos,power_agent


In [4]:
verbs.groupby('agency').count()

Unnamed: 0_level_0,verb,power
agency,Unnamed: 1_level_1,Unnamed: 2_level_1
agency_equal,242,156
agency_neg,228,117
agency_pos,1676,1455


In [5]:
verbs.groupby('power').count()

Unnamed: 0_level_0,verb,agency
power,Unnamed: 1_level_1,Unnamed: 2_level_1
power_agent,1222,1216
power_equal,309,308
power_theme,206,204


In [6]:
verbs['agency'] = verbs['agency'].replace('agency_neg', -1)
verbs['agency'] = verbs['agency'].replace('agency_equal', 0)
verbs['agency'] = verbs['agency'].replace('agency_pos', 1)

verbs.groupby('agency').count()

Unnamed: 0_level_0,verb,power
agency,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,228,117
0.0,242,156
1.0,1676,1455


In [7]:
verbs['power'] = verbs['power'].replace('power_theme', -1)
verbs['power'] = verbs['power'].replace('power_equal', 0)
verbs['power'] = verbs['power'].replace('power_agent', 1)

verbs.groupby('power').count()

Unnamed: 0_level_0,verb,agency
power,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,206,204
0.0,309,308
1.0,1222,1216


# Lemmatize the verbs

In [29]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

verbs['verb'].apply(lambda x: porter_stemmer.stem(x))

0          abandon
1          abolish
2           absorb
3             abus
4          acceler
5           accept
6           access
7         accommod
8        accompani
9       accomplish
10          accord
11         account
12         accumul
13           accus
14             ach
15          achiev
16      acknowledg
17        acquaint
18          acquir
19          acquit
20           activ
21             act
22           adapt
23         address
24             add
25          adjoin
26          adjust
27        administ
28           admir
29           admit
           ...    
2125          wire
2126          wish
2127      withdraw
2128        wither
2129      withhold
2130           wit
2131         wobbl
2132        wonder
2133          work
2134         worri
2135        worsen
2136       worship
2137         wound
2138         wrack
2139          wrap
2140         wreck
2141        wrench
2142        wrestl
2143        wriggl
2144         wring
2145         write
2146        

In [35]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

verbs['verb'] = verbs['verb'].apply(lambda x: lemmatizer.lemmatize(x))

In [36]:
verbs.head()

Unnamed: 0,verb,agency,power
0,abandon,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuse,agency_pos,power_agent
4,accelerates,agency_pos,power_agent


# Part of Speech Tagging

In [37]:
test = pickle.load(open("../data/movies_small.p", 'rb'))

tiny_test = test

In [38]:
def get_verbs(x):

    verb_tags = ['VB', 'VBG', 'VBN', 'VBP', 'VBZ']
    text = nltk.word_tokenize(x)
    tags = nltk.pos_tag(text)
    
    verbs = [word for (word, tag) in tags if tag in verb_tags]
    
    return verbs

tiny_test['verbs'] = tiny_test['words'].apply(get_verbs)

tiny_test['verb_count'] = tiny_test['verbs'].apply(lambda x: len(x))

In [39]:
tiny_test.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre,verbs,verb_count
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy,[make],1
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy,"[think, start]",2
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy,"[hacking, gagging]",2
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy,[try],1
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy,"[ask, cute]",2


In [40]:
#source: https://gist.github.com/jlln/338b4b0b55bd6984f883

def split_data_frame_list(df, 
                       target_column,
                      output_type=float):
    ''' 
    Accepts a column with multiple types and splits list variables to several rows.

    df: dataframe to split
    target_column: the column containing the values to split
    output_type: type of all outputs
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    row_accumulator = []

    def split_list_to_rows(row):
        split_row = row[target_column]
        if isinstance(split_row, list):
          for s in split_row:
              new_row = row.to_dict()
              new_row[target_column] = output_type(s)
              row_accumulator.append(new_row)
        else:
          new_row = row.to_dict()
          new_row[target_column] = output_type(split_row)
          row_accumulator.append(new_row)
  
    df.apply(split_list_to_rows, axis=1)
    new_df = pd.DataFrame(row_accumulator)
  
    return new_df

In [41]:
df = split_data_frame_list(tiny_test, 'verbs', output_type = str)
df.head()

Unnamed: 0,char_id_from,char_id_to,gender_from,gender_to,genre,line_id,movie_id,movie_year,verb_count,verbs,words
0,u0,u2,f,m,comedy,L194,m0,1999,1,make,we make quick roxanne korrine andrew barrett i...
1,u0,u2,f,m,comedy,L195,m0,1999,2,think,well i think we start pronunciation okay you
2,u0,u2,f,m,comedy,L195,m0,1999,2,start,well i think we start pronunciation okay you
3,u0,u2,f,m,comedy,L196,m0,1999,2,hacking,hacking gagging spit part please
4,u0,u2,f,m,comedy,L196,m0,1999,2,gagging,hacking gagging spit part please


In [42]:
final_df = df.merge(verbs, left_on = 'verbs', right_on = 'verb')

In [43]:
final_df.head()

Unnamed: 0,char_id_from,char_id_to,gender_from,gender_to,genre,line_id,movie_id,movie_year,verb_count,verbs,words,verb,agency,power
0,u0,u2,f,m,comedy,L194,m0,1999,1,make,we make quick roxanne korrine andrew barrett i...,make,agency_pos,power_agent
1,u0,u5,f,f,comedy,L907,m0,1999,2,make,i want let you make your mind him,make,agency_pos,power_agent
2,u0,u5,f,f,comedy,L908,m0,1999,7,make,you you really think i could make my decision ...,make,agency_pos,power_agent
3,u5,u6,f,f,comedy,L754,m0,1999,2,make,you look wrong perspective we make statement,make,agency_pos,power_agent
4,u2,u7,m,m,comedy,L215,m0,1999,2,make,make you think he it,make,agency_pos,power_agent
