In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk

%matplotlib inline

In [2]:
verbs = pd.read_csv('../data/agency_power.csv')

In [3]:
verbs.head()

Unnamed: 0,verb,agency,power
0,abandons,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuses,agency_pos,power_agent
4,accelerates,agency_pos,power_agent


In [4]:
verbs.groupby('agency').count()

Unnamed: 0_level_0,verb,power
agency,Unnamed: 1_level_1,Unnamed: 2_level_1
agency_equal,242,156
agency_neg,228,117
agency_pos,1676,1455


In [5]:
verbs.groupby('power').count()

Unnamed: 0_level_0,verb,agency
power,Unnamed: 1_level_1,Unnamed: 2_level_1
power_agent,1222,1216
power_equal,309,308
power_theme,206,204


# Lemmatize the verbs

In [6]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

verbs['verb'] = verbs['verb'].apply(lambda x: lemmatizer.lemmatize(x))

In [7]:
verbs.head()

Unnamed: 0,verb,agency,power
0,abandon,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuse,agency_pos,power_agent
4,accelerates,agency_pos,power_agent


# Part of Speech Tagging

In [8]:
movies = pickle.load(open("../data/movies.p", 'rb'))

In [9]:
movies['line_id'].unique().shape #number of unique lines

(304354,)

In [10]:
def get_verbs(x):

    verb_tags = ['VB', 'VBG', 'VBN', 'VBP', 'VBZ']
    text = nltk.word_tokenize(x)
    tags = nltk.pos_tag(text)
    
    verbs = [word for (word, tag) in tags if tag in verb_tags]
    
    return verbs

movies['verbs'] = movies['words'].apply(get_verbs)

movies['verb_count'] = movies['verbs'].apply(lambda x: len(x))

In [11]:
#source: https://gist.github.com/jlln/338b4b0b55bd6984f883

def split_data_frame_list(df, 
                       target_column,
                       output_type=float):
    ''' 
    Accepts a column with multiple types and splits list variables to several rows.

    df: dataframe to split
    target_column: the column containing the values to split
    output_type: type of all outputs
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    row_accumulator = []

    def split_list_to_rows(row):
        split_row = row[target_column]
        if isinstance(split_row, list):
          for s in split_row:
              new_row = row.to_dict()
              new_row[target_column] = output_type(s)
              row_accumulator.append(new_row)
        else:
          new_row = row.to_dict()
          new_row[target_column] = output_type(split_row)
          row_accumulator.append(new_row)
  
    df.apply(split_list_to_rows, axis=1)
    new_df = pd.DataFrame(row_accumulator)
  
    return new_df

In [12]:
movies_long = split_data_frame_list(movies, 'verbs', output_type = str)

In [17]:
movies.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre,verbs,verb_count
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy,[make],1
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy,"[think, start]",2
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy,"[hacking, gagging]",2
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy,[try],1
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy,"[ask, cute]",2


# Add back the lines with no verbs
* Give them a verb count of 0

In [40]:
movies = pickle.load(open("../data/movies.p", 'rb'))

cols_to_use = ['verbs', 'verb_count', 'line_id']
movies_verbs = movies.merge(movies_long[cols_to_use], on = 'line_id', how='left', indicator = True)

In [41]:
movies_verbs[movies_verbs['_merge'] != 'both'].head(2)

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre,verbs,verb_count,_merge
10,m,m0,f,u0,u2,L201,cameron,1999,comedy,,,left_only
14,m,m0,f,u0,u2,L204,,1999,comedy,,,left_only


In [42]:
movies_verbs['verb_count'].fillna(value = 0, inplace = True)

In [43]:
movies_verbs[movies_verbs['_merge'] != 'both'].head(2)

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre,verbs,verb_count,_merge
10,m,m0,f,u0,u2,L201,cameron,1999,comedy,,0.0,left_only
14,m,m0,f,u0,u2,L204,,1999,comedy,,0.0,left_only


In [44]:
movies_verbs = movies_verbs.merge(verbs, left_on = 'verbs', right_on = 'verb', how = 'left')

In [45]:
agency_dummies = pd.get_dummies(movies_verbs['agency'])
power_dummies = pd.get_dummies(movies_verbs['power'])

In [46]:
movies_verbs = pd.concat([movies_verbs, power_dummies], axis = 1)
movies_verbs = pd.concat([movies_verbs, agency_dummies], axis = 1)

# Sum the verb counts by line

In [47]:
counts = movies_verbs.groupby('line_id').agg({'agency_neg': sum, 'agency_pos': sum, 'power_agent': sum, 'power_theme': sum})
counts = counts.reset_index()

drop_list = ['agency_neg', 'agency_pos', 'power_agent', 'power_theme', 'agency_equal', 
             'power_equal', 'verbs', 'verb', 'agency', 'power', '_merge']

movies_verbs.drop(drop_list, axis = 1, inplace = True)

In [48]:
movies_verbs = counts.merge(movies_verbs, on = 'line_id', how = 'left')
movies_verbs.drop_duplicates(inplace = True)

In [49]:
movies_verbs.shape #correct number of unique lines

(304354, 14)

In [50]:
movies_verbs['agency_pos_prop'] = movies_verbs['agency_pos'] / movies_verbs['verb_count']
movies_verbs['power_pos_prop'] = movies_verbs['power_agent'] / movies_verbs['verb_count']

movies_verbs['agency_neg_prop'] = movies_verbs['agency_neg'] / movies_verbs['verb_count']
movies_verbs['power_neg_prop'] = movies_verbs['power_theme'] / movies_verbs['verb_count']

In [51]:
movies_verbs.head()

Unnamed: 0,line_id,agency_neg,agency_pos,power_agent,power_theme,gender_to,movie_id,gender_from,char_id_from,char_id_to,words,movie_year,genre,verb_count,agency_pos_prop,power_pos_prop,agency_neg_prop,power_neg_prop
0,L1000,0,2,1,0,m,m0,f,u5,u11,oh christ tell me you change your mind i alrea...,1999,comedy,3.0,0.666667,0.333333,0.0,0.0
3,L10000,1,1,1,0,f,m232,?,u3522,u3525,oh chamber run uh huh good well hey you guy kn...,1989,action,3.0,0.333333,0.333333,0.333333,0.0
6,L100000,0,0,0,0,f,m278,m,u4166,u4168,,1993,drama,0.0,,,,
7,L100001,1,0,0,0,f,m278,m,u4166,u4168,you go see mr koehler first place,1993,drama,3.0,0.0,0.0,0.333333,0.0
11,L100002,0,1,0,1,f,m278,m,u4166,u4168,he call me,1993,drama,1.0,1.0,0.0,0.0,1.0


# Agency/power by gender

In [29]:
movies_verbs.groupby('gender_from')['agency_pos_prop'].mean()

gender_from
?    0.369426
f    0.372348
m    0.379147
Name: agency_pos_prop, dtype: float64

In [30]:
movies_verbs.groupby('gender_from')['power_pos_prop'].mean()

gender_from
?    0.267010
f    0.252478
m    0.272315
Name: power_pos_prop, dtype: float64

In [31]:
movies_verbs.groupby('gender_from')['agency_neg_prop'].mean()

gender_from
?    0.143136
f    0.162057
m    0.151938
Name: agency_neg_prop, dtype: float64

In [32]:
movies_verbs.groupby('gender_from')['power_neg_prop'].mean()

gender_from
?    0.068161
f    0.075582
m    0.068527
Name: power_neg_prop, dtype: float64

# Check whether the difference is bigger for some movies

### Calculate differences in proportions by gender and movie

In [33]:
by_movie = pd.DataFrame(movies_verbs.groupby(['movie_id', 'gender_from'])['power_pos_prop'].mean())
by_movie = pd.DataFrame({'mean': movies_verbs.groupby(["movie_id", "gender_from"])['power_pos_prop'].mean()}).reset_index()
by_movie = by_movie[by_movie['gender_from'] != '?']
by_movie = by_movie.pivot(index='movie_id', columns='gender_from', values='mean').reset_index()
by_movie['diff'] = by_movie['m'] - by_movie['f']
by_movie = by_movie.sort_values('diff', ascending=False)

In [34]:
by_movie.shape

(614, 4)

In [35]:
by_movie[by_movie['diff'] > 0.05].shape

(151, 4)

About 25% of movies have a gender difference greater than 5%.

# Analysis by gender pairs
* Male-Male
* Male-Female
* Female-Male
* Female-Female

In [36]:
by_gender = movies_verbs[(movies_verbs['gender_to'] != '?') & (movies_verbs['gender_from'] != '?')]
      
# Create gender pair column
conditions = [(by_gender.gender_from == 'm') & (by_gender.gender_to == 'm'),
              (by_gender.gender_from == 'm') & (by_gender.gender_to == 'f'),
              (by_gender.gender_from == 'f') & (by_gender.gender_to == 'm'),
              (by_gender.gender_from == 'f') & (by_gender.gender_to == 'f')]

choices = ['mm', 'mf', 'fm', 'ff']
   
by_gender['gender_pair'] = np.select(conditions, choices)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [37]:
by_gender.groupby('gender_pair')['power_pos_prop'].mean()

gender_pair
ff    0.241882
fm    0.254570
mf    0.259897
mm    0.278771
Name: power_pos_prop, dtype: float64