In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk

%matplotlib inline

In [2]:
verbs = pd.read_csv('../data/agency_power.csv')

In [3]:
verbs.head()

Unnamed: 0,verb,agency,power
0,abandons,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuses,agency_pos,power_agent
4,accelerates,agency_pos,power_agent


In [4]:
verbs.groupby('agency').count()

Unnamed: 0_level_0,verb,power
agency,Unnamed: 1_level_1,Unnamed: 2_level_1
agency_equal,242,156
agency_neg,228,117
agency_pos,1676,1455


In [5]:
verbs.groupby('power').count()

Unnamed: 0_level_0,verb,agency
power,Unnamed: 1_level_1,Unnamed: 2_level_1
power_agent,1222,1216
power_equal,309,308
power_theme,206,204


# Lemmatize the verbs

In [6]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

verbs['verb'] = verbs['verb'].apply(lambda x: lemmatizer.lemmatize(x))

In [7]:
verbs.head()

Unnamed: 0,verb,agency,power
0,abandon,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuse,agency_pos,power_agent
4,accelerates,agency_pos,power_agent


# Part of Speech Tagging

In [8]:
movies = pickle.load(open("../data/movies.p", 'rb'))

In [9]:
def get_verbs(x):

    verb_tags = ['VB', 'VBG', 'VBN', 'VBP', 'VBZ']
    text = nltk.word_tokenize(x)
    tags = nltk.pos_tag(text)
    
    verbs = [word for (word, tag) in tags if tag in verb_tags]
    
    return verbs

movies['verbs'] = movies['words'].apply(get_verbs)

movies['verb_count'] = movies['verbs'].apply(lambda x: len(x))

In [10]:
movies.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre,verbs,verb_count
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy,[make],1
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy,"[think, start]",2
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy,"[hacking, gagging]",2
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy,[try],1
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy,"[ask, cute]",2


In [11]:
#source: https://gist.github.com/jlln/338b4b0b55bd6984f883

def split_data_frame_list(df, 
                       target_column,
                       output_type=float):
    ''' 
    Accepts a column with multiple types and splits list variables to several rows.

    df: dataframe to split
    target_column: the column containing the values to split
    output_type: type of all outputs
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    row_accumulator = []

    def split_list_to_rows(row):
        split_row = row[target_column]
        if isinstance(split_row, list):
          for s in split_row:
              new_row = row.to_dict()
              new_row[target_column] = output_type(s)
              row_accumulator.append(new_row)
        else:
          new_row = row.to_dict()
          new_row[target_column] = output_type(split_row)
          row_accumulator.append(new_row)
  
    df.apply(split_list_to_rows, axis=1)
    new_df = pd.DataFrame(row_accumulator)
  
    return new_df

In [12]:
movies_long = split_data_frame_list(movies, 'verbs', output_type = str)
movies_long.head(2)

Unnamed: 0,char_id_from,char_id_to,gender_from,gender_to,genre,line_id,movie_id,movie_year,verb_count,verbs,words
0,u0,u2,f,m,comedy,L194,m0,1999,1,make,we make quick roxanne korrine andrew barrett i...
1,u0,u2,f,m,comedy,L195,m0,1999,2,think,well i think we start pronunciation okay you


In [42]:
movies_verbs = movies_long.merge(verbs, left_on = 'verbs', right_on = 'verb')

In [43]:
movies_verbs.head(2)

Unnamed: 0,char_id_from,char_id_to,gender_from,gender_to,genre,line_id,movie_id,movie_year,verb_count,verbs,words,verb,agency,power
0,u0,u2,f,m,comedy,L194,m0,1999,1,make,we make quick roxanne korrine andrew barrett i...,make,agency_pos,power_agent
1,u0,u5,f,f,comedy,L907,m0,1999,2,make,i want let you make your mind him,make,agency_pos,power_agent


In [44]:
agency_dummies = pd.get_dummies(movies_verbs['agency'])
power_dummies = pd.get_dummies(movies_verbs['power'])

In [45]:
movies_verbs = pd.concat([movies_verbs, agency_dummies], axis = 1)
movies_verbs = pd.concat([movies_verbs, power_dummies], axis = 1)

In [46]:
movies_verbs.head(2)

Unnamed: 0,char_id_from,char_id_to,gender_from,gender_to,genre,line_id,movie_id,movie_year,verb_count,verbs,words,verb,agency,power,agency_equal,agency_neg,agency_pos,power_agent,power_equal,power_theme
0,u0,u2,f,m,comedy,L194,m0,1999,1,make,we make quick roxanne korrine andrew barrett i...,make,agency_pos,power_agent,0,0,1,1,0,0
1,u0,u5,f,f,comedy,L907,m0,1999,2,make,i want let you make your mind him,make,agency_pos,power_agent,0,0,1,1,0,0


In [47]:
movies_verbs['agency_pos_prop'] = movies_verbs['agency_pos'] / movies_verbs['verb_count']
movies_verbs['power_pos_prop'] = movies_verbs['power_agent'] / movies_verbs['verb_count']

movies_verbs['agency_neg_prop'] = movies_verbs['agency_neg'] / movies_verbs['verb_count']
movies_verbs['power_neg_prop'] = movies_verbs['power_theme'] / movies_verbs['verb_count']

In [54]:
drop_list = ['agency_equal', 'agency_neg', 'agency_pos', 'power_agent', 'power_equal', 'power_theme',
             'verbs', 'verb', 'agency', 'power']

movies_verbs.drop(drop_list, axis = 1, inplace = True)
movies_verbs.head(2)



ValueError: labels ['agency_equal' 'agency_neg' 'agency_pos' 'power_agent' 'power_equal'
 'power_theme' 'verbs' 'verb' 'agency' 'power'] not contained in axis

# Agency/power by gender

In [20]:
movies_verbs.groupby('gender_from')['agency_pos_prop'].mean()

gender_from
?    0.256687
f    0.239076
m    0.242197
Name: agency_pos_prop, dtype: float64

In [21]:
movies_verbs.groupby('gender_from')['power_pos_prop'].mean()

gender_from
?    0.185526
f    0.162110
m    0.173953
Name: power_pos_prop, dtype: float64

In [22]:
movies_verbs.groupby('gender_from')['agency_neg_prop'].mean()

gender_from
?    0.099455
f    0.104053
m    0.097057
Name: agency_neg_prop, dtype: float64

In [23]:
movies_verbs.groupby('gender_from')['power_neg_prop'].mean()

gender_from
?    0.047360
f    0.048530
m    0.043774
Name: power_neg_prop, dtype: float64

# Check whether the difference is bigger for some movies

### Calculate differences in proportions by gender and movie

In [79]:
by_movie = pd.DataFrame(movies_verbs.groupby(['movie_id', 'gender_from'])['power_pos_prop'].mean())
by_movie = pd.DataFrame({'mean': movies_verbs.groupby(["movie_id", "gender_from"])['power_pos_prop'].mean()}).reset_index()
by_movie = by_movie[by_movie['gender_from'] != '?']
by_movie = by_movie.pivot(index='movie_id', columns='gender_from', values='mean').reset_index()
by_movie['diff'] = by_movie['m'] - by_movie['f']
by_movie = by_movie.sort_values('diff', ascending=False)

In [84]:
by_movie.shape

(613, 4)

In [85]:
by_movie[by_movie['diff'] > 0.05].shape

(99, 4)

About 16% of movies have a gender difference greater than 5%.

# Analysis by gender pairs
* Male-Male
* Male-Female
* Female-Male
* Female-Female

In [75]:
by_gender = movies_verbs[(movies_verbs['gender_to'] != '?') & (movies_verbs['gender_from'] != '?')]
      
# Create gender pair column
conditions = [(by_gender.gender_from == 'm') & (by_gender.gender_to == 'm'),
              (by_gender.gender_from == 'm') & (by_gender.gender_to == 'f'),
              (by_gender.gender_from == 'f') & (by_gender.gender_to == 'm'),
              (by_gender.gender_from == 'f') & (by_gender.gender_to == 'f')]

choices = ['mm', 'mf', 'fm', 'ff']
   
by_gender['gender_pair'] = np.select(conditions, choices)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [77]:
by_gender.groupby('gender_pair')['power_pos_prop'].mean()

gender_pair
ff    0.154822
fm    0.162483
mf    0.166255
mm    0.175918
Name: power_pos_prop, dtype: float64