In [67]:
import pandas as pd
import numpy as np
import pickle
import nltk

%matplotlib inline

In [68]:
verbs = pd.read_csv('../data/agency_power.csv')

In [69]:
verbs.head()

Unnamed: 0,verb,agency,power
0,abandons,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuses,agency_pos,power_agent
4,accelerates,agency_pos,power_agent


In [70]:
verbs.groupby('agency').count()

Unnamed: 0_level_0,verb,power
agency,Unnamed: 1_level_1,Unnamed: 2_level_1
agency_equal,242,156
agency_neg,228,117
agency_pos,1676,1455


In [71]:
verbs.groupby('power').count()

Unnamed: 0_level_0,verb,agency
power,Unnamed: 1_level_1,Unnamed: 2_level_1
power_agent,1222,1216
power_equal,309,308
power_theme,206,204


# Lemmatize the verbs

In [72]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

verbs['verb'] = verbs['verb'].apply(lambda x: lemmatizer.lemmatize(x))

In [73]:
verbs.head()

Unnamed: 0,verb,agency,power
0,abandon,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuse,agency_pos,power_agent
4,accelerates,agency_pos,power_agent


# Part of Speech Tagging

In [74]:
movies = pickle.load(open("../data/movies_lines_holdout.p", 'rb'))

In [75]:
movies['line_id'].unique().shape #number of unique lines

(101960,)

In [76]:
def get_verbs(x):

    verb_tags = ['VB', 'VBG', 'VBN', 'VBP', 'VBZ']
    text = nltk.word_tokenize(x)
    tags = nltk.pos_tag(text)
    
    verbs = [word for (word, tag) in tags if tag in verb_tags]
    
    return verbs

movies['verbs'] = movies['words'].apply(get_verbs)

movies['verb_count'] = movies['verbs'].apply(lambda x: len(x))

In [22]:
#source: https://gist.github.com/jlln/338b4b0b55bd6984f883

def split_data_frame_list(df, 
                       target_column,
                       output_type=float):
    ''' 
    Accepts a column with multiple types and splits list variables to several rows.

    df: dataframe to split
    target_column: the column containing the values to split
    output_type: type of all outputs
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    row_accumulator = []

    def split_list_to_rows(row):
        split_row = row[target_column]
        if isinstance(split_row, list):
          for s in split_row:
              new_row = row.to_dict()
              new_row[target_column] = output_type(s)
              row_accumulator.append(new_row)
        else:
          new_row = row.to_dict()
          new_row[target_column] = output_type(split_row)
          row_accumulator.append(new_row)
  
    df.apply(split_list_to_rows, axis=1)
    new_df = pd.DataFrame(row_accumulator)
  
    return new_df

In [23]:
movies_long = split_data_frame_list(movies, 'verbs', output_type = str)

In [24]:
movies.head()

Unnamed: 0,movie_id,gender_to,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre,verbs,verb_count
0,m49,m,f,u761,u765,L163186,thanks miss,1999,comedy,[miss],1
1,m49,m,f,u761,u765,L163187,you kind i amanda,1999,comedy,[],0
2,m49,m,f,u761,u765,L163188,right well thanks drink stuff amanda reason me...,1999,comedy,"[drink, stick]",2
3,m49,m,f,u761,u765,L163189,glum hawk night still young fill plenty compen...,1999,comedy,[],0
4,m49,m,f,u761,u765,L163190,huh,1999,comedy,[],0


# Add back the lines with no verbs
* Give them a verb count of 0

In [35]:
movies = pickle.load(open("../data/movies_lines_holdout.p", 'rb'))

cols_to_use = ['verbs', 'verb_count', 'line_id']
movies_verbs = movies.merge(movies_long[cols_to_use], on = 'line_id', how='left', indicator = True)

In [36]:
movies_verbs[movies_verbs['_merge'] != 'both'].head(2)

Unnamed: 0,movie_id,gender_to,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre,verbs,verb_count,_merge
1,m49,m,f,u761,u765,L163187,you kind i amanda,1999,comedy,,,left_only
4,m49,m,f,u761,u765,L163189,glum hawk night still young fill plenty compen...,1999,comedy,,,left_only


In [37]:
movies_verbs['verb_count'].fillna(value = 0, inplace = True)

In [38]:
movies_verbs[movies_verbs['_merge'] != 'both'].head(2)

Unnamed: 0,movie_id,gender_to,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre,verbs,verb_count,_merge
1,m49,m,f,u761,u765,L163187,you kind i amanda,1999,comedy,,0.0,left_only
4,m49,m,f,u761,u765,L163189,glum hawk night still young fill plenty compen...,1999,comedy,,0.0,left_only


In [39]:
movies_verbs = movies_verbs.merge(verbs, left_on = 'verbs', right_on = 'verb', how = 'left')

In [40]:
agency_dummies = pd.get_dummies(movies_verbs['agency'])
power_dummies = pd.get_dummies(movies_verbs['power'])

In [41]:
movies_verbs = pd.concat([movies_verbs, power_dummies], axis = 1)
movies_verbs = pd.concat([movies_verbs, agency_dummies], axis = 1)

# Sum the verb counts by line

In [42]:
counts = movies_verbs.groupby('line_id').agg({'agency_neg': sum, 'agency_pos': sum, 'power_agent': sum, 'power_theme': sum})
counts = counts.reset_index()

drop_list = ['agency_neg', 'agency_pos', 'power_agent', 'power_theme', 'agency_equal', 
             'power_equal', 'verbs', 'verb', 'agency', 'power', '_merge']

movies_verbs.drop(drop_list, axis = 1, inplace = True)

In [43]:
movies_verbs = counts.merge(movies_verbs, on = 'line_id', how = 'left')
movies_verbs.drop_duplicates(inplace = True)

In [44]:
movies_verbs.shape #correct number of unique lines

(101960, 14)

In [45]:
movies_verbs['agency_pos_prop'] = movies_verbs['agency_pos'] / movies_verbs['verb_count']
movies_verbs['power_pos_prop'] = movies_verbs['power_agent'] / movies_verbs['verb_count']

movies_verbs['agency_neg_prop'] = movies_verbs['agency_neg'] / movies_verbs['verb_count']
movies_verbs['power_neg_prop'] = movies_verbs['power_theme'] / movies_verbs['verb_count']

In [46]:
movies_verbs.head()

Unnamed: 0,line_id,agency_neg,agency_pos,power_agent,power_theme,movie_id,gender_to,gender_from,char_id_from,char_id_to,words,movie_year,genre,verb_count,agency_pos_prop,power_pos_prop,agency_neg_prop,power_neg_prop
0,L1000,0,2,1,0,m0,m,f,u5,u11,oh christ tell me you change your mind i alrea...,1999,comedy,3.0,0.666667,0.333333,0.0,0.0
3,L100058,0,0,0,0,m279,m,f,u4183,u4191,you feel good franny,1995,drama,1.0,0.0,0.0,0.0,0.0
4,L100059,0,1,0,0,m279,m,f,u4183,u4191,yes i fine it heat i think,1995,drama,3.0,0.333333,0.0,0.0,0.0
7,L100062,0,0,0,0,m279,m,f,u4183,u4191,it chicago station i find it day,1995,drama,2.0,0.0,0.0,0.0,0.0
9,L100063,0,0,0,0,m279,m,f,u4183,u4191,kinda pretty uh jazz kinda singing,1995,drama,1.0,0.0,0.0,0.0,0.0


# Agency/power by gender

In [47]:
movies_verbs.groupby('gender_from')['agency_pos_prop'].mean()

gender_from
?    0.370080
f    0.374493
m    0.380031
Name: agency_pos_prop, dtype: float64

In [48]:
movies_verbs.groupby('gender_from')['power_pos_prop'].mean()

gender_from
?    0.267942
f    0.251378
m    0.271910
Name: power_pos_prop, dtype: float64

In [49]:
movies_verbs.groupby('gender_from')['agency_neg_prop'].mean()

gender_from
?    0.147343
f    0.164047
m    0.154008
Name: agency_neg_prop, dtype: float64

In [51]:
movies_verbs.groupby('gender_from')['power_neg_prop'].mean()

gender_from
?    0.070530
f    0.077743
m    0.067734
Name: power_neg_prop, dtype: float64

# Check whether the difference is bigger for some movies

### Calculate differences in proportions by gender and movie

In [52]:
by_movie = pd.DataFrame(movies_verbs.groupby(['movie_id', 'gender_from'])['power_pos_prop'].mean())
by_movie = pd.DataFrame({'mean': movies_verbs.groupby(["movie_id", "gender_from"])['power_pos_prop'].mean()}).reset_index()
by_movie = by_movie[by_movie['gender_from'] != '?']
by_movie = by_movie.pivot(index='movie_id', columns='gender_from', values='mean').reset_index()
by_movie['diff'] = by_movie['m'] - by_movie['f']
by_movie = by_movie.sort_values('diff', ascending=False)

In [53]:
by_movie.shape

(204, 4)

In [54]:
by_movie[by_movie['diff'] > 0.05].shape

(51, 4)

About 25% of movies have a gender difference greater than 5%.

# Analysis by gender pairs
* Male-Male
* Male-Female
* Female-Male
* Female-Female

In [55]:
by_gender = movies_verbs[(movies_verbs['gender_to'] != '?') & (movies_verbs['gender_from'] != '?')]
      
# Create gender pair column
conditions = [(by_gender.gender_from == 'm') & (by_gender.gender_to == 'm'),
              (by_gender.gender_from == 'm') & (by_gender.gender_to == 'f'),
              (by_gender.gender_from == 'f') & (by_gender.gender_to == 'm'),
              (by_gender.gender_from == 'f') & (by_gender.gender_to == 'f')]

choices = ['mm', 'mf', 'fm', 'ff']
   
by_gender['gender_pair'] = np.select(conditions, choices)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [64]:
by_gender.groupby('gender_pair')['power_pos_prop'].mean()

gender_pair
ff    0.238997
fm    0.253022
mf    0.257855
mm    0.279815
Name: power_pos_prop, dtype: float64

# Analysis by genre

In [60]:
pd.DataFrame(movies_verbs.groupby(['gender_from', 'genre'])['power_pos_prop'].mean()).reset_index()

Unnamed: 0,gender_from,genre,power_pos_prop
0,?,action,0.286699
1,?,adventure,0.251829
2,?,animation,0.259222
3,?,biography,0.281446
4,?,comedy,0.242982
5,?,crime,0.268476
6,?,drama,0.264861
7,?,fantasy,0.276327
8,?,film-noir,0.420635
9,?,horror,0.264985
