In [3]:
import re
import pandas as pd
import spacy
from spacy.symbols import PROPN, PERSON
from __future__ import unicode_literals, print_function

In [4]:
nlp = spacy.load('en_core_web_sm')

Read training, validation and test data.

In [5]:
# Read training, validation and test data.
train = pd.read_csv('..\gap-coreference\gap-development.tsv', sep='\t')
valid = pd.read_csv('..\gap-coreference\gap-validation.tsv', sep='\t')
test = pd.read_csv('..\gap-coreference\gap-test.tsv', sep='\t')

In [17]:
def sample_count(df):
    ''' 
    Get number of samples in each set. 
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: Numer of rows in dataframe 
    :rtype: int
    '''
    return df.shape[0]


def feature_names(df):
    ''' 
    Get names of features. 
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: List of column names in a Pandas dataframe
    :rtype: numpy.ndarray
    '''
    return df.columns.values


def column_value_counts(df, target_column, new_column):
    '''
    Get value counts of each categorical variable. Store this data in 
    a dataframe. Also add a column with relative percentage of each 
    categorical variable.
    
    :param df: A Pandas dataframe
    :param target_column: Name of the column in the original dataframe (string)
    :param new_column: Name of the new column where the frequency counts are stored 
    :type df: pandas.core.frame.DataFrame
    :type target_column: str
    :type new_column: str
    :return: A Pandas dataframe containing the frequency counts
    :rtype: pandas.core.frame.DataFrame
    '''
    df_value_counts = df[target_column].value_counts()
    df = pd.DataFrame(df_value_counts)
    df.columns = [new_column]
    df[new_column+'_%'] = 100*df[new_column] / df[new_column].sum()
    return df


def create_labels(df):
    '''
    Create labels of which candidate reference is the correct one, if any. 
    Store the labels in a new column in the dataframe. 
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: Returns the updated Pandas dataframe
    :rtype: pandas.core.frame.DataFrame
    '''
    df['labels'] = 'Neither'
    df.loc[df['A-coref'] == True, 'labels'] = 'A'
    df.loc[df['B-coref'] == True, 'labels'] = 'B'
    return df


def pronouns(df):
    ''' 
    Get a list of all unique pronouns in the dataset. 
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: List of dataframe indexes 
    :rtype: numpy.ndarray
    '''
    return df.index.tolist()


def count_all_pronoun(text):
    ''' 
    Count all pronouns in passed text.
    
    :param text: A string
    :type text: str
    :return: Number of pronouns in the text string
    :rtype: int
    '''
    return len(re.findall(r"\b(He|Her|His|She|he|her|hers|him|his|she)\b", text))


def store_pronoun_count(df):
    ''' 
    Count all pronouns in each row and store it in a new column named "Pronoun-count". 
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: Returns nothing
    :rtype: None
    '''
    df['Pronoun-count'] = df['Text'].apply(count_all_pronoun)
    return
    
def text_len(df):
    ''' 
    Count text length in each row and stroe it in a new column named "Text-len". 
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: Returns nothing
    :rtype: None
    '''
    df['Text-len'] = df.Text.str.len()
    return

def column_summary(df, target_column, new_column):
    ''' 
    Compute column summary and return as a dataframe. 
    
    :param df: A Pandas dataframe
    :param target_column: Name of the column in the original dataframe (string)
    :param new_column: Name of the new column where the frequency counts are stored 
    :type df: pandas.core.frame.DataFrame
    :type target_column: str
    :type new_column: str
    :return: A Pandas dataframe containing the summary
    :rtype: pandas.core.frame.DataFrame
    '''
    temp = pd.DataFrame(df[target_column].describe())
    temp.columns = [new_column]
    temp = temp.round(2)
    return temp

def word_count(df):
    ''' 
    Count and store words in "Text" for each row of the dataframe. 
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: Returns nothing
    :rtype: None
    '''
    df['Word-count'] = df.Text.str.split().str.len()
    return
    
def person_entity_count(text):
    ''' 
    Extract person names and count the number of person name occurences. 
    Not completely accurate. 
    
    :param text: Text sample in the dataset
    :type text: str
    :return: Rturns the number of person names in the text
    :rtype: int
    '''
    doc = nlp(text)
    p_count = 0
    #person = []
    for ent in doc.ents:
        if ent.label == PERSON:
            p_count += 1
            #person.append(ent.text)
    return p_count

def pronoun_capital(pronoun_text):
    '''
    Find if the passed pronoun starts with a capital letter.
    
    :param pronoun_text: The target pronoun
    :type pronoun_text: str
    :return: Returns true or false
    :rtype: int
    '''
    z = re.match('(H\w+)', pronoun_text)
    if z is None:
        return 0
    else:
        return 1

def pronoun_first_word(df):
    '''
    Find if the target pronoun is the frst word in the sentence, and store it in a 
    column in the dataframe.
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: Returns nothing
    :rtype: None
    '''
    df['Pronoun-first-word'] = df['Text'].apply(pronoun_capital)
    return



In [4]:
print(" Training set: {} \n Validation set: {} \n Test Set: {}\n".
      format(sample_count(train), sample_count(valid), sample_count(test)))

print("Feature names: {}".format(feature_names(train)))

 Training set: 2000 
 Validation set: 454 
 Test Set: 2000

Feature names: ['ID' 'Text' 'Pronoun' 'Pronoun-offset' 'A' 'A-offset' 'A-coref' 'B'
 'B-offset' 'B-coref' 'URL']


In [330]:
train.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,...,Pronoun-count,Text-len,Word-count,Person-name-count,Pronoun-first-word,A-pronoun-offset-diff,B-pronoun-offset-diff,Pronoun-sentence-number,A-sentence-number,B-sentence-number
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,True,Pauline,207,False,...,6,426,74,11,0,-83,-67,3,2,2
1,development-2,"He grew up in Evanston, Illinois the second ol...",His,284,MacKenzie,228,True,Bernard Leach,251,False,...,4,410,65,7,1,-56,-33,3,2,2
2,development-3,"He had been reelected to Congress, but resigne...",his,265,Angeloz,173,False,De la Sota,246,True,...,2,536,95,2,1,-92,-19,3,2,2
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,False,Henry Rosenthal,336,True,...,1,401,69,4,0,-147,15,2,1,2
4,development-5,Her Santa Fe Opera debut in 2005 was as Nuria ...,She,437,Kitty Oppenheimer,219,False,Rivera,294,True,...,4,660,112,7,1,-218,-143,3,2,2


In [10]:
# Get frequency distribution of pronouns in each set
df_train = column_value_counts(train, 'Pronoun', 'Train')
df_test = column_value_counts(test, 'Pronoun', 'Test')
df_valid = column_value_counts(valid, 'Pronoun', 'Validation')

pronoun_count = pd.concat([df_train, df_valid, df_test], axis=1) # Merge dataframes by index
pronoun_count = pronoun_count.fillna(0) # Replace Nan with 0 (zero)
pronoun_count = pronoun_count.round(2) # Rounding decimals to two digits after .
print(pronoun_count.sort_values(by=['Train'], ascending=False))

      Train  Train_%  Validation  Validation_%  Test  Test_%
her   534.0    26.70       132.0         29.07   566   28.30
his   496.0    24.80       102.0         22.47   528   26.40
she   249.0    12.45        62.0         13.66   237   11.85
he    235.0    11.75        67.0         14.76   221   11.05
She   179.0     8.95        25.0          5.51   159    7.95
He    138.0     6.90        26.0          5.73   127    6.35
him    98.0     4.90        26.0          5.73    96    4.80
Her    38.0     1.90         8.0          1.76    37    1.85
His    33.0     1.65         6.0          1.32    28    1.40
hers    0.0     0.00         0.0          0.00     1    0.05


In [8]:
# Get frequency distribution of which candidate noun is correct in each set
train = create_labels(train)
test = create_labels(test)
valid = create_labels(valid)

df_train = column_value_counts(train, 'labels', 'Train_lables')
df_test = column_value_counts(test, 'labels', 'Test_labels')
df_valid = column_value_counts(valid, 'labels', 'Validation_labels')

label_count = pd.concat([df_train, df_valid, df_test], axis=1)
label_count = label_count.round(2)
label_count.head()

Unnamed: 0,Train_lables,Train_lables_%,Validation_labels,Validation_labels_%,Test_labels,Test_labels_%
A,874,43.7,187,41.19,918,45.9
B,925,46.25,205,45.15,855,42.75
Neither,201,10.05,62,13.66,227,11.35


In [123]:
# Get all unique pronouns in text
unique_pronouns = pronouns(pronoun_count)
print(unique_pronouns)

['He', 'Her', 'His', 'She', 'he', 'her', 'hers', 'him', 'his', 'she']


In [80]:
# Get text length and store it in a new column
text_len(train)
text_len(valid)
text_len(test)

In [97]:
# Get summary of text length and compare side by side
df_train = column_summary(train, 'Text-len', 'Train')
df_test = column_summary(test, 'Text-len', 'Test')
df_valid = column_summary(valid, 'Text-len', 'Validation')

text_length_summary = pd.concat([df_train, df_valid, df_test], axis=1)
print(text_length_summary)

         Train  Validation     Test
count  2000.00      454.00  2000.00
mean    426.92      426.39   430.61
std     124.63      112.64   130.72
min     104.00      147.00    69.00
25%     346.00      354.00   344.00
50%     410.00      408.50   412.00
75%     493.00      487.75   500.00
max    1270.00     1012.00  1347.00


In [126]:
# Get pronoun count in text and store it in a new column
store_pronoun_count(train)
store_pronoun_count(valid)
store_pronoun_count(test)

# Get summary of pronoun count and compare side by side
df_train = column_summary(train, 'Pronoun-count', 'Train')
df_test = column_summary(test, 'Pronoun-count', 'Test')
df_valid = column_summary(valid, 'Pronoun-count', 'Validation')

pronoun_count_summary = pd.concat([df_train, df_valid, df_test], axis=1)
print(pronoun_count_summary)

         Train  Validation     Test
count  2000.00      454.00  2000.00
mean      3.11        3.39     3.03
std       2.06        2.36     2.00
min       1.00        1.00     1.00
25%       2.00        2.00     2.00
50%       3.00        3.00     3.00
75%       4.00        5.00     4.00
max      17.00       16.00    15.00


In [135]:
# Get word count in each row
word_count(train)
word_count(valid)
word_count(test)

# Get summary of word count and compare side by side
df_train = column_summary(train, 'Word-count', 'Train')
df_test = column_summary(test, 'Word-count', 'Test')
df_valid = column_summary(valid, 'Word-count', 'Validation')

word_count_summary = pd.concat([df_train, df_valid, df_test], axis=1)
print(word_count_summary)

         Train  Validation     Test
count  2000.00      454.00  2000.00
mean     71.20       70.95    71.58
std      20.53       18.64    20.94
min      16.00       20.00    12.00
25%      58.00       59.00    58.00
50%      68.00       69.00    69.00
75%      82.00       80.00    83.00
max     204.00      187.00   223.00


In [169]:
# Get count of person names in the text and store in a column
train['Person-name-count'] = train['Text'].apply(person_entity_count)
test['Person-name-count'] = test['Text'].apply(person_entity_count)
valid['Person-name-count'] = valid['Text'].apply(person_entity_count)

# Get summary of person name count and compare side by side
df_train = column_summary(train, 'Person-name-count', 'Train')
df_test = column_summary(test, 'Person-name-count', 'Test')
df_valid = column_summary(valid, 'Person-name-count', 'Validation')
person_name_count_summary = pd.concat([df_train, df_valid, df_test], axis=1)
print(person_name_count_summary)


         Train  Validation     Test
count  2000.00      454.00  2000.00
mean      4.55        4.91     4.48
std       2.68        3.05     2.81
min       0.00        0.00     0.00
25%       3.00        3.00     3.00
50%       4.00        4.00     4.00
75%       6.00        6.00     6.00
max      23.00       20.00    30.00


In [195]:
# Number of target pronouns that are the first word in the sentence
pronoun_first_word(train)
pronoun_first_word(valid)
pronoun_first_word(test)
print("Training set: {}".format(train['Pronoun-first-word'].sum()))
print("Validation set: {}".format(valid['Pronoun-first-word'].sum()))
print("Test set {}".format(test['Pronoun-first-word'].sum()))

Training set: 223
Validation set: 45
Test set 222


In [210]:
# Any relation of candidate A and B with pronoun is first word or not?
def first_coref_relation(df):
    first_a, first_b, not_first_a, not_first_b = 0, 0, 0, 0

    for i, p in enumerate(df['Pronoun-first-word']):
        if p == 0:
            if df['A-coref'].iloc[i] == True:
                not_first_a += 1
            elif df['B-coref'].iloc[i] == True:
                not_first_b += 1
        else:
            if df['A-coref'].iloc[i] == True:
                first_a += 1
            elif df['B-coref'].iloc[i] == True:
                first_b += 1     
    
    return first_a, first_b, not_first_a, not_first_b

print(first_coref_relation(train))
print(first_coref_relation(valid))
print(first_coref_relation(test))

(105, 98, 769, 827)
(20, 19, 167, 186)
(112, 88, 806, 767)


In [225]:
# In how many cases the candiate name appears after the target pronoun?
a = (train['A-offset'] - train['Pronoun-offset']) > 0
b = (train['B-offset'] - train['Pronoun-offset']) > 0
print(sum(a == True), sum(b == True))
#print(sum(a == False), sum(b == False))

274 496


In [235]:
# Calculate the offset difference between the pronoun and the candidate names
def diff_pronoun_noun_offset(df, name_column):
    df[name_column[0]+'-pronoun-offset-diff'] = (df[name_column] - df['Pronoun-offset'])
    return

diff_pronoun_noun_offset(train, 'A-offset')
diff_pronoun_noun_offset(train, 'B-offset')
diff_pronoun_noun_offset(valid, 'A-offset')
diff_pronoun_noun_offset(valid, 'B-offset')
diff_pronoun_noun_offset(test, 'A-offset')
diff_pronoun_noun_offset(test, 'B-offset')

In [264]:
# How close are the target nouns and pronouns?
df_train_A = column_summary(train, 'A-pronoun-offset-diff', 'Train-A')
df_train_B = column_summary(train, 'B-pronoun-offset-diff', 'Train-B')
df_valid_A = column_summary(train, 'A-pronoun-offset-diff', 'Validation-A')
df_valid_B = column_summary(train, 'B-pronoun-offset-diff', 'Validation-B')
df_test_A = column_summary(train, 'A-pronoun-offset-diff', 'Test-A')
df_test_B = column_summary(train, 'B-pronoun-offset-diff', 'Test-B')

pronoun_offset_diff_summary = pd.concat([df_train_A, df_train_B, df_valid_A, 
                                       df_valid_B, df_test_A, df_test_B], axis=1)
print(pronoun_offset_diff_summary)

       Train-A  Train-B  Validation-A  Validation-B   Test-A   Test-B
count  2000.00  2000.00       2000.00       2000.00  2000.00  2000.00
mean    -85.19   -24.43        -85.19        -24.43   -85.19   -24.43
std      77.35    71.07         77.35         71.07    77.35    71.07
min    -577.00  -461.00       -577.00       -461.00  -577.00  -461.00
25%    -126.00   -58.00       -126.00        -58.00  -126.00   -58.00
50%     -82.00   -28.00        -82.00        -28.00   -82.00   -28.00
75%     -45.00    -5.75        -45.00         -5.75   -45.00    -5.75
max     280.00   291.00        280.00        291.00   280.00   291.00


In [248]:
# Get the rows where candiate A appears after pronoun, but is still the True coreference
train.loc[((train['A-pronoun-offset-diff'] > 0) == True) & (train['A-coref'] == True)]

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL,Pronoun-count,Text-len,Word-count,Person-name-count,Pronoun-first-word,A-pronoun-offset-diff,B-pronoun-offset-diff
21,development-22,This device allowed him to continue his racing...,his,304,Kurt,317,True,Paul Newman,338,False,http://en.wikipedia.org/wiki/Kurt_Kossmann,3,426,70,2,0,13,34
27,development-28,This is a list of episodes from the MTV show B...,his,194,Alan,204,True,Ryan,223,False,http://en.wikipedia.org/wiki/List_of_Bully_Bea...,4,290,54,4,0,10,29
73,development-74,He is currently the wide receivers coach for B...,his,275,Seth Doege,302,True,Jim Ned Indians,449,False,http://en.wikipedia.org/wiki/Seth_Doege,3,512,91,3,1,27,174
85,development-86,"The musical style is essentially tonal, evokin...",his,385,Elkus,405,True,Charles Ives,439,False,http://en.wikipedia.org/wiki/Jonathan_Elkus,1,452,66,2,0,20,54
103,development-104,"His work as an arranger, producer and songwrit...",his,240,Gabriel,249,True,Paul Anka,319,False,http://en.wikipedia.org/wiki/Juan_Gabriel,3,329,53,6,1,9,79
119,development-120,Oliveira would take a major step forward with ...,his,329,Oliveira,350,True,Otis Griffin,368,False,http://en.wikipedia.org/wiki/Marcus_Oliveira,1,454,71,6,0,21,39
131,development-132,Then he persuades future son-in-law Ben to bet...,his,230,Jimmy,242,True,Ben,266,False,http://en.wikipedia.org/wiki/Hot_Tip_(film),4,319,57,3,0,12,36
135,development-136,Miller is also currently serving as the produc...,his,323,Miller,354,True,Joseph P. Hayes,449,False,http://en.wikipedia.org/wiki/Roy_Miller_(produ...,2,465,76,3,0,31,126
140,development-141,Isabel Walker is a twenty-something American f...,she,396,Roxanne,409,True,Isabel,429,False,http://en.wikipedia.org/wiki/Le_Divorce_(novel),9,487,78,6,0,13,33
145,development-146,"Rocky intervenes, and after seemingly beating ...",her,430,Natasha,501,True,Supriya Pilgaonkar,526,False,http://en.wikipedia.org/wiki/Deewane_Huye_Paagal,3,590,93,4,0,71,96


In [261]:
# Get the rwos where candiate B appears after pronoun, but is still the True coreference
train.loc[((train['B-pronoun-offset-diff'] > 0) == True) & (train['B-coref'] == True)]

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL,Pronoun-count,Text-len,Word-count,Person-name-count,Pronoun-first-word,A-pronoun-offset-diff,B-pronoun-offset-diff
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,False,Henry Rosenthal,336,True,http://en.wikipedia.org/wiki/Crime_(band),1,401,69,4,0,-147,15
7,development-8,Slant Magazine's Sal Cinquemani viewed the alb...,his,337,Greg Kot,173,False,Robert Christgau,377,True,http://en.wikipedia.org/wiki/The_Truth_About_L...,1,451,71,4,0,-164,40
8,development-9,Her father was an Englishman ``of rank and cul...,her,246,Mary Paine,255,False,Kelsey,267,True,http://en.wikipedia.org/wiki/Mary_S._Peake,5,302,53,3,1,9,21
42,development-43,"Impressed by her beauty, her warrior skills, a...",her,337,Talia,348,False,Nyssa,355,True,http://en.wikipedia.org/wiki/Nyssa_Raatko,14,522,87,3,0,11,18
52,development-53,"In 1972, Henning broke the world records on th...",her,166,Sylvia Burka,189,False,Henning,221,True,http://en.wikipedia.org/wiki/Anne_Henning,3,338,59,2,0,23,55
56,development-57,Classicist John H. D'Arms was both the residen...,her,287,Sophie Consagra,183,False,Adele Chatfield-Taylor,331,True,http://en.wikipedia.org/wiki/American_Academy_...,1,528,85,3,0,-104,44
79,development-80,"In 1984, Petersen married Zina Nibley, a daugh...",his,312,Hugh Nibley,341,False,Petersen,354,True,http://en.wikipedia.org/wiki/Boyd_Petersen,2,403,65,6,0,29,42
96,development-97,Caswell begins to doubt that it was Bullard wh...,his,357,Loren Shaw,403,False,Bullard,448,True,http://en.wikipedia.org/wiki/Executive_Suite,5,513,86,1,0,46,91
102,development-103,Donald P. Kanak is non-executive Chairman of P...,his,263,Hank Greenberg,224,False,Kanak,300,True,http://en.wikipedia.org/wiki/Donald_P._Kanak,2,368,56,3,0,-39,37
113,development-114,"He informs Blackadder and Baldrick, ``For us, ...",his,335,Lieutenant George,264,False,Richthofen,359,True,http://en.wikipedia.org/wiki/The_Red_Baron_in_...,3,429,66,4,1,-71,24


In [262]:
# Get the number of candates appearing after pronoun, but still are the True coreference
temp_a = train.index[((train['A-pronoun-offset-diff'] > 0) == True) 
                     & (train['A-coref'] == True)].tolist() # Candidate A
temp_b = train.index[((train['B-pronoun-offset-diff'] > 0) == True) 
                     & (train['B-coref'] == True)].tolist() # candiate B
print(len(temp_a), len(temp_b))


123 230


In [265]:
# Get the number of candates appearing after pronoun, and are False coreference
temp_a = train.index[((train['A-pronoun-offset-diff'] > 0) == True) 
                     & (train['A-coref'] == False)].tolist() # Candidate A
temp_b = train.index[((train['B-pronoun-offset-diff'] > 0) == True) 
                     & (train['B-coref'] == False)].tolist() # candiate B
print(len(temp_a), len(temp_b))


151 266


In [376]:
# Get the numbers of the sentences where the pronoun, A and B are located in text
# Need to optimize the store_sentence_number() function

def get_sentence_number(sentences, offset):
    '''
    Get the number the sentence where the token is located.
    '''
    l = 0
    for i, sentence in enumerate(sentences):
        l += len(sentence) + 1
        if l >= offset:
            return i

def store_sentence_number(df):
    '''
    Store the number of the sentences where the pronun, A and B are located in text.
    '''
    
    for i, text in enumerate(df.Text):
        l1 = len(text)
        p_offset, a_offset, b_offset = df['Pronoun-offset'][i], \
                                        df['A-offset'][i], df['B-offset'][i]
        p = df['Pronoun'][i]

        doc = nlp(text)
        sentences = [sent.string.strip() for sent in doc.sents] # Splitting into sentences
        p_pos, a_pos, b_pos = None, None, None 
        
        df.at[i, 'Pronoun-sentence-number'] = get_sentence_number(sentences, p_offset)
        df.at[i, 'A-sentence-number'] = get_sentence_number(sentences, a_offset)
        df.at[i, 'B-sentence-number'] = get_sentence_number(sentences, b_offset)
        
    return


    
store_sentence_number(train)
store_sentence_number(valid)
store_sentence_number(test)


In [377]:
# Get number of FinalPro, MedialPro and InitialPro sample count
def extraction_pattern(df):
    if (df['A-sentence-number'] == df['B-sentence-number']) & \
        ((df['Pronoun-sentence-number'] == df['A-sentence-number']) | \
            (df['Pronoun-sentence-number'] == df['A-sentence-number'] + 1 )):
                return 'FinalPro'
    elif (df['Pronoun-sentence-number'] == df['A-sentence-number'] + 1) & \
        (df['Pronoun-sentence-number'] == df['B-sentence-number']):
                return 'MedialPro'
    elif (df['Pronoun-sentence-number'] == df['A-sentence-number']) & \
            df['A-sentence-number'] == df['B-sentence-number']:
                return 'InitialPro'

train['Extraction-pattern'] = df.apply(extraction_pattern, axis=1)
valid['Extraction-pattern'] = df.apply(extraction_pattern, axis=1)
test['Extraction-pattern'] = df.apply(extraction_pattern, axis=1)

def extraction_pattern_count(df):
    return len(df.index[df['Extraction-pattern'] == 'FinalPro']), \
            len(df.index[df['Extraction-pattern'] == 'MedialPro']), \
            len(df.index[df['Extraction-pattern'] == 'InitialPro'])

print(extraction_pattern_count(train))
print(extraction_pattern_count(valid))
print(extraction_pattern_count(test))

(1272, 539, 8)
(294, 117, 2)
(1272, 539, 8)


In [9]:
# Number of pronouns in each text? - Done
# How many person names in each text? - Done
# How close are other pronouns to the target pronoun?
# How close are the target nouns to the target pronoun? What about the ommitted nouns? - Partially done
# How many nouns and pronouns in the preceding, same, and trailing sentence?
# Position of actual name reference (immediate before, or appeared before another name.) 
# If there are two pronouns in the same sentence containing the target pronoun, are they both of the same gender?
# How many target pronouns are the first word in the sentence? - Done
# Get number of FinalPro, MedialPro and InitialPro sample count - Done

In [8]:
type(train.columns.values)

numpy.ndarray

In [13]:
# Displays the documentation for filter function
help(feature_names)
# Obtains the string of the documentation.
docstring = feature_names.__doc__

Help on function feature_names in module __main__:

feature_names(df)
    Get names of features. 
    
    :param df: A Pandas dataframe
    :type df: pandas.core.frame.DataFrame
    :return: List of column names in a Pandas dataframe
    :rtype: numpy.ndarray



In [16]:
def foo():
    return

print(foo())

None
