## Convote Dataset - Basic Preprocessing
- Data Main Page: http://www.cs.cornell.edu/home/llee/data/convote.html 
- About the Data: http://www.cs.cornell.edu/home/llee/data/convote/README.v1.1.txt


In [1]:
import pandas as pd
import os
import string
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, sent_tokenize
from nltk.probability import FreqDist

In [2]:
# For now, we will use the data from stage one
train_path = ('./convote_v1.1/data_stage_one/training_set/')
test_path = ('./convote_v1.1/data_stage_one/test_set/')
train_file_names = os.listdir(train_path)
test_file_names = os.listdir(test_path)

# Create Dictionary for File Name and Text
file_name_and_text = {}
for file in train_file_names:
    with open(train_path + file, 'r') as target_file:
         file_name_and_text[file] = target_file.read()

# Structure the dataframe such that the file name is the index
train_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'File', 0: 'Text'}))

# This is redundant but fastest given the data is partitioned into directories
file_name_and_text = {}
for file in test_file_names:
    with open(test_path + file, 'r') as target_file:
         file_name_and_text[file] = target_file.read()

test_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'File', 0: 'Text'}))

In [3]:
train_data.head()

Unnamed: 0,File,Text
0,282_400436_1413023_DMN.txt,"mr. speaker , i would like to say a word about..."
1,088_400272_2994052_DON.txt,"mr. speaker , today we have some very clear ch..."
2,038_400080_0251064_DON.txt,"mr. speaker , i yield myself such time as i ma..."
3,132_400227_0763073_DON.txt,"mr. chairman , i yield back the balance of my ..."
4,282_400380_1838049_ROY.txt,"madam chairman , will the gentleman yield ? \n"


In [4]:
test_data.head()

Unnamed: 0,File,Text
0,048_400216_0297011_DON 2.txt,"mr. chairman , how much time do i have remaini..."
1,414_400080_3170075_DON.txt,"madam speaker , i yield myself 35 seconds . \n..."
2,414_400061_1909178_ROY.txt,"mr. chairman , i demand a recorded vote . \n"
3,102_400175_0641038_ROY.txt,"mr. speaker , i rise today as a cosponsor of h..."
4,414_400080_3170065_DON.txt,"madam speaker , i yield myself 15 seconds . \n..."


In [5]:
# Remove file extension
train_data['File'] = train_data['File'].map(lambda x: x.replace('.txt', ''))
test_data['File'] = test_data['File'].map(lambda x: x.replace('.txt', ''))

In [6]:
# Add Label feature (derived from end of file name)
Label = []

for i in train_data.File:
    Label.append(i[-3:])
    
train_data['Label'] = Label

### 'Label' feature Details 
- 'P' is replaced by a party indicator, D or R (or X if no
   corresponding party could be found).  As mentioned in the paper, we 
   purposely *did not* use this information in our experiments.

- 'M' is replaced by an indicator of whether the bill under
   discussion is mentioned directly in the speech segment, or whether it is
   only referenced by another speech segment on the same page.  If the bill is
   directly mentioned in the current speech, the letter M appears in
   the file name; otherwise, the letter O appears.

- 'V' is replaced by a vote indicator, Y or N, which serves as the
   ground-truth label for the speech.

In [7]:
# Split Label into three columns
train_data['Party'] = train_data['Label'].apply(lambda x: pd.Series(list(x)))[0]
train_data['Discussion'] = train_data['Label'].apply(lambda x: pd.Series(list(x)))[1]
train_data['Vote'] = train_data['Label'].apply(lambda x: pd.Series(list(x)))[2]

In [8]:
train_data.head()

Unnamed: 0,File,Text,Label,Party,Discussion,Vote
0,282_400436_1413023_DMN,"mr. speaker , i would like to say a word about...",DMN,D,M,N
1,088_400272_2994052_DON,"mr. speaker , today we have some very clear ch...",DON,D,O,N
2,038_400080_0251064_DON,"mr. speaker , i yield myself such time as i ma...",DON,D,O,N
3,132_400227_0763073_DON,"mr. chairman , i yield back the balance of my ...",DON,D,O,N
4,282_400380_1838049_ROY,"madam chairman , will the gentleman yield ? \n",ROY,R,O,Y


In [9]:
# Test data has some Stage 2 files, not sure why - removed 2
Label = []
for i in test_data.File:
    label = i.rstrip('0123456789.- ')
    Label.append(label[-3:])

test_data['Label'] = Label

In [10]:
# Split label into distinct columns
test_data['Party'] = test_data['Label'].apply(lambda x: pd.Series(list(x)))[0]
test_data['Discussion'] = test_data['Label'].apply(lambda x: pd.Series(list(x)))[1]
test_data['Vote'] = test_data['Label'].apply(lambda x: pd.Series(list(x)))[2]

In [11]:
test_data.head()

Unnamed: 0,File,Text,Label,Party,Discussion,Vote
0,048_400216_0297011_DON 2,"mr. chairman , how much time do i have remaini...",DON,D,O,N
1,414_400080_3170075_DON,"madam speaker , i yield myself 35 seconds . \n...",DON,D,O,N
2,414_400061_1909178_ROY,"mr. chairman , i demand a recorded vote . \n",ROY,R,O,Y
3,102_400175_0641038_ROY,"mr. speaker , i rise today as a cosponsor of h...",ROY,R,O,Y
4,414_400080_3170065_DON,"madam speaker , i yield myself 15 seconds . \n...",DON,D,O,N


### Party Speech Distribution

In [12]:
print('Training Party Distribution:', '\n', train_data['Party'].value_counts(), '\n')
print('Test Party Distribution:', '\n', test_data['Party'].value_counts(), '\n')

Training Party Distribution: 
 D    2848
R    2786
I      26
Name: Party, dtype: int64 

Test Party Distribution: 
 R    1782
D    1726
I      10
Name: Party, dtype: int64 



### Sentence Count & Length

In [13]:
# Add column for sentence count
train_data['NumSents'] = train_data['Text'].apply(lambda x: len(x.strip().split('\n')))
test_data['NumSents'] = test_data['Text'].apply(lambda x: len(x.strip().split('\n')))

In [14]:
train_data['NumSents']
test_data['NumSents']

0        1
1        5
2        1
3       19
4        4
        ..
3513     1
3514     1
3515     1
3516     1
3517    39
Name: NumSents, Length: 3518, dtype: int64

# Tokenization

In [15]:
train_data['Text'][1]

"mr. speaker , today we have some very clear choices . \nit is not every day that we face such black and white options -- often the issues we debate on this floor have many shades of gray . \nbut today , there is no confusion , there is no muddying of the issues , and there is no way to mask the harm this bill would do : cut education spending for the first time in a decade , slash funding for worker and youth training , and provide no increase for home heating assistance for low-income families . \ntoday , we have a choice . \nwe can pass a bill that will be detrimental to our children 's future ; that will hurt students in need of financial assistance to go to college ; that will not help families struggling to pay their heating bills ; and that will severely hinder research and preventive health efforts . \nor we can reject this bill and demand something better for american families . \nwe have heard that this bill is the result of priorities . \nwell , this is one point where i agr

In [16]:
# Phrases like: 'Mr. Chairman' & 'Mr. Speaker' appear frequently - not informative
# Add to custom stopwords list

### Custom Stopwords Creation

In [17]:
# Initialize builtin and custom stopwords
stopwords = nltk.corpus.stopwords.words('english')
customStopWords = ['mr', 'chairman','mr', 'speaker', 'madam', 'mr.']
stopwords.extend(customStopWords)

# Join stopwords and punctuation
punct = list(string.punctuation)
stops = stopwords + punct + ['--',"''", 'r.', '``', "'s", "n't"]

In [18]:
# Function to tokenize 
def tokenize_speech(text, party):
    if 'R' in party:
        return [word for word in word_tokenize(text) if not word in stops]
    elif 'D' in party:
        return [word for word in word_tokenize(text) if not word in stops]
    else:
        return [word for word in word_tokenize(text) if not word in stops]
        

In [19]:
train_data['Tokens'] = train_data.apply(lambda x: tokenize_speech(x['Text'],x['Party']),axis=1)
test_data['Tokens'] = test_data.apply(lambda x: tokenize_speech(x['Text'],x['Party']),axis=1)

### Demonstration of Speech tokenization and token count(s)

In [20]:
print(train_data['Tokens'][1], '\n\n')
print('Total tokens:', len(train_data['Tokens'][1]))
print('Total UNIQUE tokens:', len(set(train_data['Tokens'][1])))

['today', 'clear', 'choices', 'every', 'day', 'face', 'black', 'white', 'options', 'often', 'issues', 'debate', 'floor', 'many', 'shades', 'gray', 'today', 'confusion', 'muddying', 'issues', 'way', 'mask', 'harm', 'bill', 'would', 'cut', 'education', 'spending', 'first', 'time', 'decade', 'slash', 'funding', 'worker', 'youth', 'training', 'provide', 'increase', 'home', 'heating', 'assistance', 'low-income', 'families', 'today', 'choice', 'pass', 'bill', 'detrimental', 'children', 'future', 'hurt', 'students', 'need', 'financial', 'assistance', 'go', 'college', 'help', 'families', 'struggling', 'pay', 'heating', 'bills', 'severely', 'hinder', 'research', 'preventive', 'health', 'efforts', 'reject', 'bill', 'demand', 'something', 'better', 'american', 'families', 'heard', 'bill', 'result', 'priorities', 'well', 'one', 'point', 'agree', 'republican', 'colleagues', 'bill', 'result', 'priorities', 'wrong', 'priorities', 'republican', 'leadership', 'congress', 'content', 'spend', 'tax', 'cut

### Total Tokens & Unique Tokens for each Speech

In [21]:
# Create column of TOTAL token count per text
train_data['Total_tokens'] = train_data['Tokens'].apply(lambda x: len(x))
test_data['Total_tokens'] = test_data['Tokens'].apply(lambda x: len(x))

# Create column of UNIQUE token count per text
train_data['Unique_tokens'] = train_data['Tokens'].apply(lambda x: len(set(x)))
test_data['Unique_tokens'] = test_data['Tokens'].apply(lambda x: len(set(x)))

In [22]:
Rtoks = (sum(train_data.Total_tokens[train_data['Party'] == 'R']))
Dtoks = (sum(train_data.Total_tokens[train_data['Party'] == 'D']))
Itoks = (sum(train_data.Total_tokens[train_data['Party'] == 'I']))

RtoksU = (sum(train_data.Unique_tokens[train_data['Party'] == 'R']))
DtoksU = (sum(train_data.Unique_tokens[train_data['Party'] == 'D']))
ItoksU = (sum(train_data.Unique_tokens[train_data['Party'] == 'I']))

In [23]:
totalToks = Rtoks+Dtoks+Itoks
totalToksU = RtoksU+DtoksU+ItoksU

### Cumulative Total  & Unique Total Tokens

In [24]:
print(f'There are {totalToks} total words in the corpus', '\n')
print(f'There are {totalToksU} UNIQUE words in the corpus')

There are 720025 total words in the corpus 

There are 472127 UNIQUE words in the corpus


### Cumulative Total & Unique Total Tokens by Party

In [25]:
print(f'There are {Rtoks} words in the Republican speeches')
print(f'There are {Dtoks} words in the Democrat speeches')
print(f'There are {Itoks} words in the Independent speeches', '\n')

print(f'There are {RtoksU} UNIQUE words in the Republican speeches')
print(f'There are {DtoksU} UNIQUE words in the Democrat speeches')
print(f'There are {ItoksU} UNIQUE words in the Independent speeches')

There are 300201 words in the Republican speeches
There are 417531 words in the Democrat speeches
There are 2293 words in the Independent speeches 

There are 200033 UNIQUE words in the Republican speeches
There are 270662 UNIQUE words in the Democrat speeches
There are 1432 UNIQUE words in the Independent speeches


In [26]:
train_data.head(5)

Unnamed: 0,File,Text,Label,Party,Discussion,Vote,NumSents,Tokens,Total_tokens,Unique_tokens
0,282_400436_1413023_DMN,"mr. speaker , i would like to say a word about...",DMN,D,M,N,17,"[would, like, say, word, gentleman, illinois, ...",171,129
1,088_400272_2994052_DON,"mr. speaker , today we have some very clear ch...",DON,D,O,N,16,"[today, clear, choices, every, day, face, blac...",198,150
2,038_400080_0251064_DON,"mr. speaker , i yield myself such time as i ma...",DON,D,O,N,15,"[yield, time, may, consume, would, like, brief...",155,113
3,132_400227_0763073_DON,"mr. chairman , i yield back the balance of my ...",DON,D,O,N,1,"[yield, back, balance, time]",4,4
4,282_400380_1838049_ROY,"madam chairman , will the gentleman yield ? \n",ROY,R,O,Y,1,"[gentleman, yield]",2,2


In [27]:
test_data['Text'][0]

'mr. chairman , how much time do i have remaining ? \n'

### Avg Number of Sentences by Party

In [28]:
RsentAvg = round((train_data.NumSents[train_data['Party'] == 'R']).mean(), 2)
DsentAvg = round((train_data.NumSents[train_data['Party'] == 'D']).mean(), 2)
IsentAvg = round((train_data.NumSents[train_data['Party'] == 'I']).mean(), 2)

print(f'Avg number of Republican Sentences/Speech: {RsentAvg}', '\n', 
      f'Avg number of Democrat Sentences/Speech: {DsentAvg}', '\n', 
      f'Avg number of Independent Sentences/Speech: {IsentAvg}')

Avg number of Republican Sentences/Speech: 9.97 
 Avg number of Democrat Sentences/Speech: 13.07 
 Avg number of Independent Sentences/Speech: 8.62


### Avg Number of Words by Party

In [29]:
AvgRtoks = (train_data.Total_tokens[train_data['Party'] == 'R']).mean()
AvgDtoks = (train_data.Total_tokens[train_data['Party'] == 'D']).mean()
AvgItoks = (train_data.Total_tokens[train_data['Party'] == 'I']).mean()

print(f'Avg number of Republican Words/Speech: {AvgRtoks}', '\n', 
      f'Avg number of Democrat Words/Speech: {AvgDtoks}', '\n', 
      f'Avg number of Independent Words/Speech: {AvgItoks}')

Avg number of Republican Words/Speech: 107.75340990667624 
 Avg number of Democrat Words/Speech: 146.60498595505618 
 Avg number of Independent Words/Speech: 88.1923076923077
