In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned candidate data, reading `district` as text rather than as a number.
candidates = pd.read_csv(
    '../data/candidates_clean.csv',
    dtype={'district': 'str'},
)

candidates.head()

Unnamed: 0,candidate_name,party,incumbent,winner,perc_vote,money_raised,money_spent,state_name,district,state_dist,perc_vote_max,winner_bool
0,Jerry Carl,R,0,1,64.9,1971321,1859349,AL,1,AL01,64.9,1
1,James Averhart,D,0,0,35.0,80095,78973,AL,1,AL01,64.9,0
2,Barry Moore,R,0,1,65.3,650807,669368,AL,2,AL02,65.3,1
3,Phyllis Harvey-Hall,D,0,0,34.6,56050,55988,AL,2,AL02,65.3,0
4,Mike D Rogers,R,1,1,67.5,1193111,1218564,AL,3,AL03,67.5,1


In [3]:
# Summarise column dtypes and non-null counts for the loaded frame.
# NOTE(review): the saved output below lists `district` as int64 even though the load
# cell requests a string dtype; the output looks stale, so re-run to refresh it.
candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   candidate_name  883 non-null    object 
 1   party           883 non-null    object 
 2   incumbent       883 non-null    int64  
 3   winner          883 non-null    int64  
 4   perc_vote       883 non-null    float64
 5   money_raised    883 non-null    int64  
 6   money_spent     883 non-null    int64  
 7   state_name      883 non-null    object 
 8   district        883 non-null    int64  
 9   state_dist      883 non-null    object 
 10  perc_vote_max   883 non-null    float64
 11  winner_bool     883 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 82.9+ KB


In [None]:
# Number of candidate rows for each party code.
candidates['party'].value_counts()

In [None]:
# Rows for winning candidates whose party code is 'I', 'L' or '3'.
minor_party = candidates['party'].isin(['I', 'L', '3'])
won_race = candidates['winner'].eq(1)
candidates.loc[minor_party & won_race]

# Finding: no independent or third-party candidate won a race.

### How many races didn't have an incumbent running? How does that correlate with the number of candidates in the race?

### How often does the candidate who raised more money win a race?

Strategy: partition by state and district, rank the candidates in each race by `money_raised`, and keep only the top-ranked row (the person from each race who raised the most money). Then chart the `value_counts` of the `winner` column.

In [None]:
# For every candidate, record the largest amount raised by anyone in the same race
# (a race is one state/district pair).
by_race = candidates.groupby(['state_name', 'district'])
candidates['max_raised'] = by_race['money_raised'].transform('max')

In [None]:
# Keep the candidate(s) whose fundraising equals the maximum for their race.
is_top_fundraiser = candidates['money_raised'].eq(candidates['max_raised'])
raised_winner = candidates.loc[is_top_fundraiser]

In [None]:
fontsize = 14

# Label the outcomes BEFORE counting so each bar carries its own label.
# value_counts() orders its result by frequency, so pinning the fixed labels
# ['Winner', 'Not Winner'] onto tick positions 0 and 1 would mislabel the bars
# whenever losing top fundraisers outnumber winning ones.
outcome_counts = (
    raised_winner['winner']
    .map({1: 'Winner', 0: 'Not Winner'})
    .value_counts()
    .reindex(['Winner', 'Not Winner'], fill_value = 0)  # fixed order; a missing outcome plots as 0
)

# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize = (10, 6))
outcome_counts.plot(kind = 'bar', ax = ax, rot = 0, fontsize = fontsize)
ax.set_title('Test Title', fontsize = fontsize + 2)  # TODO: replace the placeholder title
fig.tight_layout()
fig.savefig('../data/test.png', dpi = 150);

#### How often does the candidate who spent more money win a race?

Same strategy as the previous question.

In [None]:
# Largest amount spent by any candidate in each race (state/district pair).
candidates['max_spent'] = candidates.groupby(['state_name', 'district'])['money_spent'].transform('max')

# Candidate(s) whose spending equals the maximum for their race.
spent_winner = candidates.loc[candidates['money_spent'] == candidates['max_spent']]

# Label the outcomes BEFORE counting: value_counts() sorts by frequency, so fixed
# tick labels at positions 0 and 1 would be wrong whenever non-winners are the
# larger group. Reindexing pins the bar order and plots a missing outcome as 0.
spent_outcomes = (
    spent_winner['winner']
    .map({1: 'Winner', 0: 'Not Winner'})
    .value_counts()
    .reindex(['Winner', 'Not Winner'], fill_value = 0)
)
spent_outcomes.plot.bar(rot = 0);

# Open question: are the candidates who raised the most and those who spent the most
# the same people? Seems likely.

#### Does the difference between either money raised or money spent seem to influence the likelihood of a candidate winning a race?

Strategy: filter to the top two candidates in each race, then add a column with the total spent in the race and a column with the dollar difference between the winner and second place, plus the same difference as a percentage.


In [None]:
# Build a single race key by joining the state name column to the district column.
# NOTE(review): the frame shown by the earlier head() already has a `state_dist`
# column, which this assignment overwrites. Confirm both use the same format.
# TODO: move this step earlier in a future refactor.
district_text = candidates['district'].astype('str')
candidates['state_dist'] = candidates['state_name'] + district_text

candidates.head(1)

In [None]:
# Keep at most the two highest-`perc_vote` candidates from each race.
#
# The earlier version passed `MultiIndex.levels[2]` to `iloc`. `levels` holds the
# level's stored values rather than the per-row values, and `iloc` is positional, so
# that was only correct while `candidates` had a pristine 0..n-1 index. Sorting and
# taking the first two rows per group works for any index.
race_keys = ['state_name', 'district']  # TODO: could group on `state_dist` instead

candidates_top2 = (
    candidates
    .sort_values('perc_vote', ascending = False, kind = 'stable')  # stable: ties keep original row order
    .groupby(race_keys, sort = False)
    .head(2)
    .sort_index()  # restore the original row order
    .copy()        # independent frame, so later column assignments don't warn
)

candidates_top2.head()

In [None]:
candidates_top2.shape

# The full candidates df has 892 rows, so it seems not too many races have more than
# 2 candidates (or many districts are uncontested).
# NOTE(review): the saved candidates.info() output reports 883 entries, not 892.
# Confirm which row count is current.

Side question: how many races are uncontested? Even in Alabama, which has 7 districts, it looks like 4 of them were uncontested.

In [None]:
# A race key that occurs exactly once in the top-two table had a single candidate.
rows_per_race = candidates_top2['state_dist'].value_counts()
rows_per_race.eq(1).sum()

# 60 seats uncontested. Ask the team whether to include this kind of information,
# and make a graph.

In [None]:
# Per-race totals of money raised and spent by the top two candidates, broadcast
# back onto every row of the race.
race_totals = (
    candidates_top2
    .groupby('state_dist')[['money_raised', 'money_spent']]
    .transform('sum')
)

# Build the new columns with assign() rather than item assignment: `candidates_top2`
# is a selection from `candidates`, so writing columns into it in place can raise
# SettingWithCopyWarning. assign() returns a new frame and gives the same result
# when the cell is re-run.
candidates_top2 = candidates_top2.assign(
    total_raised = race_totals['money_raised'],
    total_spent = race_totals['money_spent'],
    # Each candidate's share of the race total, in percent. The $ difference is not
    # computed yet; add it later if needed.
    perc_raised = lambda top2: top2['money_raised'] / top2['total_raised'] * 100,
    perc_spent = lambda top2: top2['money_spent'] / top2['total_spent'] * 100,
)

candidates_top2.head()