In [94]:
import pandas as pd
import numpy as np

In [95]:
def generate_max_feature(df,col_name,name_of_feature,groupby_params=["year","Cand_Office_St"]):
    '''
    Flag, inside every group, the row that holds the largest value of a column.

    For eg: Individual_Contribution
    Rows are grouped by ``groupby_params``; in each group the row with the
    highest "Individual_Contribution" gets 1 in the new column and every
    other row gets 0. On ties the first row with the maximum is flagged
    (``idxmax`` semantics).

    Rows whose ``col_name`` value is missing are never flagged, and a group
    made up only of missing values simply gets no flagged row instead of
    making the label lookup fail.

    :param df: pd.DataFrame | input data; it is modified in place (the new
        column is added to it) and the same object is returned.
    :param col_name: str | numeric column whose per-group maximum is located
    :param name_of_feature: str | name of the 0/1 indicator column to create
    :param groupby_params: column name(s) defining the groups; the default
        list is only read, never mutated.

    :return: pd.DataFrame | ``df`` itself, with ``name_of_feature`` added

    NOTE(review): rows are selected by index label, so if ``df`` had
    duplicate index labels every row sharing the winning label would be
    flagged - assumes a unique index; confirm against the caller.
    '''
    # Locate the winners before touching df, exactly as the original did, so
    # the new column cannot interfere with the grouping.
    has_value = df[col_name].notna()
    if has_value.any():
        # Missing values can never be a maximum, so leaving them out does not
        # change the winner of any group that has at least one real value; it
        # only removes the all-missing groups that would otherwise produce a
        # NaN "label".
        top_rows = df.loc[has_value].groupby(groupby_params)[col_name].idxmax()
    else:
        top_rows = []

    df[name_of_feature] = 0
    df.loc[top_rows,name_of_feature] = 1
    return df

In [96]:
# Load the Senate candidate table and list the office states it contains,
# in order of first appearance.
# NOTE(review): the file carries an "Unnamed: 0" column (it shows up in the
# preview below), which looks like a previously saved index; it is kept as-is.
senate = pd.read_csv("senate.csv")
print(senate["Cand_Office_St"].drop_duplicates().tolist())
senate.head()

['AK', 'AL', 'AR', 'CO', 'DE', 'GA', 'IA', 'ID', 'IL', 'KS', 'KY', 'LA', 'MA', 'ME', 'MI', 'MN', 'MS', 'MT', 'NC', 'NE', 'NH', 'NJ', 'NM', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'VA', 'WV', 'WY', 'AZ', 'CA', 'CT', 'FL', 'HI', 'IN', 'MD', 'MO', 'ND', 'NV', 'NY', 'OH', 'UT', 'VT', 'WA', 'WI']


Unnamed: 0,Cand_Id,Cand_Incumbent_Challenger_Open_Seat,Cand_Name,Cand_Office,Cand_Office_St,Cand_Party_Affiliation,Cand_State,GE WINNER INDICATOR,Individual_Contribution,Other_Committee_Contribution,Party_Committee_Contribution,Total_Disbursement,Total_Receipt,year
0,S6AK00078,CHALLENGER,"CUDDY, DAVID W",S,AK,REP,AK,0,31261.0,260.0,0.0,862663.0,862661.0,2008
1,S8AK00074,CHALLENGER,"CALDERO, ROCKY CHRISTOPHER",S,AK,DEM,AK,0,31622.0,0.0,0.0,25891.0,32004.0,2008
2,S8AK00082,CHALLENGER,"METCALFE, RAY",S,AK,DEM,AK,0,30267.0,0.0,0.0,30562.0,30309.0,2008
3,S8AK00108,CHALLENGER,"SIKMA, RODERIC H 'RICK'",S,AK,REP,AK,0,4746.0,0.0,0.0,12203.0,11996.0,2008
4,S8AK00124,CHALLENGER,"VICKERS, RAYMOND B VIC",S,AK,REP,AK,0,10215.0,0.0,0.0,1010617.0,1055221.0,2008


Generate "top candidate" indicator features for each of the finance columns

In [97]:
# Add one 0/1 "top" flag per finance column. The first three are computed per
# (year, office state) - the function's default grouping; the party-committee
# flag is computed per (year, office state, party) instead.
senate = (
    senate
    .pipe(generate_max_feature, "Individual_Contribution", "top_individual_contribution")
    .pipe(generate_max_feature, "Total_Disbursement", "top_total_disbursement")
    .pipe(generate_max_feature, "Other_Committee_Contribution", "top_other_comm_contribution")
    .pipe(
        generate_max_feature,
        "Party_Committee_Contribution",
        "top_party_comm_contribution",
        ["year", "Cand_Office_St", "Cand_Party_Affiliation"],
    )
)

In [98]:
# Sanity check: number of rows marked as general-election winners.
print("No. of winners = " + str(int((senate["GE WINNER INDICATOR"] == 1).sum())))

# Sanity check: number of distinct office states in the data.
print("No. of unique states = " + str(len(senate["Cand_Office_St"].unique())))

No. of winners = 169
No. of unique states = 50


In [99]:
# In-state vs. out-of-state candidates: 1 when the candidate's own state
# equals the state of the office being contested, otherwise 0.
# Rows with a missing Cand_State are not dropped here; a missing value never
# compares equal, so those rows end up with in_state = 0.
senate["in_state"] = senate["Cand_Office_St"].eq(senate["Cand_State"]).astype("int64")

In [100]:
# Shape (rows, columns) of the subset of rows flagged as winners.
senate.loc[senate["GE WINNER INDICATOR"].eq(1)].shape

(169, 19)

In [101]:
# Encode the seat status as two 0/1 indicator columns. Any status other than
# "INCUMBENT" or "OPEN" (e.g. "CHALLENGER") gets 0 in both.
senate["incumbent"] = senate["Cand_Incumbent_Challenger_Open_Seat"].eq("INCUMBENT").astype("int64")

senate["open"] = senate["Cand_Incumbent_Challenger_Open_Seat"].eq("OPEN").astype("int64")

In [102]:
# def party_domination_features(df1,no_of_datasets):
#     '''
#     There are five categories in which each district will fall:
#     1) Strong Democratic : All the records for this district indicate DEM party have won
#     2) Strong Republican : All the records for this district indicate REP party have won 
#     3) Likely Democratic: More than half of records for this district indicate DEM party have won
#     4) Likely Republican : More than half of records for this district indicate REP party have won
#     5) None of above : This district doesn't fall in any of those categories
    
#     :param df: pd.df | standard dataframe for this project 
#     :param no_of_datasets: int | will be used for comparing wins for a party  
    
#     :return: pd.df | with 4 new features that indicate party dominance in every district
#     '''
#     df = df1.copy()
#     df["DEM_wins"] = 0 
#     df.loc[(df["Cand_Party_Affiliation"] == "DEM") & (df["GE WINNER INDICATOR"] == 1),"DEM_wins"] = 1

#     df["REP_wins"] = 0
#     df.loc[(df["Cand_Party_Affiliation"] == "REP") & (df["GE WINNER INDICATOR"] == 1),"REP_wins"] = 1
    
#     aggregated_data = df.groupby(['Cand_Office_St']).agg({'Cand_Office_St':"first","DEM_wins":"sum","REP_wins":"sum"})
    
#     df["strong_DEM"] = 0
#     df["likely_DEM"] = 0
#     df["strong_REP"] = 0
#     df["likely_REP"] = 0
    
#     for index, row in aggregated_data.iterrows():
#         state = row["Cand_Office_St"]

#         if row["DEM_wins"] >= no_of_datasets-1:
#             df.loc[(df["Cand_Office_St"] == state),"strong_DEM"] = 1 

#         elif row["REP_wins"] >= no_of_datasets-1:
#             df.loc[(df["Cand_Office_St"] == state),"strong_REP"] = 1 

#         elif row["DEM_wins"] > int(no_of_datasets/2):
#             df.loc[(df["Cand_Office_St"] == state),"likely_DEM"] = 1 

#         elif row["REP_wins"] > int(no_of_datasets/2):
#             df.loc[(df["Cand_Office_St"] == state),"likely_REP"] = 1
    
#     df = df.drop(['DEM_wins', 'REP_wins'], axis=1)
    
#     return df


# senate = party_domination_features(senate,5)
# senate.head()

In [103]:
# Collapse every party label other than DEM / REP into a single "OTHER"
# bucket. Missing party values are not in the DEM/REP list either, so they
# are mapped to "OTHER" as well.
senate.loc[~senate["Cand_Party_Affiliation"].isin(["DEM", "REP"]),"Cand_Party_Affiliation"] = "OTHER"

# One-hot encode party and office state. dtype=int pins the indicator columns
# to 0/1 integers: newer pandas versions default to bool, which would be
# written to the CSV as True/False and disagree with the other 0/1 features.
senate = pd.get_dummies(senate,columns=["Cand_Party_Affiliation","Cand_Office_St"],dtype=int)

Drop columns that are not necessary in the analysis

In [104]:
# columns_to_drop = []
# cand_data.drop(columns_to_drop,axis=1)

#### Now that we have created our analytical dataset, let's just write it to a csv file for further analysis.

In [105]:
# Persist the analytical dataset; the row index is not written to the file.
senate.to_csv(path_or_buf="senate_analytical.csv", index=False)