# Clean Data for National Presidential Party Wins
This notebook takes in a CSV of data from a Kaggle data set and cleans it for export to CSV and use in Tableau
**Found some errors in the data so using a different source**

In [35]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

### Import Kaggle Data
Data source is https://www.kaggle.com/harshitagpt/us-presidents and was created by Harshita Gupta

In [36]:
input_file = 'raw_data/us_presidents.csv'
df = pd.read_csv(input_file)
df.head()

Unnamed: 0.1,Unnamed: 0,S.No.,start,end,president,prior,party,vice
0,0,1,"April 30, 1789","March 4, 1797",George Washington,Commander-in-Chief of the Continental Army ...,Nonpartisan [13],John Adams
1,1,2,"March 4, 1797","March 4, 1801",John Adams,1st Vice President of the United States,Federalist,Thomas Jefferson
2,2,3,"March 4, 1801","March 4, 1809",Thomas Jefferson,2nd Vice President of the United States,Democratic- Republican,Aaron Burr
3,3,4,"March 4, 1809","March 4, 1817",James Madison,5th United States Secretary of State (1801–...,Democratic- Republican,George Clinton
4,4,5,"March 4, 1817","March 4, 1825",James Monroe,7th United States Secretary of State (1811–...,Democratic- Republican,Daniel D. Tompkins


### Remove Unnecessary Columns

In [37]:
df = df[['president', 'party', 'start', 'end']]
df.head()

Unnamed: 0,president,party,start,end
0,George Washington,Nonpartisan [13],"April 30, 1789","March 4, 1797"
1,John Adams,Federalist,"March 4, 1797","March 4, 1801"
2,Thomas Jefferson,Democratic- Republican,"March 4, 1801","March 4, 1809"
3,James Madison,Democratic- Republican,"March 4, 1809","March 4, 1817"
4,James Monroe,Democratic- Republican,"March 4, 1817","March 4, 1825"


In [38]:
df

Unnamed: 0,president,party,start,end
0,George Washington,Nonpartisan [13],"April 30, 1789","March 4, 1797"
1,John Adams,Federalist,"March 4, 1797","March 4, 1801"
2,Thomas Jefferson,Democratic- Republican,"March 4, 1801","March 4, 1809"
3,James Madison,Democratic- Republican,"March 4, 1809","March 4, 1817"
4,James Monroe,Democratic- Republican,"March 4, 1817","March 4, 1825"
5,John Quincy Adams,Democratic- Republican,"March 4, 1825","March 4, 1829"
6,Andrew Jackson,Democratic,"March 4, 1829","March 4, 1837"
7,Martin Van Buren,Democratic,"March 4, 1837","March 4, 1841"
8,William Henry Harrison,Whig,"March 4, 1841","April 4, 1841"
9,John Tyler,"Whig April 4, 1841 – September 13, 1841","April 4, 1841","March 4, 1845"


### Convert Start and End Dates to Years

In [25]:
df['start_yr'] = pd.to_datetime(df['start'], errors='coerce').dt.year
df['end_yr'] = pd.to_datetime(df['end'], errors='coerce').dt.year
df.head()

Unnamed: 0,president,party,start,end,start_yr,end_yr
0,George Washington,Nonpartisan [13],"April 30, 1789","March 4, 1797",1789,1797.0
1,John Adams,Federalist,"March 4, 1797","March 4, 1801",1797,1801.0
2,Thomas Jefferson,Democratic- Republican,"March 4, 1801","March 4, 1809",1801,1809.0
3,James Madison,Democratic- Republican,"March 4, 1809","March 4, 1817",1809,1817.0
4,James Monroe,Democratic- Republican,"March 4, 1817","March 4, 1825",1817,1825.0


### Calculate Potential Election Years

In [26]:
df['election_yr'] = df['start_yr'] - 1
df.loc[df['president'] == 'George Washington', 'election_yr'] = df['start_yr']
df.head()

Unnamed: 0,president,party,start,end,start_yr,end_yr,election_yr
0,George Washington,Nonpartisan [13],"April 30, 1789","March 4, 1797",1789,1797.0,1789
1,John Adams,Federalist,"March 4, 1797","March 4, 1801",1797,1801.0,1796
2,Thomas Jefferson,Democratic- Republican,"March 4, 1801","March 4, 1809",1801,1809.0,1800
3,James Madison,Democratic- Republican,"March 4, 1809","March 4, 1817",1809,1817.0,1808
4,James Monroe,Democratic- Republican,"March 4, 1817","March 4, 1825",1817,1825.0,1816


### Calculate Terms Served

In [34]:
df['election_yr'].shift(-1) - df['election_yr']

0      7.0
1      4.0
2      8.0
3      8.0
4      8.0
5      4.0
6      8.0
7      4.0
8      0.0
9      4.0
10     4.0
11     1.0
12     3.0
13     4.0
14     4.0
15     4.0
16     4.0
17     8.0
18     4.0
19     0.0
20     4.0
21     4.0
22     4.0
23     4.0
24     4.0
25     8.0
26     4.0
27     8.0
28     2.0
29     6.0
30     4.0
31    12.0
32     8.0
33     8.0
34     2.0
35     6.0
36     5.0
37     3.0
38     4.0
39     8.0
40     4.0
41     8.0
42     8.0
43     8.0
44     NaN
Name: election_yr, dtype: float64

In [28]:
(df['end_yr'] - df['start_yr']) // 4

0     2.0
1     1.0
2     2.0
3     2.0
4     2.0
5     1.0
6     2.0
7     1.0
8     0.0
9     1.0
10    1.0
11    0.0
12    0.0
13    1.0
14    1.0
15    1.0
16    1.0
17    2.0
18    1.0
19    0.0
20    1.0
21    1.0
22    1.0
23    1.0
24    1.0
25    2.0
26    1.0
27    2.0
28    0.0
29    1.0
30    1.0
31    2.0
32    2.0
33    2.0
34    0.0
35    1.0
36    1.0
37    0.0
38    1.0
39    2.0
40    1.0
41    2.0
42    2.0
43    NaN
44    NaN
dtype: float64

### Create DataFrame of Election Years and Merge Presidents Data

In [20]:
election_years = [1789] + list(range(1792, df['start_yr'].max(), 4))
election_df = pd.DataFrame(election_years, columns=['year'])
election_df = pd.merge_asof(election_df,
                            df,
                            left_on='year',
                            right_on='election_yr',
                            direction='backward')
election_df

Unnamed: 0,year,president,party,start,end,start_yr,end_yr,election_yr
0,1789,George Washington,Nonpartisan [13],"April 30, 1789","March 4, 1797",1789,1797.0,1789
1,1792,George Washington,Nonpartisan [13],"April 30, 1789","March 4, 1797",1789,1797.0,1789
2,1796,John Adams,Federalist,"March 4, 1797","March 4, 1801",1797,1801.0,1796
3,1800,Thomas Jefferson,Democratic- Republican,"March 4, 1801","March 4, 1809",1801,1809.0,1800
4,1804,Thomas Jefferson,Democratic- Republican,"March 4, 1801","March 4, 1809",1801,1809.0,1800
5,1808,James Madison,Democratic- Republican,"March 4, 1809","March 4, 1817",1809,1817.0,1808
6,1812,James Madison,Democratic- Republican,"March 4, 1809","March 4, 1817",1809,1817.0,1808
7,1816,James Monroe,Democratic- Republican,"March 4, 1817","March 4, 1825",1817,1825.0,1816
8,1820,James Monroe,Democratic- Republican,"March 4, 1817","March 4, 1825",1817,1825.0,1816
9,1824,John Quincy Adams,Democratic- Republican,"March 4, 1825","March 4, 1829",1825,1829.0,1824


Fixes:
1840: John Tyler should be William Harrison
1864: should be Lincoln
1880: Garfield
1900: McKinley
1944: FDR