In [1]:
import pandas as pd

## Import the csv file of voting data from NYS for the 2016 election

In [2]:
voting_df=pd.read_csv('../data/Raw/2016President_1.csv')

## Look at the shape of the data frame

In [3]:
voting_df.shape

(70, 54)

## Check the county in each row
* there are less than 70 counties in NYS, check which rows are not representative of a county
* look for the rows that need to be removed

In [4]:
voting_df['County'].unique()

array([' Albany', ' Allegany', ' Broome', ' Cattaraugus', ' Cayuga',
       ' Chautauqua', ' Chemung', ' Chenango', ' Clinton', ' Columbia',
       ' Cortland', ' Delaware', ' Dutchess', ' Erie', ' Essex',
       ' Franklin', ' Fulton', ' Genesee', ' Greene', ' Hamilton',
       ' Herkimer', ' Jefferson', ' Lewis', ' Livingston', ' Madison',
       ' Monroe', ' Montgomery', ' Nassau', ' Niagara', ' Oneida',
       ' Onondaga', ' Ontario', ' Orange', nan, 'County', ' Orleans',
       ' Oswego', ' Otsego', ' Putnam', ' Rensselaer', ' Rockland',
       ' Saratoga', ' Schenectady', ' Schoharie', ' Schuyler', ' Seneca',
       ' St. Lawrence', ' Steuben', ' Suffolk', ' Sullivan', ' Tioga',
       ' Tompkins', ' Ulster', ' Warren', ' Washington', ' Wayne',
       ' Westchester', ' Wyoming', ' Yates', 'Total Outside NYC',
       ' Bronx', ' Kings', ' New York', ' Queens', ' Richmond',
       'Total NYC', 'Statewide Total', 'RECAP'], dtype=object)

## Check the first 5 rows to see what columns there are

In [5]:
voting_df.head()

Unnamed: 0,County,Clinton Kaine,Trump Pence,Trump Pence.1,Stein Baraka,Clinton Kaine.1,Johnson Weld,Clinton Kaine.2,Johnson Weld.1,Unnamed: 9,...,Emidio Soltysik,Tony Valdivia,J. J. Vogel-Walcutt,Esther Welsh,Barbara Whitaker,Robert M. Wolff,Blank,Void,Scattering,Total
0,Albany,78504,42554,5254,2475,3419,3099,1148,1693,,...,3,1,0,0,0,0,1350,145,1318,141313
1,Allegany,4646,11421,1104,275,173,527,63,208,,...,0,0,0,0,0,0,178,2,176,18872
2,Broome,37106,37337,3606,1546,1631,2187,475,1059,,...,1,0,0,0,0,1,1038,67,864,87177
3,Cattaraugus,9077,17485,2207,440,274,932,146,314,,...,0,0,0,0,0,0,320,72,205,31553
4,Cayuga,12779,15316,2068,501,541,1008,202,350,,...,1,0,0,0,0,1,259,16,307,33447


## First get rid of the rows that are null in the county column
* assign "missing_county" to the rows in the county column which are null

In [6]:
missing_county=voting_df['County'].isnull()

* check which rows "is null" is true for

In [7]:
missing_county

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
      ...  
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
51    False
52    False
53    False
54    False
55    False
56    False
57    False
58    False
59    False
60    False
61    False
62    False
63    False
64    False
65    False
66    False
67    False
68     True
69    False
Name: County, Length: 70, dtype: bool

* remove those rows from the data frame

In [8]:
voting_df2=voting_df[-missing_county]

* check the shape now to see how many rows have been removed

In [9]:
voting_df2.shape

(67, 54)

## Check the rows in the County column now to see what else needs to be removed

In [10]:
voting_df2['County'].unique()

array([' Albany', ' Allegany', ' Broome', ' Cattaraugus', ' Cayuga',
       ' Chautauqua', ' Chemung', ' Chenango', ' Clinton', ' Columbia',
       ' Cortland', ' Delaware', ' Dutchess', ' Erie', ' Essex',
       ' Franklin', ' Fulton', ' Genesee', ' Greene', ' Hamilton',
       ' Herkimer', ' Jefferson', ' Lewis', ' Livingston', ' Madison',
       ' Monroe', ' Montgomery', ' Nassau', ' Niagara', ' Oneida',
       ' Onondaga', ' Ontario', ' Orange', 'County', ' Orleans',
       ' Oswego', ' Otsego', ' Putnam', ' Rensselaer', ' Rockland',
       ' Saratoga', ' Schenectady', ' Schoharie', ' Schuyler', ' Seneca',
       ' St. Lawrence', ' Steuben', ' Suffolk', ' Sullivan', ' Tioga',
       ' Tompkins', ' Ulster', ' Warren', ' Washington', ' Wayne',
       ' Westchester', ' Wyoming', ' Yates', 'Total Outside NYC',
       ' Bronx', ' Kings', ' New York', ' Queens', ' Richmond',
       'Total NYC', 'Statewide Total', 'RECAP'], dtype=object)

## Remove more unnecessary rows
* assign names to the rows that are totals of the multiple counties added together as well as the "recap" and "county" rows
* remove those rows from the data frame
* reprint the data frame

In [11]:
contains_total = voting_df2['County'].str.contains('Total')
contains_RECAP = voting_df2['County'].str.contains('RECAP')
contains_county = voting_df2['County']=='County'
voting_df3=voting_df2[-contains_total & -contains_RECAP & -contains_county]
voting_df3

Unnamed: 0,County,Clinton Kaine,Trump Pence,Trump Pence.1,Stein Baraka,Clinton Kaine.1,Johnson Weld,Clinton Kaine.2,Johnson Weld.1,Unnamed: 9,...,Emidio Soltysik,Tony Valdivia,J. J. Vogel-Walcutt,Esther Welsh,Barbara Whitaker,Robert M. Wolff,Blank,Void,Scattering,Total
0,Albany,78504,42554,5254,2475,3419,3099,1148,1693,,...,3,1,0,0,0,0,1350,145,1318,141313
1,Allegany,4646,11421,1104,275,173,527,63,208,,...,0,0,0,0,0,0,178,2,176,18872
2,Broome,37106,37337,3606,1546,1631,2187,475,1059,,...,1,0,0,0,0,1,1038,67,864,87177
3,Cattaraugus,9077,17485,2207,440,274,932,146,314,,...,0,0,0,0,0,0,320,72,205,31553
4,Cayuga,12779,15316,2068,501,541,1008,202,350,,...,1,0,0,0,0,1,259,16,307,33447
5,Chautauqua,18112,27516,4078,757,704,1601,275,602,,...,0,0,0,0,0,0,576,0,485,54810
6,Chemung,13277,18347,1750,463,326,1012,154,435,,...,0,0,0,0,0,0,395,0,261,36514
7,Chenango,6417,10849,1072,358,258,634,100,282,,...,0,0,0,0,0,0,148,11,169,20319
8,Clinton,14217,13312,1137,613,634,964,208,348,,...,2,0,0,0,0,0,2414,17,602,34536
9,Columbia,14146,11867,1889,598,865,626,273,318,,...,0,0,0,0,0,0,230,20,238,31152


## Check the list of counties again to see what else needs to be fixed

In [12]:
voting_df3['County'].unique()

array([' Albany', ' Allegany', ' Broome', ' Cattaraugus', ' Cayuga',
       ' Chautauqua', ' Chemung', ' Chenango', ' Clinton', ' Columbia',
       ' Cortland', ' Delaware', ' Dutchess', ' Erie', ' Essex',
       ' Franklin', ' Fulton', ' Genesee', ' Greene', ' Hamilton',
       ' Herkimer', ' Jefferson', ' Lewis', ' Livingston', ' Madison',
       ' Monroe', ' Montgomery', ' Nassau', ' Niagara', ' Oneida',
       ' Onondaga', ' Ontario', ' Orange', ' Orleans', ' Oswego',
       ' Otsego', ' Putnam', ' Rensselaer', ' Rockland', ' Saratoga',
       ' Schenectady', ' Schoharie', ' Schuyler', ' Seneca',
       ' St. Lawrence', ' Steuben', ' Suffolk', ' Sullivan', ' Tioga',
       ' Tompkins', ' Ulster', ' Warren', ' Washington', ' Wayne',
       ' Westchester', ' Wyoming', ' Yates', ' Bronx', ' Kings',
       ' New York', ' Queens', ' Richmond'], dtype=object)

### Remove the extra spaces from around the words by using the strip function

In [13]:
voting_df3['County']=voting_df3['County'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Drop the columns that have no values (NaN)

In [14]:
voting_df3.dropna(axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
voting_df3.head(20)

Unnamed: 0,County,Clinton Kaine,Trump Pence,Trump Pence.1,Stein Baraka,Clinton Kaine.1,Johnson Weld,Clinton Kaine.2,Johnson Weld.1,County.1,...,Emidio Soltysik,Tony Valdivia,J. J. Vogel-Walcutt,Esther Welsh,Barbara Whitaker,Robert M. Wolff,Blank,Void,Scattering,Total
0,Albany,78504,42554,5254,2475,3419,3099,1148,1693,Albany,...,3,1,0,0,0,0,1350,145,1318,141313
1,Allegany,4646,11421,1104,275,173,527,63,208,Allegany,...,0,0,0,0,0,0,178,2,176,18872
2,Broome,37106,37337,3606,1546,1631,2187,475,1059,Broome,...,1,0,0,0,0,1,1038,67,864,87177
3,Cattaraugus,9077,17485,2207,440,274,932,146,314,Cattaraugus,...,0,0,0,0,0,0,320,72,205,31553
4,Cayuga,12779,15316,2068,501,541,1008,202,350,Cayuga,...,1,0,0,0,0,1,259,16,307,33447
5,Chautauqua,18112,27516,4078,757,704,1601,275,602,Chautauqua,...,0,0,0,0,0,0,576,0,485,54810
6,Chemung,13277,18347,1750,463,326,1012,154,435,Chemung,...,0,0,0,0,0,0,395,0,261,36514
7,Chenango,6417,10849,1072,358,258,634,100,282,Chenango,...,0,0,0,0,0,0,148,11,169,20319
8,Clinton,14217,13312,1137,613,634,964,208,348,Clinton,...,2,0,0,0,0,0,2414,17,602,34536
9,Columbia,14146,11867,1889,598,865,626,273,318,Columbia,...,0,0,0,0,0,0,230,20,238,31152


* There are still a lot of columns that are not needed for the data analysis

## Look at the column names to see what needs to be fixed

In [16]:
voting_df3.columns

Index(['County', 'Clinton   Kaine', 'Trump    Pence', 'Trump    Pence.1',
       'Stein   Baraka', 'Clinton   Kaine.1', 'Johnson  Weld',
       'Clinton   Kaine.2', 'Johnson  Weld.1', 'County.1', 'Arantxa Aranja',
       'Neer R. Asherie', 'Mark Blickley', 'Robert L. Buchanan',
       'Gary S. Canns', 'Willie Carter', 'Darrell Castle', 'Ariel Cohen',
       'William J. Connolly', '"Rocky" Rogue De La Fuente', 'Jason Fried',
       'Zoltan Istvan Gyurko', 'County.2', 'Ben Hartnell', 'Tom Hoefling',
       'Michael Frederick Ingbar', 'Lynn Kahn', 'Chris Keniston',
       'Gloria La Riva', 'Jeffrey Mackler', 'Michael A. Maturen',
       'Evan McMullin', 'Monica Moorehead', 'Jason Mutford', 'Clifton Roberts',
       'County.3', 'Marshall Schoenke', 'Ryan Alan Scott', 'Emidio Soltysik',
       'Tony Valdivia', 'J. J. Vogel-Walcutt', 'Esther Welsh',
       'Barbara Whitaker', 'Robert M. Wolff', 'Blank', 'Void', 'Scattering',
       'Total'],
      dtype='object')

## Clean up the column names by replacing abnormal amounts of spaces with dashes and calling that new_colnames

In [17]:
new_colnames=voting_df3.columns.str.replace('\s{2,}','-')
new_colnames

Index(['County', 'Clinton-Kaine', 'Trump-Pence', 'Trump-Pence.1',
       'Stein-Baraka', 'Clinton-Kaine.1', 'Johnson-Weld', 'Clinton-Kaine.2',
       'Johnson-Weld.1', 'County.1', 'Arantxa Aranja', 'Neer R. Asherie',
       'Mark Blickley', 'Robert L. Buchanan', 'Gary S. Canns', 'Willie Carter',
       'Darrell Castle', 'Ariel Cohen', 'William J. Connolly',
       '"Rocky" Rogue De La Fuente', 'Jason Fried', 'Zoltan Istvan Gyurko',
       'County.2', 'Ben Hartnell', 'Tom Hoefling', 'Michael Frederick Ingbar',
       'Lynn Kahn', 'Chris Keniston', 'Gloria La Riva', 'Jeffrey Mackler',
       'Michael A. Maturen', 'Evan McMullin', 'Monica Moorehead',
       'Jason Mutford', 'Clifton Roberts', 'County.3', 'Marshall Schoenke',
       'Ryan Alan Scott', 'Emidio Soltysik', 'Tony Valdivia',
       'J. J. Vogel-Walcutt', 'Esther Welsh', 'Barbara Whitaker',
       'Robert M. Wolff', 'Blank', 'Void', 'Scattering', 'Total'],
      dtype='object')

## Replace the column names in the dataframe with new_colnames

In [18]:
voting_df3.columns=new_colnames

### Check the columns to see if there are any that are not needed in the data frame

In [19]:
voting_df3.columns

Index(['County', 'Clinton-Kaine', 'Trump-Pence', 'Trump-Pence.1',
       'Stein-Baraka', 'Clinton-Kaine.1', 'Johnson-Weld', 'Clinton-Kaine.2',
       'Johnson-Weld.1', 'County.1', 'Arantxa Aranja', 'Neer R. Asherie',
       'Mark Blickley', 'Robert L. Buchanan', 'Gary S. Canns', 'Willie Carter',
       'Darrell Castle', 'Ariel Cohen', 'William J. Connolly',
       '"Rocky" Rogue De La Fuente', 'Jason Fried', 'Zoltan Istvan Gyurko',
       'County.2', 'Ben Hartnell', 'Tom Hoefling', 'Michael Frederick Ingbar',
       'Lynn Kahn', 'Chris Keniston', 'Gloria La Riva', 'Jeffrey Mackler',
       'Michael A. Maturen', 'Evan McMullin', 'Monica Moorehead',
       'Jason Mutford', 'Clifton Roberts', 'County.3', 'Marshall Schoenke',
       'Ryan Alan Scott', 'Emidio Soltysik', 'Tony Valdivia',
       'J. J. Vogel-Walcutt', 'Esther Welsh', 'Barbara Whitaker',
       'Robert M. Wolff', 'Blank', 'Void', 'Scattering', 'Total'],
      dtype='object')

### Make a list of the columns that should be kept in the data frame

In [20]:
cols_to_keep= ['County', 'Clinton-Kaine', 'Trump-Pence', 'Trump-Pence.1',
       'Stein-Baraka', 'Clinton-Kaine.1', 'Johnson-Weld', 'Clinton-Kaine.2',
       'Johnson-Weld.1','Arantxa Aranja', 'Neer R. Asherie',
       'Mark Blickley', 'Robert L. Buchanan', 'Gary S. Canns', 'Willie Carter',
       'Darrell Castle', 'Ariel Cohen', 'William J. Connolly',
       '"Rocky" Rogue De La Fuente', 'Jason Fried', 'Zoltan Istvan Gyurko', 'Ben Hartnell', 'Tom Hoefling', 'Michael Frederick Ingbar',
       'Lynn Kahn', 'Chris Keniston', 'Gloria La Riva', 'Jeffrey Mackler',
       'Michael A. Maturen', 'Evan McMullin', 'Monica Moorehead',
       'Jason Mutford', 'Clifton Roberts','Marshall Schoenke',
       'Ryan Alan Scott', 'Emidio Soltysik', 'Tony Valdivia',
       'J. J. Vogel-Walcutt', 'Esther Welsh', 'Barbara Whitaker',
       'Robert M. Wolff']

### Create a new version of that data frame made only of the columns that should be kept

In [21]:
voting_df4=voting_df3[cols_to_keep]

### Look at the new data frame

In [22]:
voting_df4

Unnamed: 0,County,Clinton-Kaine,Trump-Pence,Trump-Pence.1,Stein-Baraka,Clinton-Kaine.1,Johnson-Weld,Clinton-Kaine.2,Johnson-Weld.1,Arantxa Aranja,...,Jason Mutford,Clifton Roberts,Marshall Schoenke,Ryan Alan Scott,Emidio Soltysik,Tony Valdivia,J. J. Vogel-Walcutt,Esther Welsh,Barbara Whitaker,Robert M. Wolff
0,Albany,78504,42554,5254,2475,3419,3099,1148,1693,0,...,14,3,0,0,3,1,0,0,0,0
1,Allegany,4646,11421,1104,275,173,527,63,208,0,...,0,0,0,0,0,0,0,0,0,0
2,Broome,37106,37337,3606,1546,1631,2187,475,1059,0,...,0,0,0,0,1,0,0,0,0,1
3,Cattaraugus,9077,17485,2207,440,274,932,146,314,0,...,0,0,0,0,0,0,0,0,0,0
4,Cayuga,12779,15316,2068,501,541,1008,202,350,0,...,0,0,0,0,1,0,0,0,0,1
5,Chautauqua,18112,27516,4078,757,704,1601,275,602,0,...,0,0,0,0,0,0,0,0,0,0
6,Chemung,13277,18347,1750,463,326,1012,154,435,0,...,0,0,0,0,0,0,0,0,0,0
7,Chenango,6417,10849,1072,358,258,634,100,282,0,...,0,0,0,0,0,0,0,0,0,0
8,Clinton,14217,13312,1137,613,634,964,208,348,0,...,0,0,0,0,2,0,0,0,0,0
9,Columbia,14146,11867,1889,598,865,626,273,318,0,...,1,0,0,0,0,0,0,0,0,0


### The data in the candidate columns need to be recognized as integers (to be added later) rather than strings
* create a for loop that shows the columns that are being fixed and changes their type to integer

In [23]:
for candidate in cols_to_keep[1:]:
    print('Fixing type of column:', candidate)
    voting_df4[candidate]=voting_df4[candidate].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Fixing type of column: Clinton-Kaine
Fixing type of column: Trump-Pence
Fixing type of column: Trump-Pence.1
Fixing type of column: Stein-Baraka
Fixing type of column: Clinton-Kaine.1
Fixing type of column: Johnson-Weld
Fixing type of column: Clinton-Kaine.2
Fixing type of column: Johnson-Weld.1
Fixing type of column: Arantxa Aranja
Fixing type of column: Neer R. Asherie
Fixing type of column: Mark Blickley
Fixing type of column: Robert L. Buchanan
Fixing type of column: Gary S. Canns
Fixing type of column: Willie Carter
Fixing type of column: Darrell Castle
Fixing type of column: Ariel Cohen
Fixing type of column: William J. Connolly
Fixing type of column: "Rocky" Rogue De La Fuente
Fixing type of column: Jason Fried
Fixing type of column: Zoltan Istvan Gyurko
Fixing type of column: Ben Hartnell
Fixing type of column: Tom Hoefling
Fixing type of column: Michael Frederick Ingbar
Fixing type of column: Lynn Kahn
Fixing type of column: Chris Keniston
Fixing type of column: Gloria La Riva

## Add the three Clinton-Kaine columns together to get the total number of those that voted for Clinton in each county in a new column called "Clinton-Kaine-Total"

In [24]:
voting_df4['Clinton-Kaine-Total'] = voting_df4[['Clinton-Kaine','Clinton-Kaine.1','Clinton-Kaine.2']].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Add the three Trump-Pence columns together to get the total number of those that voted for Trump in each county in a new column called "Trump-Pence-Total"

In [25]:
voting_df4['Trump-Pence-Total'] = voting_df4[['Trump-Pence', 'Trump-Pence.1']].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Add the columns of all the other candidates together together to get the total number of those that voted for "other" in each county in a new column called "Other"

In [26]:
voting_df4['Other'] = voting_df4[['Stein-Baraka','Johnson-Weld','Johnson-Weld.1','Arantxa Aranja', 'Neer R. Asherie',
       'Mark Blickley', 'Robert L. Buchanan', 'Gary S. Canns', 'Willie Carter',
       'Darrell Castle', 'Ariel Cohen', 'William J. Connolly',
       '"Rocky" Rogue De La Fuente', 'Jason Fried', 'Zoltan Istvan Gyurko', 'Ben Hartnell', 'Tom Hoefling', 'Michael Frederick Ingbar',
       'Lynn Kahn', 'Chris Keniston', 'Gloria La Riva', 'Jeffrey Mackler',
       'Michael A. Maturen', 'Evan McMullin', 'Monica Moorehead',
       'Jason Mutford', 'Clifton Roberts','Marshall Schoenke',
       'Ryan Alan Scott', 'Emidio Soltysik', 'Tony Valdivia',
       'J. J. Vogel-Walcutt', 'Esther Welsh', 'Barbara Whitaker',
       'Robert M. Wolff']].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


### Look at the data frame with its new columns

In [27]:
voting_df4

Unnamed: 0,County,Clinton-Kaine,Trump-Pence,Trump-Pence.1,Stein-Baraka,Clinton-Kaine.1,Johnson-Weld,Clinton-Kaine.2,Johnson-Weld.1,Arantxa Aranja,...,Ryan Alan Scott,Emidio Soltysik,Tony Valdivia,J. J. Vogel-Walcutt,Esther Welsh,Barbara Whitaker,Robert M. Wolff,Clinton-Kaine-Total,Trump-Pence-Total,Other
0,Albany,78504,42554,5254,2475,3419,3099,1148,1693,0,...,0,3,1,0,0,0,0,83071,47808,7621
1,Allegany,4646,11421,1104,275,173,527,63,208,0,...,0,0,0,0,0,0,0,4882,12525,1109
2,Broome,37106,37337,3606,1546,1631,2187,475,1059,0,...,0,1,0,0,0,0,1,39212,40943,5053
3,Cattaraugus,9077,17485,2207,440,274,932,146,314,0,...,0,0,0,0,0,0,0,9497,19692,1767
4,Cayuga,12779,15316,2068,501,541,1008,202,350,0,...,0,1,0,0,0,0,1,13522,17384,1959
5,Chautauqua,18112,27516,4078,757,704,1601,275,602,0,...,0,0,0,0,0,0,0,19091,31594,3064
6,Chemung,13277,18347,1750,463,326,1012,154,435,0,...,0,0,0,0,0,0,0,13757,20097,2004
7,Chenango,6417,10849,1072,358,258,634,100,282,0,...,0,0,0,0,0,0,0,6775,11921,1295
8,Clinton,14217,13312,1137,613,634,964,208,348,0,...,0,2,0,0,0,0,0,15059,14449,1995
9,Columbia,14146,11867,1889,598,865,626,273,318,0,...,0,0,0,0,0,0,0,15284,13756,1624


## Create another list of columns to keep to be put in a new data frame
* the columns to keep are the county and the total votes for Clinton, Trump, and Other

In [28]:
cols_to_keep2 = ['County', 'Clinton-Kaine-Total', 'Trump-Pence-Total', 'Other']

### Create a new data frame made of just the columns that were selected to be kept

In [29]:
voting_df5 = voting_df4[cols_to_keep2]

### Look at the new data frame to make sure it is complete

In [30]:
voting_df5

Unnamed: 0,County,Clinton-Kaine-Total,Trump-Pence-Total,Other
0,Albany,83071,47808,7621
1,Allegany,4882,12525,1109
2,Broome,39212,40943,5053
3,Cattaraugus,9497,19692,1767
4,Cayuga,13522,17384,1959
5,Chautauqua,19091,31594,3064
6,Chemung,13757,20097,2004
7,Chenango,6775,11921,1295
8,Clinton,15059,14449,1995
9,Columbia,15284,13756,1624


### Save the newest data frame as a csv to be used in other notebooks

In [31]:
voting_df5.to_csv('../data/2016President_clean.csv', index=False)