# Pandas

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html

In [None]:
# Modules to import
import pandas as pd
import openpyxl

In [None]:
# Initialise the raw data: one list per column.
names = ['Edward', 'Abby', 'Chris', 'Dean', 'Bert']
ages = [12, 21, 19, 52, 34]
data = {'Name': names, 'Age': ages}

# Build the DataFrame from the column dictionary and display it
df = pd.DataFrame(data)
df

In [None]:
# Adding a column: the single value is repeated on every row
df = df.assign(Country='United States')
df

In [None]:
# Making a change based on a condition: only the row(s) named Chris get a new country
is_chris = df['Name'] == 'Chris'
df.loc[is_chris, 'Country'] = 'Mexico'
df

In [None]:
# Re-sorting the rows by the Name column (ascending)
df = df.sort_values('Name')
df

In [None]:
# Adding a new row
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row DataFrame and concatenated instead; ignore_index=True renumbers the
# rows 0..n-1.
# Note: Age is given here as the string '46', so the Age column ends up holding
# a mix of ints and a str until it is converted in a later cell.
new_row = {'Name':'Frank','Age':'46','Country':'Denmark'}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
df

In [None]:
# Renaming a column: Country becomes Location
df = df.rename(columns={'Country': 'Location'})
df

In [None]:
# Adding a column and assigning based on another column
# The row added for Frank stores Age as the string '46', so comparing it with
# the integer 25 raises TypeError. The error is caught and printed so the
# notebook can still run top to bottom; the dtypes are inspected and fixed in
# the cells that follow, after which this assignment is repeated.
try:
    df['Life_Stage'] = ['Old' if x > 25 else 'Young' for x in df['Age']]
except TypeError as err:
    print(f"TypeError: {err}")
df

In [None]:
# Looking at data types: shows the dtype pandas holds for each column of df
df.dtypes

<center><a href="https://pbpython.com/images/pandas_dtypes.png">Data Types</a></center>
<img src="https://pbpython.com/images/pandas_dtypes.png">

In [None]:
# Converting Age to a 64-bit integer column, then re-checking the dtypes
df = df.astype({'Age': 'int64'})
df.dtypes

In [None]:
# Adding a column and assigning based on another column:
# ages above 25 are labelled 'Old', everything else 'Young'
df['Life_Stage'] = df['Age'].map(lambda age: 'Old' if age > 25 else 'Young')
df

In [None]:
# Looking at group by: median of the numeric columns for each Life_Stage
# numeric_only=True leaves out the text columns (Name, Location); without it
# pandas 2.0+ tries to take the median of strings and raises TypeError.
df.groupby(['Life_Stage']).median(numeric_only=True)

In [None]:
# Another group by: median of the numeric columns for each Location
# numeric_only=True leaves out the text columns (Name, Life_Stage); without it
# pandas 2.0+ tries to take the median of strings and raises TypeError.
df.groupby(['Location']).median(numeric_only=True)

In [None]:
# Rudimentary chart: one bar per row, bar height is the Age value
chart = df['Age'].plot(kind='bar')
chart;

In [None]:
# Creating a new dataframe that keeps only the United States rows of the first one
in_united_states = df['Location'] == 'United States'
df2 = df[in_united_states]
df2

In [None]:
# Exporting as a CSV: comma-separated, and the row index is not written out
df2.to_csv('book1.csv', index=False, sep=',')

In [None]:
# Write it to Excel: one workbook, one sheet per frame, row index left out
sheets = {'All': df, 'US Residents': df2}
with pd.ExcelWriter('Demographics.xlsx', engine='openpyxl') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Wide data transformation: one row per date, one column per series
wide_columns = {
    'date': ['05/03', '06/03', '07/03', '08/03'],
    'AA': [1, 4, 7, 5],
    'BB': [2, 5, 8, 7],
    'CC': [3, 6, 9, 1],
}
df = pd.DataFrame(wide_columns)
# Use the date column as the row index
df = df.set_index('date')
df

In [None]:
# Wide to long
# melt refers to 'date' as a column, so move it out of the index first
# Then keep date as the identifier and unpivot AA/BB/CC into variable/value rows
df = df.reset_index()
wide_to_long = df.melt(id_vars='date', value_vars=['AA', 'BB', 'CC'])
wide_to_long

In [None]:
# Long to wide
# How to accomplish a pivot with pandas: one row per date, one column per
# variable, then turn the date index back into an ordinary column
pivoted = wide_to_long.pivot_table(values='value', index=['date'], columns='variable')
long_to_wide = pivoted.reset_index()
long_to_wide

In [None]:
# Working with data: one record per person (name, age, country, hobby)
people = [
    ('Kristin Ruark', 90, 'Saint Pierre and Miquelon', 'Candle making'),
    ('Leonila Maskell', 97, 'Antarctica', 'Candy making'),
    ('Dena Streetman', 23, 'Finland', 'Car fixing & building'),
    ('Aleen Espinoza', 61, 'Comoros', 'Card games'),
    ('Venetta Denison', 41, 'Anguilla', 'Cardistry'),
    ('Paz Dowless', 86, 'Italy', 'Ceramics'),
    ('Lorette Landa', 82, 'Virgin Islands (U.S.)', 'Chatting'),
    ('Lamonica Vogl', 24, 'Liechtenstein', 'Cheesemaking'),
    ('Keely Harbin', 19, 'Liberia', 'Chess'),
    ('Lourdes Rathjen', 89, 'Cook Islands', 'Cleaning'),
    ('Lorilee Birge', 24, 'Albania', 'Clothesmaking'),
    ('Guillermo Borquez', 62, 'Maldives', 'Coffee roasting'),
    ('Wayne Beggs', 67, 'Senegal', 'Collecting'),
    ('Corrie Kowalewski', 79, 'El Salvador', 'Coloring'),
]
df = pd.DataFrame(people, columns=['Name', 'Age', 'Country', 'Hobby'])
df

In [None]:
# Splitting Name into first and last
# n=1 splits on the first space only, so a name containing more than one space
# no longer produces extra columns that would break the two-column assignment;
# everything after the first space stays together in Last_Name.
name_parts = df['Name'].str.split(' ', n=1, expand=True)
df[['Name', 'Last_Name']] = name_parts
# The Name column now holds only the first part, so rename it to match
df = df.rename(columns={'Name': 'First_Name'})
df

In [None]:
# Reindexing: put the columns into the order they should be displayed in
column_order = ['First_Name', 'Last_Name', 'Age', 'Hobby', 'Country']
df = df.reindex(columns=column_order)
df

In [None]:
# Saving to Excel: a single sheet called 'Roster', without the row index
df.to_excel("Output.xlsx", sheet_name='Roster', index=False)

In [None]:
# Merging Dataframes together
# Three frames with the same columns (A-D) and consecutive, non-overlapping
# row labels; each cell is the column letter followed by the row label.
def _labelled_frame(first, last):
    """Return a frame with columns A-D whose cells read '<column><row label>'.

    Rows are labelled first..last inclusive, e.g. first=4, last=7 gives
    'A4'..'A7' in column A with index [4, 5, 6, 7].
    """
    row_labels = list(range(first, last + 1))
    cells = {col: [f"{col}{row}" for row in row_labels] for col in "ABCD"}
    return pd.DataFrame(cells, index=row_labels)


df1 = _labelled_frame(0, 3)
df2 = _labelled_frame(4, 7)
df3 = _labelled_frame(8, 11)
df1

In [None]:
# Display the second frame
df2

In [None]:
# Display the third frame
df3

In [None]:
# Merging operations: stack the three frames on top of each other, keeping
# each frame's own row labels
merged_list = [df1, df2, df3]
merged_df = pd.concat(objs=merged_list)
merged_df

## Putting this to work

In [None]:
# Importing into dataframes
# Data sourced from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/42MVDX&version=6.0
# Converted to an xlsx and tabbed by decade
# This could be from 1 spreadsheet, or other multiple sources
workbook = "1976-2020-president.xlsx"
decade_sheets = ["1970", "1980", "1990", "2000", "2010", "2020"]

# Read every decade tab in a loop rather than into hand-numbered variables:
# with one variable per sheet it is easy to reuse a name by mistake, which
# silently drops a whole decade (the "2000" tab) from the merged data.
merged_list = [pd.read_excel(workbook, sheet_name=sheet) for sheet in decade_sheets]
df = pd.concat(merged_list)
df

In [None]:
# Start transformations: give the abbreviated columns more readable names
column_renames = {
    'state_po': 'state_acronym',
    'candidatevotes': 'candidate_votes',
    'totalvotes': 'total_votes',
}
df = df.rename(columns=column_renames)
df

In [None]:
# Create one report looking at total votes by state by year
# numeric_only=True restricts the sum to numeric columns; without it pandas 2.0+
# also tries to "sum" the text columns instead of silently leaving them out.
report1_df = df.groupby(['year','state'], as_index=False).sum(numeric_only=True)
# Remove the summed columns that are not wanted in this report.
# errors='ignore' keeps the cell working if one of them was not numeric and is
# therefore already absent from the summed frame.
report1_df = report1_df.drop(
    columns=['state_fips', 'state_cen', 'total_votes', 'version', 'notes', 'state_ic'],
    errors='ignore',
)
report1_df

In [None]:
# Create one report looking at votes by candidate by year
# numeric_only=True restricts the sum to numeric columns; without it pandas 2.0+
# also tries to "sum" the text columns instead of silently leaving them out.
report2_df = df.groupby(['year','candidate'], as_index=False).sum(numeric_only=True)
# Remove the summed columns that are not wanted in this report.
# errors='ignore' keeps the cell working if one of them was not numeric and is
# therefore already absent from the summed frame.
report2_df = report2_df.drop(
    columns=['state_fips', 'state_cen', 'total_votes', 'version', 'notes', 'state_ic'],
    errors='ignore',
)
report2_df

In [None]:
# Creating a pivot: one row per year, one column per candidate, cells hold the
# summed candidate_votes
report3_df = df.pivot_table(index='year', columns='candidate',
                            values='candidate_votes', aggfunc='sum').round(0)
# Handling blanks and losing the trailing .0: year/candidate pairs with no rows
# come back as NaN, so fill them with the integer 0 (not the string '0', which
# would mix text into numeric columns) before casting everything to int.
report3_df = report3_df.fillna(0).astype(int)
report3_df

In [None]:
# Write it to Excel: one workbook with a sheet per report
# Each entry is (sheet name, frame, whether to write the row index); only the
# pivot writes its index.
report_sheets = [
    ('State Participation', report1_df, False),
    ('Candidate Votes', report2_df, False),
    ('Candidate Pivot', report3_df, True),
]
with pd.ExcelWriter('Presidential_Candidates.xlsx', engine='openpyxl') as writer:
    for sheet_name, report, keep_index in report_sheets:
        report.to_excel(writer, sheet_name=sheet_name, index=keep_index)

Some further reading: <br>
Dataframes from alternate sources: https://pandas.pydata.org/docs/user_guide/io.html <br>
Merging dataframes: https://pandas.pydata.org/docs/user_guide/merging.html <br>
Melting and reshaping: https://pandas.pydata.org/docs/user_guide/reshaping.html <br>
Working with Nulls (NaN): https://pandas.pydata.org/docs/user_guide/missing_data.html <br>