In [1]:
import altair as alt
import pandas as pd


In [2]:
# read in csv and filter to only have female
raw_data = pd.read_csv("clean.csv")
data = raw_data[raw_data["Sex"] == "F"]
data.head()

Unnamed: 0,Order,Family,Original order,Link to Claim,Last Name,Maiden Name,First Name,Relation to Head of Household,Age,Sex,Race,Occupation,Household Inventory # n=420,1375 people,Household,Family Notes,Age Range
0,1,RES.006,99.0,Link to Claim Images,Abbot,,Lydia,Head,?,F,W,?,1,1.0,Lydia Abbot,"Lydia claims, Isaac possibly dead, both live i...",Unknown
1,2,RES.008,195.0,Link to Claim Images,Abraham,,Martha,Head,54,F,W,?,2,1.0,Martha Abrahams (54),,50-54
3,4,RES.007,306.0,Link to Claim Images,Abraham,,Joanna,Head,?,F,W,Widow,4,1.0,Joanna Abraham,Daughter Elizabeth is 27 and may have moved ou...,Unknown
5,6,,,,Adams,Prescott,Susanna,Wife,?,F,W,,5,,,,Unknown
7,8,,,,Adams,Hall,Hannah,Wife,48,F,W,,6,,,,45-49


In [3]:
# check for all unique values
data["Relation to Head of Household"].unique()

array(['Head', 'Wife', 'daughter', 'Daughter', 'Sister', 'Enslaved',
       'wife', 'enslaved'], dtype=object)

In [4]:
# check to see which are marked as sisters
sisters = data[data["Relation to Head of Household"] == "Sister"]
sisters # based on the 3 results, we can drop them due to lack of data on them

Unnamed: 0,Order,Family,Original order,Link to Claim,Last Name,Maiden Name,First Name,Relation to Head of Household,Age,Sex,Race,Occupation,Household Inventory # n=420,1375 people,Household,Family Notes,Age Range
60,61,,,,Barker,,Elizabeth,Sister,?,F,W,,24,,,,Unknown
61,62,,,,Barker,,Ann,Sister,?,F,W,,24,,,,Unknown
1080,1081,,,,Sheafe,,Rebecca,Sister,60,F,W,,329,,,,60-64


In [5]:
# remove sister 
filtered_data = data[(data["Relation to Head of Household"] != "Sister")]
filtered_data["Relation to Head of Household"].unique()

array(['Head', 'Wife', 'daughter', 'Daughter', 'Enslaved', 'wife',
       'enslaved'], dtype=object)

In [6]:
# create a function to apply to the head of household column to create the marital status column
def marital_status(status):
    status = status.lower()
    if status == "wife":
        return "Married"
    elif status == "daughter" or status == "enslaved":
        return "Single"
    elif status == "head":
        return "Widow"

In [18]:
# apply function to create new column
filtered_data["Marital_Status"] = filtered_data["Relation to Head of Household"].apply(marital_status)
filtered_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["Marital_Status"] = filtered_data["Relation to Head of Household"].apply(marital_status)


Unnamed: 0,Order,Family,Original order,Link to Claim,Last Name,Maiden Name,First Name,Relation to Head of Household,Age,Sex,Race,Occupation,Household Inventory # n=420,1375 people,Household,Family Notes,Age Range,Marital_Status,Number of People in Family,Children
0,1,RES.006,99.0,Link to Claim Images,Abbot,,Lydia,Head,?,F,W,?,1,1.0,Lydia Abbot,"Lydia claims, Isaac possibly dead, both live i...",Unknown,Widow,1,0
1,2,RES.008,195.0,Link to Claim Images,Abraham,,Martha,Head,54,F,W,?,2,1.0,Martha Abrahams (54),,50-54,Widow,1,0
3,4,RES.007,306.0,Link to Claim Images,Abraham,,Joanna,Head,?,F,W,Widow,4,1.0,Joanna Abraham,Daughter Elizabeth is 27 and may have moved ou...,Unknown,Widow,1,0
5,6,,,,Adams,Prescott,Susanna,Wife,?,F,W,,5,,,,Unknown,Married,1,0
7,8,,,,Adams,Hall,Hannah,Wife,48,F,W,,6,,,,45-49,Married,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1381,1382,RES.449,347.0,Link to Claim Images,Wood,Bradish,Elizabeth,Head,64,F,W,widow,414,1.0,"Elizabeth (Bradish) Wood (?),","adult children, widow of John",60-64,Widow,2,1
1383,1384,,,,Wyer,Breed,Sarah,Wife,73,F,W,,415,,,,70-74,Married,1,0
1384,1385,RES.452,25.0,Link to Claim Images,Wyer,Boylston,Elizabeth,Head,72,F,W,widow,416,1.0,Elizabeth (Boylston) Wyer (?),"Adult children, widow of Nathaniel",70-74,Widow,1,0
1387,1388,,,,Wyer,Austin,Lydia,Wife,35,F,W,,418,,,,35-39,Married,1,0


In [8]:
# create dataframe with aggregated columns
grouped = filtered_data.groupby(["Age Range", "Marital_Status"]).size()
grouped_df = pd.DataFrame(grouped, columns=['Count']).reset_index()
grouped_df.head()

Unnamed: 0,Age Range,Marital_Status,Count
0,0-4,Single,67
1,10-14,Married,1
2,10-14,Single,80
3,100-104,Married,1
4,15-19,Married,3


In [9]:
# display unique age ranges
ages = filtered_data["Age Range"].unique()
print(ages)

age_order = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34',
            '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', 
            '65-69', '70-74', '75-79', '80-84', '85-89', '100-104', 'Unknown']

['Unknown' '50-54' '45-49' '20-24' '10-14' '30-34' '5-9' '0-4' '60-64'
 '15-19' '40-44' '65-69' '55-59' '35-39' '25-29' '85-89' '70-74' '100-104'
 '75-79' '80-84']


In [10]:
# make brush
brush = alt.selection_interval()

In [11]:
# create age range distribution bar chart
barChar = alt.Chart(grouped_df).mark_bar().encode(
    x = alt.X("Age Range", sort=age_order),
    y = alt.Y("sum(Count)", title="Population"),
    color = "Marital_Status",
    tooltip = ["Marital_Status", "Count"] 
).properties(title="Age Distribution of Women in 1775").add_params(brush)

barChar

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [12]:
# next bar chart --> create column for # people in each family by creating list of names 
# in household and checking the length of the list (names split by comma)
# then create another column for number of children - if married, subtract 2 from # people
# in each family to get rid of parents, if widowed, subtract 1 from # people in family to get 
# rid of mom

filtered_data["Number of People in Family"] = filtered_data["Household"].apply(lambda x: len(str(x).split(',')))
filtered_data["Number of People in Family"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["Number of People in Family"] = filtered_data["Household"].apply(lambda x: len(str(x).split(',')))


array([1, 2, 4, 5, 3, 6, 7], dtype=int64)

In [13]:
# create function to calculate children in each family
def count_children(row):
    if row["Marital_Status"] == "Married":
        return row["Number of People in Family"] - 2
    elif row["Marital_Status"] == 'Widow':
        return row["Number of People in Family"] - 1  
    else:
        return 0  # assume no children for others

In [14]:
# add column with number of children
filtered_data["Children"] = filtered_data.apply(count_children, axis=1)
filtered_data["Children"] = filtered_data["Children"].clip(lower=0)
print(filtered_data["Children"].unique())

[0 1 3 4 2 5 6]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["Children"] = filtered_data.apply(count_children, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["Children"] = filtered_data["Children"].clip(lower=0)


In [15]:
grouped_children = filtered_data.groupby(['Age Range',"Children"]).size()
grouped_2 = pd.DataFrame(grouped_children, columns=['Count']).reset_index()
grouped_2

Unnamed: 0,Age Range,Children,Count
0,0-4,0,67
1,10-14,0,81
2,100-104,0,1
3,15-19,0,64
4,20-24,0,21
5,20-24,4,1
6,25-29,0,21
7,30-34,0,23
8,30-34,2,1
9,30-34,3,2


In [16]:
# create number of children bar chart



barChar_2 = alt.Chart(grouped_2).mark_bar(size=30).encode(
    x = alt.X("Children:Q"),
    y = alt.Y("Count:Q", title="Number of Children"),
    tooltip=["Children", "Count"]
).properties(
    title=alt.TitleParams(text="Number of Children", fontSize=20)
).transform_filter(brush)

barChar | barChar_2

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
