In [None]:
# Before running this notebook, import the dataset from a terminal with:
#   mongoimport --type csv -d project_two -c nchs --headerline --drop NCHS_-_Death_rates_and_life_expectancy_at_birth.csv

In [21]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd

In [22]:
# Connect to the MongoDB server on port 27017 (pymongo's default host is used).
mongo = MongoClient(port=27017)

# Handle to the 'project_two' database that the dataset was imported into.
db = mongo.project_two


In [23]:
# Show which collections exist in the database to confirm the import worked.
collection_names = db.list_collection_names()
print(collection_names)

['npao', 'nchs']


In [24]:
# Handle to the 'nchs' collection; every query below goes through it.
nchs = db.nchs

In [25]:
# Preview only the first few documents. Printing the whole collection floods
# the notebook output and bloats the saved file without adding information.
for document in nchs.find().limit(5):
    print(document)

{'_id': ObjectId('64f163167475af78457d96ab'), 'Year': 1903, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 50.5, 'Age-adjusted Death Rate': 2379.0}
{'_id': ObjectId('64f163167475af78457d96ac'), 'Year': 1904, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 47.6, 'Age-adjusted Death Rate': 2502.5}
{'_id': ObjectId('64f163167475af78457d96ad'), 'Year': 1905, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 48.7, 'Age-adjusted Death Rate': 2423.7}
{'_id': ObjectId('64f163167475af78457d96ae'), 'Year': 1906, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 48.7, 'Age-adjusted Death Rate': 2399.0}
{'_id': ObjectId('64f163167475af78457d96af'), 'Year': 1907, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 47.6, 'Age-adjusted Death Rate': 2494.4}
{'_id': ObjectId('64f163167475af78457d96b0'), 'Year': 1908, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average

In [29]:
# Fetch a single document (empty filter = match anything) to inspect the schema.
sample_document = nchs.find_one({})

# Bare last expression so the notebook renders the document.
sample_document

{'_id': ObjectId('64f163167475af78457d96ab'),
 'Year': 1903,
 'Race': 'All Races',
 'Sex': 'Both Sexes',
 'Average Life Expectancy (Years)': 50.5,
 'Age-adjusted Death Rate': 2379.0}

In [38]:
# Find the year with the highest life expectancy among the "All Races" rows.
#
# NOTE(review): the pipeline does not filter on "Sex", so $max is taken over
# every Sex row of a year. The reported value may therefore belong to a single
# sex rather than to "Both Sexes" — confirm this is the intended question.
pipeline = [
    {
        # Keep only "All Races" rows that actually carry a value. In BSON
        # comparison order strings rank above numbers, so a blank "" value
        # (presumably what an empty CSV cell becomes on import — verify)
        # would otherwise win the $max below.
        "$match": {
            "Race": "All Races",
            "Average Life Expectancy (Years)": {"$exists": True, "$nin": [None, ""]},
        }
    },
    {
        # One output document per year holding that year's highest value.
        "$group": {
            "_id": "$Year",
            "max_life_expectancy": {"$max": "$Average Life Expectancy (Years)"}
        }
    },
    {
        # Highest value first. The secondary key on the year makes the order
        # deterministic when several years share the same maximum; without it
        # the document kept by $limit can differ between runs.
        "$sort": {
            "max_life_expectancy": -1,
            "_id": 1
        }
    },
    {
        "$limit": 1
    }
]

# Execute the aggregation pipeline
result = list(nchs.aggregate(pipeline))

# Report the winner, or say so explicitly when nothing matched
if result:
    highest_life_expectancy_year = result[0]["_id"]
    highest_life_expectancy_value = result[0]["max_life_expectancy"]
    print(f"The year with the highest life expectancy for 'All Races' is {highest_life_expectancy_year} with an average life expectancy of {highest_life_expectancy_value} years.")
else:
    print("No life expectancy data found for 'All Races'.")

The year with the highest life expectancy for 'All Races' is 2014 with an average life expectancy of 81.3 years.


In [45]:
# Define the field to check for empty values
field_to_check = "Average Life Expectancy (Years)"

# Find the year with the highest life expectancy among the "White" rows.
#
# NOTE(review): "Sex" is not filtered, so $max is taken over every Sex row of
# a year — confirm that a single-sex maximum is the intended answer.
pipeline = [
    {
        # Keep only "White" rows whose value is present and non-blank.
        # A dict literal cannot hold two "$ne" keys (the last one silently
        # replaces the first), so both unwanted values are listed in one $nin.
        "$match": {
            "Race": "White",
            field_to_check: {"$exists": True, "$nin": [None, ""]},
        }
    },
    {
        # One output document per year holding that year's highest value.
        "$group": {
            "_id": "$Year",
            "max_life_expectancy": {"$max": f"${field_to_check}"}
        }
    },
    {
        # Highest value first. The secondary key on the year makes the order
        # deterministic when several years share the same maximum; without it
        # the document kept by $limit can differ between runs.
        "$sort": {
            "max_life_expectancy": -1,
            "_id": 1
        }
    },
    {
        "$limit": 1
    }
]

# Execute the aggregation pipeline
result = list(nchs.aggregate(pipeline))

# Report the winner, or say so explicitly when nothing matched
if result:
    highest_life_expectancy_year = result[0]["_id"]
    highest_life_expectancy_value = result[0]["max_life_expectancy"]
    print(f"The year with the highest life expectancy for 'White' is {highest_life_expectancy_year} with an average life expectancy of {highest_life_expectancy_value} years.")
else:
    print("No life expectancy data found for 'White'.")

The year with the highest life expectancy for 'White' is 2012 with an average life expectancy of 81.4 years.


In [46]:
# Find the year with the highest life expectancy among the "Black" rows.
# `field_to_check` is the column name defined in an earlier cell.
#
# NOTE(review): "Sex" is not filtered, so $max is taken over every Sex row of
# a year — confirm that a single-sex maximum is the intended answer.
pipeline = [
    {
        # Keep only "Black" rows whose value is present and non-blank.
        # A dict literal cannot hold two "$ne" keys (the last one silently
        # replaces the first), so both unwanted values are listed in one $nin.
        "$match": {
            "Race": "Black",
            field_to_check: {"$exists": True, "$nin": [None, ""]},
        }
    },
    {
        # One output document per year holding that year's highest value.
        "$group": {
            "_id": "$Year",
            "max_life_expectancy": {"$max": f"${field_to_check}"}
        }
    },
    {
        # Highest value first. The secondary key on the year makes the order
        # deterministic when several years share the same maximum; without it
        # the document kept by $limit can differ between runs.
        "$sort": {
            "max_life_expectancy": -1,
            "_id": 1
        }
    },
    {
        "$limit": 1
    }
]

# Execute the aggregation pipeline
result = list(nchs.aggregate(pipeline))

# Report the winner, or say so explicitly when nothing matched
if result:
    highest_life_expectancy_year = result[0]["_id"]
    highest_life_expectancy_value = result[0]["max_life_expectancy"]
    print(f"The year with the highest life expectancy for 'Black' is {highest_life_expectancy_year} with an average life expectancy of {highest_life_expectancy_value} years.")
else:
    print("No life expectancy data found for 'Black'.")

The year with the highest life expectancy for 'Black' is 2015 with an average life expectancy of 78.5 years.


In [30]:
# TODO: not sure this cell is still needed — confirm before keeping it.

# Count how many documents the collection holds for each year.
group_by_year = {
    "$group": {
        "_id": "$Year",        # one bucket per distinct "Year"
        "count": {"$sum": 1},  # add 1 for every document in the bucket
    }
}
sort_by_year = {"$sort": {"_id": 1}}  # ascending by year
pipeline = [group_by_year, sort_by_year]

# Run the aggregation and materialise it; `result` is reused further down.
result = list(nchs.aggregate(pipeline))

# Show each {year, count} record on its own line.
for year_count in result:
    pprint(year_count)

{'_id': 1900, 'count': 9}
{'_id': 1901, 'count': 9}
{'_id': 1902, 'count': 9}
{'_id': 1903, 'count': 9}
{'_id': 1904, 'count': 9}
{'_id': 1905, 'count': 9}
{'_id': 1906, 'count': 9}
{'_id': 1907, 'count': 9}
{'_id': 1908, 'count': 9}
{'_id': 1909, 'count': 9}
{'_id': 1910, 'count': 9}
{'_id': 1911, 'count': 9}
{'_id': 1912, 'count': 9}
{'_id': 1913, 'count': 9}
{'_id': 1914, 'count': 9}
{'_id': 1915, 'count': 9}
{'_id': 1916, 'count': 9}
{'_id': 1917, 'count': 9}
{'_id': 1918, 'count': 9}
{'_id': 1919, 'count': 9}
{'_id': 1920, 'count': 9}
{'_id': 1921, 'count': 9}
{'_id': 1922, 'count': 9}
{'_id': 1923, 'count': 9}
{'_id': 1924, 'count': 9}
{'_id': 1925, 'count': 9}
{'_id': 1926, 'count': 9}
{'_id': 1927, 'count': 9}
{'_id': 1928, 'count': 9}
{'_id': 1929, 'count': 9}
{'_id': 1930, 'count': 9}
{'_id': 1931, 'count': 9}
{'_id': 1932, 'count': 9}
{'_id': 1933, 'count': 9}
{'_id': 1934, 'count': 9}
{'_id': 1935, 'count': 9}
{'_id': 1936, 'count': 9}
{'_id': 1937, 'count': 9}
{'_id': 1938

In [36]:

# TODO: not sure this cell is still needed — confirm before keeping it.
#
# NOTE(review): `result` holds one row per year, so grouping by '_id' gives
# single-row groups: mean/median/min/max just repeat the count and std is NaN.
# If statistics across years were intended, aggregate df['count'] directly.

# Turn the per-year counts from the aggregation into a DataFrame.
df = pd.DataFrame(result)

# Per-year statistics of 'count', with the columns renamed for readability.
summary_stats = (
    df.groupby('_id')['count']
    .agg(['mean', 'median', 'std', 'min', 'max'])
    .reset_index()
    .rename(columns={
        '_id': 'Year',
        'mean': 'Mean',
        'median': 'Median',
        'std': 'Std',
        'min': 'Min',
        'max': 'Max',
    })
)

# Print the summary statistics
print(summary_stats)

     Year  Mean  Median  Std  Min  Max
0    1900   9.0     9.0  NaN    9    9
1    1901   9.0     9.0  NaN    9    9
2    1902   9.0     9.0  NaN    9    9
3    1903   9.0     9.0  NaN    9    9
4    1904   9.0     9.0  NaN    9    9
..    ...   ...     ...  ...  ...  ...
114  2014   9.0     9.0  NaN    9    9
115  2015   9.0     9.0  NaN    9    9
116  2016   9.0     9.0  NaN    9    9
117  2017   9.0     9.0  NaN    9    9
118  2018   9.0     9.0  NaN    9    9

[119 rows x 6 columns]
