# Census Case Study
- Preparing SQLAlchemy and the Database
- Loading Data into the Database
- Solving Data Science Problems with Queries

## Part 1: Preparing SQLAlchemy and the Database
- Create an Engine and MetaData object
- Create and save the census table


In [1]:
# Import create_engine, MetaData
from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, insert, select, func, case, cast, Float, desc

In [2]:
# Define an engine to connect to chapter5.sqlite: engine
engine = create_engine('sqlite:///chapter5.sqlite')

# Initialize MetaData: metadata
metadata = MetaData()

# create connection
connection = engine.connect()

# Print table names
print(engine.table_names())

[]


In [3]:
# Build a census table: census
census = Table('census', metadata,
               Column('state', String(30)),
               Column('sex', String(1)),
               Column('age', Integer()),
               Column('pop2000', Integer()),
               Column('pop2008', Integer()))

# Create the table in the database
metadata.create_all(engine)

# Print table names
print(engine.table_names())

['census']


## Part 2: Populating the Database
- Load a CSV file into a values list
- Insert the values list into the census table

In [4]:
# Import csv
import csv

In [5]:
def load_csv_value_list():
    with open('../_datasets/census.csv') as csvfile:
        # Reading the csv file
        csv_reader = csv.reader(csvfile, delimiter=',')
        
        # Create an empty list: values_list
        values_list = []

        # Iterate over the rows
        for row in csv_reader:
            # Create a dictionary with the values
            data = {'state': row[0], 'sex': row[1], 'age':row[2], 'pop2000':row[3], 'pop2008':row[4]}
            # Append the dictionary to the values list
            values_list.append(data)
        
        return values_list

In [6]:
values_list = load_csv_value_list()

# Build insert statement: stmt
stmt = insert(census)

print(stmt)

INSERT INTO census (state, sex, age, pop2000, pop2008) VALUES (:state, :sex, :age, :pop2000, :pop2008)


In [7]:
# Use values_list to insert data: results
results = connection.execute(stmt, values_list)

# Print rowcount
print(results.rowcount)

8772


## Part 3: Answering Data Science Questions with Queries
- Determine Average Age for Males and Females
- Determine the percentage of Females for each state
- Determine the top 5 states by population change from 2000 to 2008

In [8]:
import pandas as pd
def get_df_select_stmt(conn, select_stmt):
    # Results of executing the select_stmt
    results = conn.execute(select_stmt).fetchall()

    # Create a DataFrame from the results: df
    df = pd.DataFrame(results)

    # Set column names
    df.columns = results[0].keys()
    
    return df

#### 3.1 Determine Average Age for Males and Females

In [9]:
# Calculate weighted average age: stmt
stmt = select([census.columns.sex,
               (func.sum(census.columns.pop2008 * census.columns.age) /
                func.sum(census.columns.pop2008)).label('average_age')
               ])

# Group by sex
stmt = stmt.group_by(census.columns.sex)

print(stmt)

SELECT census.sex, sum(census.pop2008 * census.age) / sum(census.pop2008) AS average_age 
FROM census GROUP BY census.sex


In [10]:
# Print the result of executing the query.
df = get_df_select_stmt(connection, stmt)
df.head()

Unnamed: 0,sex,average_age
0,F,38
1,M,35


#### 3.2 Determine the percentage of Females for each state

In [11]:
# Build a query to calculate the percentage of females in 2000: stmt
stmt = select([census.columns.state,
    (func.sum(
        case([
            (census.columns.sex == 'F', census.columns.pop2000)
        ], else_=0)) /
     cast(func.sum(census.columns.pop2000), Float) * 100).label('percent_female')
])

# Group By state
stmt = stmt.group_by(census.columns.state)

print(stmt)

SELECT census.state, (sum(CASE WHEN (census.sex = :sex_1) THEN census.pop2000 ELSE :param_1 END) / CAST(sum(census.pop2000) AS FLOAT)) * :param_2 AS percent_female 
FROM census GROUP BY census.state


In [12]:
# Print the result of executing the query.
df = get_df_select_stmt(connection, stmt)
df.head(10)

Unnamed: 0,state,percent_female
0,Alabama,51.832408
1,Alaska,49.301498
2,Arizona,50.223613
3,Arkansas,51.269928
4,California,50.352332
5,Colorado,49.847671
6,Connecticut,51.668165
7,Delaware,51.611097
8,District of Columbia,53.129626
9,Florida,51.36488


#### 3.3 Determine the top 10 states by population change from 2000 to 2008

In [13]:
# Build query to return state name and population difference from 2008 to 2000
stmt = select([census.columns.state,
     (census.columns.pop2008 - census.columns.pop2000).label('pop_change')
])

# Group by State
stmt = stmt.group_by(census.columns.state)

# Order by Population Change
stmt = stmt.order_by(desc('pop_change'))

# Limit to top 10
stmt = stmt.limit(10)

print(stmt)

SELECT census.state, census.pop2008 - census.pop2000 AS pop_change 
FROM census GROUP BY census.state ORDER BY pop_change DESC
 LIMIT :param_1


In [14]:
# Print the result of executing the query.
df = get_df_select_stmt(connection, stmt)
df

Unnamed: 0,state,pop_change
0,Texas,40137
1,California,35406
2,Florida,21954
3,Arizona,14377
4,Georgia,13357
5,North Carolina,11574
6,Virginia,6639
7,Colorado,6425
8,Utah,5934
9,Illinois,5412


## Droping all tables

In [15]:
# Drop all tables
metadata.drop_all(engine)

# Check to see if census exists
print(census.exists(engine))

False
