In [57]:
import sqlalchemy as db
import pandas as pd
import os
import csv
import seaborn as sns
import matplotlib.pyplot as plt

## Connecting to database

In [58]:
# Schema registry that the table definitions in the following cells attach to.
metadata = db.MetaData()

# Engine for the local PostgreSQL server on port 5432 (database `test`), plus one
# open connection that the query cells further down reuse.
# NOTE(review): the earlier comment called this database `sql_challenge`, but the
# URL points at `test` — confirm which one is intended.
engine = db.create_engine('postgresql://localhost:5432/test')
connection = engine.connect()

In [59]:
# List the tables currently present in the database (expected to be empty here).
# Goes through the Inspector API because Engine.table_names() is deprecated in
# SQLAlchemy 1.4 and removed in 2.0.
db.inspect(engine).get_table_names()

  engine.table_names()


[]

# Creating Tables


In [60]:
# Declare the three empty tables on `metadata`. Each one uses (County, State) as a
# composite primary key; the numeric measure columns are all Float, so they are
# generated from name lists in the same order as before.

election = db.Table(
    'election', metadata,
    db.Column('County', db.String(100), primary_key=True, nullable=False),
    db.Column('State', db.String(100), primary_key=True, nullable=False),
    *[db.Column(measure, db.Float()) for measure in ('Republic', 'Democrate')],
    db.Column('Win', db.String(100)),
)

census = db.Table(
    'census', metadata,
    db.Column('County', db.String(), primary_key=True, nullable=False),
    db.Column('State', db.String(20), primary_key=True, nullable=False),
    *[
        db.Column(measure, db.Float())
        for measure in (
            'Population',
            'Median Age',
            'Household Income',
            'Poverty Rate',
            'Unemployment Rate',
            'High School Rate',
            'College Rate',
            'Uneducated Rate',
            'White Population Rate',
            'Black Population Rate',
            'Hispanic Population Rate',
            'Asian Population Rate',
        )
    ],
)

vaccine = db.Table(
    'vaccine', metadata,
    db.Column('Date', db.Date()),
    db.Column('County', db.String(100), primary_key=True, nullable=False),
    db.Column('State', db.String(100), primary_key=True, nullable=False),
    *[
        db.Column(measure, db.Float())
        for measure in (
            'Series_Complete_Pop_Pct',
            'Fully_Vaccinated',
            'Fully_Vaccinated_12+',
            'Fully_Vaccinated_18+',
            'Fully_Vaccinated_65+',
            'Completeness_pct',
            'Administered_Dose1_Recip',
            'Partially_Vaccinated',
            'Partially_Vaccinated_12+',
            'Partially_Vaccinated_18+',
            'Partially_Vaccinated_65+',
        )
    ],
)

# Emit CREATE TABLE for everything registered on `metadata`.
metadata.create_all(engine)

In [61]:
# List the tables in the database again; the tables created above should now appear.
# Goes through the Inspector API because Engine.table_names() is deprecated in
# SQLAlchemy 1.4 and removed in 2.0.
db.inspect(engine).get_table_names()

  engine.table_names()


['election', 'census', 'vaccine']

In [62]:
# Fetch the Table objects by name, reflecting from the database through `engine`.
# `autoload_with` alone is sufficient; the separate `autoload=True` flag is
# deprecated in SQLAlchemy 1.4 and no longer accepted in 2.0.
# NOTE(review): these three names are already registered on `metadata` by the
# table-creation cell, so this may simply hand back those definitions — confirm
# whether this cell is still needed.
election = db.Table('election', metadata, autoload_with=engine)
census = db.Table('census', metadata, autoload_with=engine)
vaccine = db.Table('vaccine', metadata, autoload_with=engine)

## Inserting values in tables from csv files 


In [63]:
def load_csv_into_table(table, csv_path):
    """Bulk-insert every data row of a CSV file into ``table``.

    The first CSV record is treated as a header and discarded. Values are mapped
    to columns by position, so the CSV column order must match the column order
    of ``table``; any extra trailing fields in a row are ignored.

    Args:
        table: SQLAlchemy ``Table`` to insert into.
        csv_path: Path of the UTF-8 encoded, comma-delimited CSV file.

    Returns:
        The number of rows inserted.

    Raises:
        ValueError: If a non-blank row has fewer fields than the table has columns.
    """
    column_names = table.columns.keys()

    # newline='' is what the csv module expects so quoted fields containing
    # line breaks are parsed correctly.
    with open(csv_path, 'r', encoding='utf-8', newline='') as handle:
        reader = csv.reader(handle, delimiter=',')
        next(reader, None)  # drop the header; tolerate a completely empty file

        rows = []
        for fields in reader:
            if not fields:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            if len(fields) < len(column_names):
                raise ValueError(
                    f"{csv_path}, line {reader.line_num}: expected at least "
                    f"{len(column_names)} fields, got {len(fields)}"
                )
            rows.append(dict(zip(column_names, fields)))

    if rows:
        # engine.begin() commits on success and rolls back on error; it replaces
        # the deprecated connectionless engine.execute().
        with engine.begin() as conn:
            conn.execute(table.insert(), rows)
    return len(rows)


#For table election
path_election = os.path.join('cleaned_data', 'election_data.csv')
load_csv_into_table(election, path_election)

#For table census
path_census = os.path.join('cleaned_data', 'census_data.csv')
load_csv_into_table(census, path_census)

#For table vaccine
path_vaccine = os.path.join('cleaned_data', 'vaccine_data.csv')
load_csv_into_table(vaccine, path_vaccine);


In [65]:
# Print the full metadata repr registered for the census table
print(f"{metadata.tables['census']!r}")

Table('census', MetaData(), Column('County', String(), table=<census>, primary_key=True, nullable=False), Column('State', String(length=20), table=<census>, primary_key=True, nullable=False), Column('Population', Float(), table=<census>), Column('Median Age', Float(), table=<census>), Column('Household Income', Float(), table=<census>), Column('Poverty Rate', Float(), table=<census>), Column('Unemployment Rate', Float(), table=<census>), Column('High School Rate', Float(), table=<census>), Column('College Rate', Float(), table=<census>), Column('Uneducated Rate', Float(), table=<census>), Column('White Population Rate', Float(), table=<census>), Column('Black Population Rate', Float(), table=<census>), Column('Hispanic Population Rate', Float(), table=<census>), Column('Asian Population Rate', Float(), table=<census>), schema=None)


In [66]:
# Column names of the census table (`.columns` is the long form of the `.c` shortcut)
census.columns.keys()

['County',
 'State',
 'Population',
 'Median Age',
 'Household Income',
 'Poverty Rate',
 'Unemployment Rate',
 'High School Rate',
 'College Rate',
 'Uneducated Rate',
 'White Population Rate',
 'Black Population Rate',
 'Hispanic Population Rate',
 'Asian Population Rate']

# Querying


### 1. To list employee number, last name, first name, sex, and salary

In [None]:
### sql
# Joins employees to salaries on emp_no and selects the employee number, names,
# sex and salary. The first column needs the `e` alias: a bare `.emp_no` is a
# SQL syntax error.
sql_1= '''SELECT e.emp_no, e.last_name, e.first_name, e.sex, s.salary 
       FROM employees AS e
       JOIN salaries AS s
       ON e.emp_no=s.emp_no'''

In [None]:
# Execute the raw SQL string via text() and collect every row into a DataFrame.
results_1 = connection.execute(db.text(sql_1)).fetchall()
df_1 = pd.DataFrame(
    results_1,
    columns=['Emp_No', 'Last_Name', 'First_Name', 'Sex', 'Salary'],
)
df_1


In [None]:
# Same query as the raw-SQL version, built with the SQLAlchemy expression API.
# NOTE(review): `employees` and `salaries` are not defined in the visible cells —
# confirm they are reflected before this cell runs.
stmt_1 = db.select(
    [
        employees.c.emp_no,
        employees.c.last_name,
        employees.c.first_name,
        employees.c.sex,
        salaries.c.salary,
    ]
).join(salaries, employees.c.emp_no == salaries.c.emp_no)

res_1 = connection.execute(stmt_1).fetchall()
df_alc1 = pd.DataFrame(
    res_1,
    columns=['Emp_No', 'Last_Name', 'First_Name', 'Sex', 'Salary'],
)
df_alc1

### 3. To list the manager of each department with the following information: department number, department name, the manager's employee number, last name, first name.

In [None]:
# Raw SQL for question 3: joins departments -> dept_manager (on dept_no) ->
# employees (on emp_no) and selects the department number and name together
# with the employee number, last name and first name.
sql_3='''SELECT  d.dept_no, d.dept_name, e.emp_no, e.last_name, e.first_name 
        FROM departments AS d
        JOIN dept_manager AS dm
        ON d.dept_no=dm.dept_no
        JOIN employees AS e
        ON dm.emp_no=e.emp_no'''

In [None]:
# Execute the raw SQL string via text() and collect every row into a DataFrame.
results_3 = connection.execute(db.text(sql_3)).fetchall()
df_3 = pd.DataFrame(
    results_3,
    columns=['dept_no', 'dept_name', 'emp_no', 'last_name', 'first_name'],
)
df_3

In [None]:
# Same query as the raw-SQL version, built with the SQLAlchemy expression API.
# NOTE(review): `departments`, `dept_manager` and `employees` are not defined in
# the visible cells — confirm they are reflected before this cell runs.
stmt_3 = (
    db.select(
        [
            departments.c.dept_no,
            departments.c.dept_name,
            employees.c.emp_no,
            employees.c.last_name,
            employees.c.first_name,
        ]
    )
    .join(dept_manager, departments.c.dept_no == dept_manager.c.dept_no)
    .join(employees, dept_manager.c.emp_no == employees.c.emp_no)
)

res_3 = connection.execute(stmt_3).fetchall()
df_alc3 = pd.DataFrame(
    res_3,
    columns=['dept_no', 'dept_name', 'emp_no', 'last_name', 'first_name'],
)
df_alc3

### 7. To list all employees in the Sales and Development departments, including their employee number, last name, first name, and department name.

In [None]:
# Raw SQL for question 7: joins employees -> dept_emp (on emp_no) -> departments
# (on dept_no) and keeps only rows whose department name is 'Sales' or
# 'Development'. Note that the select list also includes e.sex.
sql_7='''SELECT e.emp_no, e.last_name, e.first_name, e.sex, d.dept_name
        FROM employees AS e
        JOIN dept_emp AS de
        ON e.emp_no=de.emp_no
        JOIN departments AS d
        ON de.dept_no=d.dept_no
        WHERE d.dept_name='Sales' OR d.dept_name='Development' '''

In [None]:
# Execute the raw SQL string via text() and collect every row into a DataFrame.
results_7 = connection.execute(db.text(sql_7)).fetchall()
df_7 = pd.DataFrame(
    results_7,
    columns=['emp_no', 'last_name', 'first_name', 'sex', 'dept_name'],
)
df_7

In [None]:
# Same query as the raw-SQL version, built with the SQLAlchemy expression API.
# NOTE(review): `employees`, `dept_emp` and `departments` are not defined in the
# visible cells — confirm they are reflected before this cell runs.
in_sales_or_development = db.or_(
    departments.c.dept_name == 'Sales',
    departments.c.dept_name == 'Development',
)
stmt_7 = (
    db.select(
        [
            employees.c.emp_no,
            employees.c.last_name,
            employees.c.first_name,
            employees.c.sex,
            departments.c.dept_name,
        ]
    )
    .join(dept_emp, employees.c.emp_no == dept_emp.c.emp_no)
    .join(departments, dept_emp.c.dept_no == departments.c.dept_no)
    .where(in_sales_or_development)
)

res_7 = connection.execute(stmt_7).fetchall()
df_alc7 = pd.DataFrame(
    res_7,
    columns=['emp_no', 'last_name', 'first_name', 'sex', 'dept_name'],
)
df_alc7

# Plotting

In [None]:
# Histogram of the Salary column of df_1, drawn in red on a 12x12 figure.
fig, ax = plt.subplots(figsize=(12, 12))
df_1['Salary'].plot.hist(color='r', ax=ax)
ax.set_xlabel('Salary($)')
ax.set_title('Salary range for employees')
plt.show()

In [None]:
# Seaborn histogram of Salary from df_1 with a KDE overlay, coloured by the Sex column.
fig, ax = plt.subplots(figsize=(12, 12))
sns.histplot(data=df_1, x='Salary', hue="Sex", kde=True, ax=ax)
ax.set_xlabel('Salary($)')
ax.set_title('Salary range for employees')
plt.show()

### Most common last names

In [None]:
# Bar chart of the 20 rows of df_8 with the largest `count`, one bar per last_name.
# NOTE(review): df_8 is not defined in the visible cells — confirm it is built
# (with `last_name` and `count` columns) before this cell runs.
most_common_lastname = df_8.nlargest(20, 'count')

fig, ax = plt.subplots(figsize=(15, 15))
sns.barplot(
    data=most_common_lastname,
    x='last_name',
    y='count',
    alpha=0.9,
    ax=ax,
)
# Tilt the tick labels so neighbouring names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()

### Bar chart of average salary by title.

In [None]:
# Query the data behind the "average salary by title" chart: employees joined to
# salaries (on emp_no) and to titles (emp_title_id = title_id).
sql_9 = '''SELECT e.emp_no, e.last_name, e.first_name, t.title, s.salary
        FROM employees AS e
        JOIN salaries AS s
        ON e.emp_no=s.emp_no
        JOIN titles AS t
        ON e.emp_title_id=t.title_id'''

results_9 = connection.execute(db.text(sql_9)).fetchall()
df_9 = pd.DataFrame(
    results_9,
    columns=['emp_no', 'last_name', 'first_name', 'title', 'salary'],
)
df_9

In [None]:
# Seaborn bar plot of salary against title from df_9 on a 15x15 figure.
fig, ax = plt.subplots(figsize=(15, 15))
sns.barplot(
    data=df_9,
    x='title',
    y='salary',
    alpha=0.9,
    ax=ax,
)
# Tilt the tick labels so neighbouring titles do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()

## Bar and pie plots for Departments

In [None]:
# Count emp_no entries per dept_name and keep the result as a one-column frame.
# NOTE(review): df_4 is not defined in the visible cells — confirm it is built
# (with `dept_name` and `emp_no` columns) before this cell runs.
dep = (
    df_4.groupby('dept_name')['emp_no']
    .count()
    .to_frame(name='Total employees')
)

In [None]:
# Red bar chart of the per-department totals held in `dep`.
fig, ax = plt.subplots(figsize=(12, 12))
dep.plot.bar(color='r', ax=ax)
ax.set_xlabel('Departments')
ax.set_ylabel('Count')
ax.legend()
plt.show()

In [None]:
# Pie chart of the 'Total employees' column of `dep`, labelled with whole-number
# percentages. Switching to the 'ggplot' style applies globally from here on.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(12, 12))
dep.plot.pie(y='Total employees', autopct='%1.0f%%', ax=ax)
plt.show()