### Big Ideas:
   - cache (save) your data to speed up data acquisition
   - helper functions are your friends

### two deliverables:
   - Notebook (creating your acquisition)
   - acquire.py file - your own homemade library of functions

In [5]:
# List the names of all the example datasets available through seaborn
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'tips',
 'titanic']

In [6]:
# Calling data() with no arguments lists all the datasets in pydataset
data()

Unnamed: 0,dataset_id,title
0,AirPassengers,Monthly Airline Passenger Numbers 1949-1960
1,BJsales,Sales Data with Leading Indicator
2,BOD,Biochemical Oxygen Demand
3,Formaldehyde,Determination of Formaldehyde
4,HairEyeColor,Hair and Eye Color of Statistics Students
...,...,...
752,VerbAgg,Verbal Aggression item responses
753,cake,Breakage Angle of Chocolate Cakes
754,cbpp,Contagious bovine pleuropneumonia
755,grouseticks,Data on red grouse ticks from Elston et al. 2001


### 3 functions to read in files:
   - pd.read_excel('file_name.xlsx', sheet_name='sheet_name')
   - pd.read_csv('filename.csv')
   - pd.read_sql(sql_query, connection_url)

______

In [7]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pydataset import data

### Read into Pandas dataframe from SQL

In [10]:
from env import host, user, password

# Step 1: write the SQL query to run
sql_query = 'SELECT * FROM passengers'

# Step 2: build the connection url for titanic_db from the env credentials
connection_url = f'mysql+pymysql://{user}:{password}@{host}/titanic_db'
    
# Step 3: run the query and read the result into a pandas DataFrame
titanic_df= pd.read_sql(sql_query, connection_url)
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


________

### Read into Pandas dataframe from a file

##### from CSV
df = pd.read_csv('file_path/file_name.csv')

##### from Amazon Server
df = pd.read_csv('https://s3.amazonaws.com/bucket_and_or_file_name.csv')

##### from Google Sheet
- sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'

- csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

- df = pd.read_csv(csv_export_url)

________

### From your Clipboard

- highlight and copy tabular data from article
- go to notebook and type "pd.read_clipboard()"
- dataframe comes up!

In [15]:
pd.read_clipboard()

Unnamed: 0,Name,Position,Office,Age,Start date,Salary
0,Tiger Nixon,System Architect,Edinburgh,61,2011/04/25,"$320,800"
1,Garrett Winters,Accountant,Tokyo,63,2011/07/25,"$170,750"
2,Ashton Cox,Junior Technical Author,San Francisco,66,2009/01/12,"$86,000"
3,Cedric Kelly,Senior Javascript Developer,Edinburgh,22,2012/03/29,"$433,060"
4,Airi Satou,Accountant,Tokyo,33,2008/11/28,"$162,700"
5,Brielle Williamson,Integration Specialist,New York,61,2012/12/02,"$372,000"
6,Herrod Chandler,Sales Assistant,San Francisco,59,2012/08/06,"$137,500"


##### to clean up data
   - pass a separator to state that you want a new column after every comma
       - pd.read_clipboard(sep= ',')
    
   - give new column names
        - (ex): columns = [ 'Name', 'Position', 'Office', 'Age', 'Start date', 'Salary']
   

_________

### Acquire data From Excel Sheet

pd.read_excel('your_excel_file_name.xlsx', sheet_name='your_table_name', usecols=['this_one', 'this_one'])

_______

### Acquire data From Pydataset

In [21]:
from pydataset import data

#data('iris', show_doc=True)

In [22]:
# Load the 'iris' dataset from pydataset into a DataFrame
df_iris =data('iris')
df_iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


______

### Acquire data From Seaborn

In [19]:
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


________

# Automating Data Acquisition

## Data Science Pipeline
- acquire
- prepare
- explore
- model
- evaluate

In [28]:
#helper function to get connection
def get_connection(db, user=user, host=host, password=password):
    '''
    Build the connection url used to reach the database named `db`.

    user, host and password default to the values imported from the
    env file. The result has the form
    mysql+pymysql://<user>:<password>@<host>/<db>.

    NOTE(review): the credentials are placed into the url exactly as
    given; confirm they never contain characters such as '@' or '/'
    that would need URL-encoding.
    '''
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'
    return f'mysql+pymysql://{credentials}@{location}'

In [29]:
#helper function to get titanic_db
def new_titanic_data():
    '''
    Query the passengers table of titanic_db and return every
    column of the result as a pandas DataFrame.
    '''
    # Connection url comes from the get_connection helper.
    titanic_url = get_connection('titanic_db')
    return pd.read_sql('SELECT * FROM passengers', titanic_url)

In [30]:
df = new_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### Automatic Data Acquisition:
- import pandas
- import numpy
- import os

- from env import host, user, password
- from pydataset import data
- **from acquire import new_titanic_data**
    - you can NOW import this new data!

________

# Caching Data

- how to store data locally (csv file)
- this will save us time
- if data is updated daily (or frequently), you need to update it regularly

In [31]:
# Check whether the cache file already exists on disk
os.path.isfile('titanic_df.csv') # <-- False here: the file has not been created yet

False

In [32]:
# Write the DataFrame to a CSV file; this creates the local cache
titanic_df.to_csv('titanic_df.csv')

In [34]:
os.path.isfile('titanic_df.csv') # <-- True now: to_csv created the file

True

In [36]:
# create Mother Function
def get_titanic_data(cached=False, filename='titanic_df.csv'):
    '''
    Return the titanic data as a pandas DataFrame, using a local csv
    file as a cache.

    cached: if True AND the csv file exists, the data is read from the
        csv file. Otherwise (cached is False, or the csv file is
        missing) the data is read fresh via new_titanic_data() and the
        csv file is (re)written.
    filename: path of the csv cache file; defaults to 'titanic_df.csv'.

    Returns the titanic DataFrame.
    '''
    if cached and os.path.isfile(filename):
        # The cache was written with the index as the first column,
        # so read that column back as the index rather than as data.
        return pd.read_csv(filename, index_col=0)

    # No usable cache: read fresh data and refresh the csv for next time.
    df = new_titanic_data()
    df.to_csv(filename)
    return df

In [37]:
# Call the mother function with its default cached=False
df = get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1
