# ADS-507 Final Project
Joel Day, Nicholas Lee, and Christine Vu

In [311]:
import pandas as pd
import pymysql
import requests
import warnings
import re
import io
warnings.filterwarnings("ignore")
import getpass

# Get Data

#### Import our csv files from Github as pandas dfs

In [312]:
def github_to_pandas(raw_git_url):
    file_name = str(raw_git_url)
    pull_file = requests.get(file_name).content
    csv_df = pd.read_csv(io.StringIO(pull_file.decode('utf-8')))
    return csv_df

In [313]:
invoice_df = github_to_pandas("https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/Invoices.csv")
orderleads_df = github_to_pandas("https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/OrderLeads.csv")
salesteam_df = github_to_pandas("https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/SalesTeam.csv")

#### Replace spaces in Column Names

In [314]:
# Replace spaces with underscores in all dataframe column names
invoice_df.columns = invoice_df.columns.str.replace(" ", "_")
orderleads_df.columns = orderleads_df.columns.str.replace(" ", "_")
salesteam_df.columns = salesteam_df.columns.str.replace(" ", "_")

# Prepare data for SQL import

## Invoice Table

In [315]:
#invoice_df.head(3)

#### Match SQL's DATETIME(YYYY-MM-DD hh:mm:ss) & DATE (YYYY-MM-DD)

In [316]:
# Date to Date ("d-m-Y")
invoice_df["Date"] = pd.to_datetime(invoice_df["Date"], format='%d-%m-%Y')

# Drop "+HH:MM:SS" to make all uniform to UTC timezone
invoice_df["Date_of_Meal"] = invoice_df["Date_of_Meal"].apply(lambda x: x.split("+")[0])

# Convert Date_of_Meal to Datetime format
invoice_df["Date_of_Meal"] = pd.to_datetime(invoice_df["Date_of_Meal"],format = "%Y-%m-%d %H:%M:%S")

#### Create New Catagorical column defining 'Part of Day'

In [317]:
# Function defining hour of the day with the time of day
def time_of_day(x):
    day_hour = x.hour
    if (day_hour >= 5) and (day_hour <= 8): # 5am - 8am
        return "Early Morning"
    elif (day_hour > 8) and (day_hour <= 12): # 9am - 12pm
        return "Late Morning"
    elif (day_hour > 12) and (day_hour <= 15): # 1pm - 3pm
        return "Early Afternoon"
    elif (day_hour > 15) and (day_hour <= 19): # 4pm - 7pm
        return "Evening"
    elif (day_hour > 19) and (day_hour <= 23): # 8pm - 11pm
        return "Night"
    else: # 12am - 4am
        return "Late Night"

In [318]:
# Apply time_of_day function to Date_of_Meal column
invoice_df["Part_of_Day"] = invoice_df["Date_of_Meal"].apply(time_of_day)

In [319]:
one = invoice_df.copy()
invoice_df.head(6)

Unnamed: 0,Order_Id,Date,Meal_Id,Company_Id,Date_of_Meal,Participants,Meal_Price,Type_of_Meal,Part_of_Day
0,839FKFW2LLX4LMBB,2016-05-27,INBUX904GIHI8YBD,LJKS5NK6788CYMUU,2016-05-31 07:00:00,['David Bishop'],469,Breakfast,Early Morning
1,97OX39BGVMHODLJM,2018-09-27,J0MMOOPP709DIDIE,LJKS5NK6788CYMUU,2018-10-01 20:00:00,['David Bishop'],22,Dinner,Night
2,041ORQM5OIHTIU6L,2014-08-24,E4UJLQNCI16UX5CS,LJKS5NK6788CYMUU,2014-08-23 14:00:00,['Karen Stansell'],314,Lunch,Early Afternoon
3,YT796QI18WNGZ7ZJ,2014-04-12,C9SDFHF7553BE247,LJKS5NK6788CYMUU,2014-04-07 21:00:00,['Addie Patino'],438,Dinner,Night
4,6YLROQT27B6HRF4E,2015-07-28,48EQXS6IHYNZDDZ5,LJKS5NK6788CYMUU,2015-07-27 14:00:00,['Addie Patino' 'Susan Guerrero'],690,Lunch,Early Afternoon
5,AT0R4DFYYAFOC88Q,2014-07-21,W48JPR1UYWJ18NC6,LJKS5NK6788CYMUU,2014-07-17 20:00:00,['David Bishop' 'Susan Guerrero' 'Karen Stanse...,181,Dinner,Night


## Order Table

In [320]:
two = orderleads_df.copy()
orderleads_df.head(6)

Unnamed: 0,Order_Id,Company_Id,Company_Name,Date,Order_Value,Converted
0,80EYLOKP9E762WKG,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,18-02-2017,4875,1
1,TLEXR1HZWTUTBHPB,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,30-07-2015,8425,0
2,839FKFW2LLX4LMBB,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,27-05-2016,4837,0
3,97OX39BGVMHODLJM,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,27-09-2018,343,0
4,5T4LGH4XGBWOD49Z,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,14-01-2016,983,0
5,041ORQM5OIHTIU6L,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,24-08-2014,4185,0


## Sales Table

In [321]:
three = salesteam_df
salesteam_df.head(6)

Unnamed: 0,Sales_Rep,Sales_Rep_Id,Company_Name,Company_Id
0,Jessie Mcallister,97UNNAT790E0WM4N,Chimera-Chasing Casbah,LJKS5NK6788CYMUU
1,Jessie Mcallister,97UNNAT790E0WM4N,Tangential Sheds,36MFTZOYMTAJP1RK
2,Jessie Mcallister,97UNNAT790E0WM4N,Two-Mile Grab,H3JRC7XX7WJAD4ZO
3,Jessie Mcallister,97UNNAT790E0WM4N,Three-Men-And-A-Helper Congo'S,HB25MDZR0MGCQUGX
4,Jessie Mcallister,97UNNAT790E0WM4N,Biophysical Battleground,7RVA8TIVBLBXMNO4
5,Jessie Mcallister,97UNNAT790E0WM4N,Verbal Greenwich,KKM6EZRN9W5NYXP6


## Customer Table

Create a table with each unique customer and use the row index plus one as the customer id.

In [322]:
# Function to convert string ['name' 'name2'] to list ['name', 'name2']
# Returns a list of participant names
def string_to_list(participant_string):
    return re.findall(r"'(.*?)'", participant_string)

In [323]:
invoice_df["Participants"] = invoice_df["Participants"].apply(string_to_list)

In [324]:
# Obtain an array of all unique customer names
customers = invoice_df["Participants"].explode().unique()

In [325]:
# Create new customer dataframe
customers_df = pd.DataFrame(customers,columns = ["CustomerName"])

In [326]:
# Add customer id
customers_df["customer_id"] = customers_df.index + 1

In [327]:
#Create a first_name and last_name column
customers_df["first_name"] = customers_df["CustomerName"].apply(lambda x: x.split(" ")[0])
## Splice the list 1: in the event the person has multiple last names
customers_df["last_name"] = customers_df["CustomerName"].apply(lambda x: x.split(" ")[1])

In [328]:
#Drop first column due to redundant information
#customers_df = customers_df.drop( columns = 'CustomerName')

In [329]:
four = customers_df.copy()
customers_df.head(6)

Unnamed: 0,CustomerName,customer_id,first_name,last_name
0,David Bishop,1,David,Bishop
1,Karen Stansell,2,Karen,Stansell
2,Addie Patino,3,Addie,Patino
3,Susan Guerrero,4,Susan,Guerrero
4,Amanda Knowles,5,Amanda,Knowles
5,Cheryl Feaster,6,Cheryl,Feaster


## Connect to MySQL Server

In [366]:
from sqlalchemy import create_engine

In [367]:
hostname="localhost"
dbname="module3"
uname="root"
pwd="Jd$0615!"
engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}".format(host=hostname, db=dbname, user=uname, pw=pwd))

In [330]:
cnx = pymysql.connect(host='localhost',port=int(3306),user='root',passwd=getpass.getpass('Enter password: '), database = 'ads_507_supermarket')

Enter password: ········


In [331]:
def sql(query):
    cur = cnx.cursor()
    cur.execute(query)
    for x in cur:
        print(x)

In [332]:
def run(query):
    db = pd.read_sql(query, cnx)
    return db

In [359]:
#Create Database
#sql('CREATE DATABASE ADS_507_Supermarket')
#sql('SHOW DATABASES')
#sql('USE ads_507_supermarket')
sql('SHOW TABLES')
#sql('DESCRIBE test5')
#sql('DROP TABLE customers')
#sql('TRUNCATE TABLE test3')

('customers',)
('invoice',)
('orders',)
('sales',)


## Create Empty Tables for each df

##### Invoice

In [351]:
sql('''CREATE TABLE invoice (
    Order_id TINYINT,
    Date DATE, 
    Meal_id TINYINT, 
    Company_id TINYINT,
    Date_of_Meal DATETIME,
    Participants VARCHAR(255),
    Meal_Price SMALLINT,
    Type_of_Meal ENUM('Breakfast', 'Lunch', 'Dinner'),
    Part_of_Day ENUM('Early Morning', 'Late Morning', 'Early Afternoon', 'Night', 'Late Night'),
    PRIMARY KEY (Order_id)
   )''')

In [360]:
sql('DESCRIBE Invoice')

('Order_id', 'tinyint', 'NO', 'PRI', None, '')
('Date', 'date', 'YES', '', None, '')
('Meal_id', 'tinyint', 'YES', '', None, '')
('Company_id', 'tinyint', 'YES', '', None, '')
('Date_of_Meal', 'datetime', 'YES', '', None, '')
('Participants', 'varchar(255)', 'YES', '', None, '')
('Meal_Price', 'smallint', 'YES', '', None, '')
('Type_of_Meal', "enum('Breakfast','Lunch','Dinner')", 'YES', '', None, '')
('Part_of_Day', "enum('Early Morning','Late Morning','Early Afternoon','Night','Late Night')", 'YES', '', None, '')


##### Order

In [356]:
sql('''CREATE TABLE orders (
    Order_id TINYINT,
    Company_id TINYINT,
    Company_Name VARCHAR(255),
    Date DATE,
    Order_Value SMALLINT,
    Converted TINYINT,
    PRIMARY KEY (Order_id, Company_id)
   )''')

In [361]:
sql('DESCRIBE orders')

('Order_id', 'tinyint', 'NO', 'PRI', None, '')
('Company_id', 'tinyint', 'NO', 'PRI', None, '')
('Company_Name', 'varchar(255)', 'YES', '', None, '')
('Date', 'date', 'YES', '', None, '')
('Order_Value', 'smallint', 'YES', '', None, '')
('Converted', 'tinyint', 'YES', '', None, '')


#### Sales

In [357]:
sql('''CREATE TABLE sales (
    Sales_Rep VARCHAR(255),
    Sales_Rep_id TINYINT,
    Company_Name VARCHAR(255),
    Company_id TINYINT,
    PRIMARY KEY (Sales_Rep_id)
   )''')

In [362]:
sql('DESCRIBE sales')

('Sales_Rep', 'varchar(255)', 'YES', '', None, '')
('Sales_Rep_id', 'tinyint', 'NO', 'PRI', None, '')
('Company_Name', 'varchar(255)', 'YES', '', None, '')
('Company_id', 'tinyint', 'YES', '', None, '')


##### Customers

In [358]:
sql('''CREATE TABLE customers (
    CustomerName VARCHAR(255),
    customer_id TINYINT,
    first_name VARCHAR(255),
    last_name VARCHAR(255),
    PRIMARY KEY (customer_id)
   )''')

In [363]:
sql('DESCRIBE customers')

('CustomerName', 'varchar(255)', 'YES', '', None, '')
('customer_id', 'tinyint', 'NO', 'PRI', None, '')
('first_name', 'varchar(255)', 'YES', '', None, '')
('last_name', 'varchar(255)', 'YES', '', None, '')


## Import data from pandas dfs into MySQL Tables

###### Save the dfs to respective SQL databastwo

In [372]:
one.to_sql('invoice', engine, index=False)

In [374]:
two.to_sql('orders', engine, index=False)

In [377]:
three.to_sql('sales', engine, index=False)

In [381]:
four.to_sql('customers', engine, index=False)