# ADS-507 Final Project
Joel Day, Nicholas Lee, and Christine Vu

In [1]:
# Packages #

import getpass
import io
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import requests

warnings.filterwarnings("ignore")

## Import CSV Files

In [2]:
# Function to Pull Raw CSV from GitHub and Convert to Pandas Dataframe Object

def github_to_pandas(raw_git_url, timeout=30):
    """Download a raw CSV file over HTTP and load it into a pandas DataFrame.

    Parameters
    ----------
    raw_git_url : str
        URL of the raw CSV file. It is passed through ``str`` before the request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status (4xx/5xx).
    """
    response = requests.get(str(raw_git_url), timeout=timeout)
    # Fail loudly on an error status instead of parsing the error page as CSV
    response.raise_for_status()

    # The payload is decoded as UTF-8 text and parsed from an in-memory buffer
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text))

In [3]:
# Download the invoice, order-lead and sales-team CSV files from GitHub,
# loading each one into its own pandas dataframe (in that order)
invoice_df, orderleads_df, salesteam_df = (
    github_to_pandas(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/Invoices.csv",
        "https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/OrderLeads.csv",
        "https://raw.githubusercontent.com/nlee98/ADS-507-Data-Engineering/main/SalesTeam.csv",
    )
)

## Explore CSV Files

### Invoice CSV

In [4]:
# Preview the first three rows of the invoice dataframe
invoice_df.head(3)

Unnamed: 0,Order Id,Date,Meal Id,Company Id,Date of Meal,Participants,Meal Price,Type of Meal
0,839FKFW2LLX4LMBB,27-05-2016,INBUX904GIHI8YBD,LJKS5NK6788CYMUU,2016-05-31 07:00:00+02:00,['David Bishop'],469,Breakfast
1,97OX39BGVMHODLJM,27-09-2018,J0MMOOPP709DIDIE,LJKS5NK6788CYMUU,2018-10-01 20:00:00+02:00,['David Bishop'],22,Dinner
2,041ORQM5OIHTIU6L,24-08-2014,E4UJLQNCI16UX5CS,LJKS5NK6788CYMUU,2014-08-23 14:00:00+02:00,['Karen Stansell'],314,Lunch


#### Transformations
* Replace spaces with underscores in every column name
* Convert `Date` and `Date of Meal` to date/datetime data types
* Add a part-of-day column derived from the hour of `Date of Meal`

In [5]:
# Normalise the column names of all three dataframes:
# every space in a column name becomes an underscore
for frame in (invoice_df, orderleads_df, salesteam_df):
    frame.columns = frame.columns.str.replace(" ", "_")

In [6]:
# Parse the Date column, which is stored as day-month-year text ("%d-%m-%Y")
invoice_df["Date"] = pd.to_datetime(invoice_df["Date"], format='%d-%m-%Y')

In [11]:
# Skip the conversion when the column is already datetime, so the cell can be re-run safely
if not pd.api.types.is_datetime64_any_dtype(invoice_df["Date_of_Meal"]):
    # Drop the trailing UTC offset ("+HH:MM" or "-HH:MM") so every value becomes a
    # naive local wall-clock time. This does NOT convert to UTC:
    # "07:00:00+02:00" stays 07:00:00.
    meal_time_text = invoice_df["Date_of_Meal"].str.replace(
        r"[+-]\d{2}:\d{2}$", "", regex=True
    )

    # Convert Date_of_Meal to datetime. Seconds are "%S" (uppercase);
    # lowercase "%s" is not a valid directive and raises a ValueError.
    invoice_df["Date_of_Meal"] = pd.to_datetime(
        meal_time_text,
        format="%Y-%m-%d %H:%M:%S"
    )

In [14]:
# Parse Date_of_Meal as datetimes laid out as "year-month-day hour:minute:second"
invoice_df["Date_of_Meal"] = pd.to_datetime(invoice_df["Date_of_Meal"], format="%Y-%m-%d %H:%M:%S")

In [40]:
def time_of_day(x):
    """Return the named part of the day for a timestamp, based only on its hour.

    Parameters
    ----------
    x : object with an ``hour`` attribute (e.g. a datetime or pandas Timestamp)

    Returns
    -------
    str
        One of "Early Morning", "Late Morning", "Early Afternoon", "Evening",
        "Night" or "Late Night". Any hour matching none of the explicit
        ranges falls through to "Late Night".
    """
    hour = x.hour

    # Only the hour is inspected, so each range runs to the end of its last hour
    if 5 <= hour <= 8:  # 5am - 8am
        return "Early Morning"
    if 8 < hour <= 12:  # 9am - 12pm
        return "Late Morning"
    if 12 < hour <= 15:  # 1pm - 3pm
        return "Early Afternoon"
    if 15 < hour <= 19:  # 4pm - 7pm
        return "Evening"
    if 19 < hour <= 23:  # 8pm - 11pm
        return "Night"
    # 12am - 4am
    return "Late Night"

In [41]:
# Apply time_of_day function to Date_of_Meal column:
# each meal timestamp is mapped to its part-of-day label in a new Part_of_Day column

invoice_df["Part_of_Day"] = invoice_df["Date_of_Meal"].apply(time_of_day)

In [44]:
# Add a field to count the number of participants.
# Participants may still be the raw CSV text (e.g. "['A B' 'C D']") at this point,
# where len() would count characters rather than people, so count the quoted names.
# If the column has already been parsed into lists, the list length is used directly.
invoice_df["Number_of_Participants"] = invoice_df["Participants"].apply(
    lambda x: len(re.findall(r"'(.*?)'", x)) if isinstance(x, str) else len(x)
)

### Unique Customer Names and Table
Create a table with each unique customer and use the row index plus one as the customer id.

In [45]:
def string_to_list(participant_string):
    """Convert the text "['name' 'name2']" into the list ['name', 'name2'].

    Every quoted substring is treated as one participant name. Both single- and
    double-quoted names are recognised, so a name containing an apostrophe
    (written with double quotes in the source text, e.g. "O'Brien") is kept
    intact instead of being split at the apostrophe.

    Parameters
    ----------
    participant_string : str or list
        The raw participants text. A value that is already a list (or tuple)
        is returned as a list unchanged, so re-applying the function is safe.

    Returns
    -------
    list of str
        The participant names in their original order.
    """
    if isinstance(participant_string, (list, tuple)):
        return list(participant_string)

    # (['"]) captures the opening quote and \1 requires the same closing quote
    quoted_names = re.findall(r"""(['"])(.*?)\1""", participant_string)
    return [name for _quote, name in quoted_names]

# Replace each row's Participants value with the list of names returned by string_to_list
invoice_df["Participants"] = invoice_df["Participants"].apply(string_to_list)

In [46]:
# Obtain an array of all unique customer names.
# explode() yields NaN for an invoice whose participant list is empty, so drop those.
customers = invoice_df["Participants"].explode().dropna().unique()

# Create new customer dataframe
customers_df = pd.DataFrame(
    customers,
    columns = ["CustomerName"]
)

# Add customer id (row index plus one, so ids start at 1)
customers_df["customer_id"] = customers_df.index + 1

# Create a first_name and last_name column
name_tokens = customers_df["CustomerName"].str.split(" ")
customers_df["first_name"] = name_tokens.str[0]
# Everything after the first token is the last name, so multi-part last names are
# kept whole and a single-token name gets an empty last name instead of an IndexError
customers_df["last_name"] = name_tokens.str[1:].str.join(" ")

### Customer-Order Table
Connect the customer id to each order id the customer placed. This table will link the customer information to the invoice information.

In [50]:
# Prototype of the customer-order link table, built from the first 10 invoices only.
# NOTE(review): a later cell rebuilds cust_order_df from every invoice, so this
# cell is only a small-scale check and could be deleted.

# Map each customer name to its scalar customer_id
name_to_id = dict(zip(customers_df["CustomerName"], customers_df["customer_id"]))

# One (cust_id, order_id) pair per participant of each of the first 10 invoices
sample_pairs = [
    (name_to_id[name], order_id)
    for order_id, customer_list in zip(
        invoice_df["Order_Id"].head(10), invoice_df["Participants"].head(10)
    )
    for name in customer_list
]

cust_order_df = pd.DataFrame(sample_pairs, columns = ["cust_id", "order_id"])

        

['David Bishop']
['David Bishop']
['Karen Stansell']
['Addie Patino']
['Addie Patino', 'Susan Guerrero']
['David Bishop', 'Susan Guerrero', 'Karen Stansell']
['Susan Guerrero', 'David Bishop']
['Amanda Knowles', 'Cheryl Feaster', 'Ginger Hoagland', 'Michael White']
['Cheryl Feaster', 'Amanda Knowles', 'Ginger Hoagland']
['Glenn Gould', 'Amanda Knowles', 'Ginger Hoagland', 'Michael White']


In [63]:
# Build the customer-order link table: one row per (customer, order) pair.

# Map each customer name to its customer_id once, instead of scanning
# customers_df for every participant of every invoice
name_to_id = dict(zip(customers_df["CustomerName"], customers_df["customer_id"]))

# Collect all pairs first and build the dataframe in a single step; appending
# rows one at a time with .loc gets slower as the table grows.
# An invoice with an empty participant list contributes no rows.
# A name missing from customers_df raises a KeyError.
cust_order_pairs = [
    (name_to_id[name], order_id)
    for order_id, customer_list in zip(invoice_df["Order_Id"], invoice_df["Participants"])
    for name in customer_list
]

cust_order_df = pd.DataFrame(cust_order_pairs, columns = ["cust_id", "order_id"])

### Order Leads CSV
* Converted Column - Whether or not an order was converted into a sale

In [6]:
# Preview the first three rows of the order leads dataframe
# NOTE(review): the saved output below shows column names with spaces, so it appears
# to predate the underscore renaming applied earlier in the notebook - re-run to refresh
orderleads_df.head(3)

Unnamed: 0,Order Id,Company Id,Company Name,Date,Order Value,Converted
0,80EYLOKP9E762WKG,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,18-02-2017,4875,1
1,TLEXR1HZWTUTBHPB,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,30-07-2015,8425,0
2,839FKFW2LLX4LMBB,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,27-05-2016,4837,0


In [14]:
# Look up a single order lead by its order id.
# The column is "Order_Id" (not "Order Id") because spaces in the column names
# were replaced with underscores earlier in the notebook.
orderleads_df.loc[orderleads_df["Order_Id"] == "839FKFW2LLX4LMBB"]

Unnamed: 0,Order Id,Company Id,Company Name,Date,Order Value,Converted
2,839FKFW2LLX4LMBB,LJKS5NK6788CYMUU,Chimera-Chasing Casbah,27-05-2016,4837,0


### Sales Team CSV

In [7]:
# Preview the first three rows of the sales team dataframe
# NOTE(review): the saved output below shows column names with spaces, so it appears
# to predate the underscore renaming applied earlier in the notebook - re-run to refresh
salesteam_df.head(3)

Unnamed: 0,Sales Rep,Sales Rep Id,Company Name,Company Id
0,Jessie Mcallister,97UNNAT790E0WM4N,Chimera-Chasing Casbah,LJKS5NK6788CYMUU
1,Jessie Mcallister,97UNNAT790E0WM4N,Tangential Sheds,36MFTZOYMTAJP1RK
2,Jessie Mcallister,97UNNAT790E0WM4N,Two-Mile Grab,H3JRC7XX7WJAD4ZO


## Connection to MySQL Server

In [13]:
# Manually log in to MySQL.
# The password is read with getpass so it is not echoed while typing and does not
# end up stored in the notebook's saved output.
mysql_username = input("Enter MySQL Username: ")
mysql_password = getpass.getpass("Enter MySQL Password: ")

# Connect to the MySQL server on this machine; no database is selected yet
mysql_conn = pymysql.connect(
    host = "localhost",
    port = 3306,
    user = mysql_username,
    passwd = mysql_password
)

### Create Supermarket Database - if it does not already exist

In [131]:
# Create the ADS_507_Supermarket MySQL database if it does not already exist
create_db_sql = """
    CREATE DATABASE IF NOT EXISTS ADS_507_Supermarket;
    """

# The context manager closes the cursor as soon as the statement has run
with mysql_conn.cursor() as cursor:
    cursor.execute(create_db_sql)

# Navigate to Supermarket Database so later statements run against it
mysql_conn.select_db("ADS_507_Supermarket")

## Upload dataframes as tables into MySQL
* Invoice
* Orders
* Sales Lead
* Customer
* Customer-order