In [1]:
import numpy as np
import pandas as pd
import scipy as scs
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
import datetime as dt

pd.set_option("display.max_columns", None)

In [3]:
from google.colab import auth
auth.authenticate_user()

In [4]:

from google.cloud import bigquery
proj_name = 'pg-duke-student-capstone-v1'
proj_num = '545682952716'
proj_id = 'pg-duke-student-capstone-v1'
client = bigquery.Client(project=proj_id)

### Importing the CSAT DATA

In [5]:
sql_query_csat = ('''SELECT *
                FROM pg-duke-student-capstone-v1.source_data.fact_csat
                ''')

In [6]:
df_csat = client.query(sql_query_csat).to_dataframe()


In [7]:
df_csat.head()
df_csat.shape

(6372, 20)

In [8]:
# browsing the data
df_csat.head()

Unnamed: 0,response_id,name,email,score,comment,response_timestamp,notes,tags,additional_question,application_id,assignee_name,conversation_id,customer_id,browser,device_type,os,source,internal_case_tags,lifecycle_stage_prior_to_withdrawn,withdrawn_lifecycle_stage
0,86944680,James Lawson,recordarchives@yahoo.com,1.0,You ignored all of my emails. If you're going...,2019-05-21 06:12:12+00:00,,,,1500545,,9bc56e88-1474-4b82-87fc-16b855,14750405,Chrome,Desktop,Windows,Email,,fake,withdrawn
1,133149867,Stanley Vigil,avigilstanley@reagan.com,1.0,,2020-07-07 23:00:28+00:00,,,,2696137,,bb80fa5e-1af7-4955-93fb-e017ee,31533636,Chrome,Desktop,Windows,Email,,fake,withdrawn
2,176595914,Anthony Spicer,tonyspicer1966@yahoo.com,2.0,,2021-06-12 14:16:27+00:00,,,,4014257,,05bd4b09-15c8-43a1-9eb0-91d025,48531364,Chrome,Desktop,Windows,Email,,fake,withdrawn
3,156203539,Helen Present,presentpt@gmail.com,1.0,Was looking for instant estimates and did not ...,2020-12-20 13:11:35+00:00,,,,3279081,,e5ffb75c-1a8b-4d40-9f92-ff6b78,37792405,Safari,Desktop,Mac,Email,,fake,withdrawn
4,161881765,,beachykeenrentals@gmail.com,4.0,,2021-02-08 12:27:23+00:00,,,,3510361,,71da4556-eb6a-4f26-97d8-9a9474,41155353,Safari,Desktop,Mac,Email,,fake,withdrawn


In [10]:
# we have 20 columns
df_csat.columns

Index(['response_id', 'name', 'email', 'score', 'comment',
       'response_timestamp', 'notes', 'tags', 'additional_question',
       'application_id', 'assignee_name', 'conversation_id', 'customer_id',
       'browser', 'device_type', 'os', 'source', 'internal_case_tags',
       'lifecycle_stage_prior_to_withdrawn', 'withdrawn_lifecycle_stage'],
      dtype='object')

In [11]:
# Application id looks fine, we don't seem to have product type here
df_csat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6372 entries, 0 to 6371
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   response_id                         6372 non-null   int64              
 1   name                                5991 non-null   object             
 2   email                               6372 non-null   object             
 3   score                               6372 non-null   float64            
 4   comment                             3486 non-null   object             
 5   response_timestamp                  6372 non-null   datetime64[ns, UTC]
 6   notes                               0 non-null      object             
 7   tags                                0 non-null      object             
 8   additional_question                 0 non-null      object             
 9   application_id                      6372 

In [12]:

## 
df_csat.lifecycle_stage_prior_to_withdrawn.value_counts()

application_requested                          2676
sign_and_exam                                  1724
application_in_underwriting                     541
application_not_started                         493
aps_ordered                                     305
contact_info_submitted_online                   295
application_started_by_client                   141
application_in_underwriting_tentative_offer      56
application_declined                             28
fake                                             24
application_approved_with_modifications          20
policy_rejected                                  14
policy_offer_postponed                           12
application_approved_as_applied                  11
phone_screen_decline                              8
policy_sent_to_client                             5
application_pending_client                        5
case_closed_incomplete                            4
application_in_underwriting_final_review          3
policy_in_fo

In [13]:
## checking duplicates in application id [No dupliactes in application_id]

df_csat.loc[df_csat.duplicated(keep=False)].sort_values(by="application_id")


Unnamed: 0,response_id,name,email,score,comment,response_timestamp,notes,tags,additional_question,application_id,assignee_name,conversation_id,customer_id,browser,device_type,os,source,internal_case_tags,lifecycle_stage_prior_to_withdrawn,withdrawn_lifecycle_stage


In [14]:
df_csat.columns

Index(['response_id', 'name', 'email', 'score', 'comment',
       'response_timestamp', 'notes', 'tags', 'additional_question',
       'application_id', 'assignee_name', 'conversation_id', 'customer_id',
       'browser', 'device_type', 'os', 'source', 'internal_case_tags',
       'lifecycle_stage_prior_to_withdrawn', 'withdrawn_lifecycle_stage'],
      dtype='object')

In [16]:
## reordering columns so that ids appear first

columnsTitles = ['application_id', 'response_id', 'response_timestamp', 'customer_id', 'conversation_id','name', 'email', 'score', 'comment', 'notes','tags', \
                 'additional_question', 'assignee_name', 'device_type', 'browser', 'os', 'source','internal_case_tags','lifecycle_stage_prior_to_withdrawn', \
                 'withdrawn_lifecycle_stage']
df_csat = df_csat.reindex(columns=columnsTitles)
df_csat.head()


Unnamed: 0,application_id,response_id,response_timestamp,customer_id,conversation_id,name,email,score,comment,notes,tags,additional_question,assignee_name,device_type,browser,os,source,internal_case_tags,lifecycle_stage_prior_to_withdrawn,withdrawn_lifecycle_stage
0,1500545,86944680,2019-05-21 06:12:12+00:00,14750405,9bc56e88-1474-4b82-87fc-16b855,James Lawson,recordarchives@yahoo.com,1.0,You ignored all of my emails. If you're going...,,,,,Desktop,Chrome,Windows,Email,,fake,withdrawn
1,2696137,133149867,2020-07-07 23:00:28+00:00,31533636,bb80fa5e-1af7-4955-93fb-e017ee,Stanley Vigil,avigilstanley@reagan.com,1.0,,,,,,Desktop,Chrome,Windows,Email,,fake,withdrawn
2,4014257,176595914,2021-06-12 14:16:27+00:00,48531364,05bd4b09-15c8-43a1-9eb0-91d025,Anthony Spicer,tonyspicer1966@yahoo.com,2.0,,,,,,Desktop,Chrome,Windows,Email,,fake,withdrawn
3,3279081,156203539,2020-12-20 13:11:35+00:00,37792405,e5ffb75c-1a8b-4d40-9f92-ff6b78,Helen Present,presentpt@gmail.com,1.0,Was looking for instant estimates and did not ...,,,,,Desktop,Safari,Mac,Email,,fake,withdrawn
4,3510361,161881765,2021-02-08 12:27:23+00:00,41155353,71da4556-eb6a-4f26-97d8-9a9474,,beachykeenrentals@gmail.com,4.0,,,,,,Desktop,Safari,Mac,Email,,fake,withdrawn


In [17]:
## check duplicates in other ids [No other ids are duplicated]

df_csat.loc[df_csat.duplicated("response_timestamp", keep=False)].sort_values("response_timestamp").head(10)


Unnamed: 0,application_id,response_id,response_timestamp,customer_id,conversation_id,name,email,score,comment,notes,tags,additional_question,assignee_name,device_type,browser,os,source,internal_case_tags,lifecycle_stage_prior_to_withdrawn,withdrawn_lifecycle_stage


In [19]:
## checking for any missing data [if comment is the equivalent of verbatim then we may throw away 45.3% of data]
missing = pd.DataFrame({"percent_missing" : df_csat.isna().sum()*100/len(df_csat)})
missing

Unnamed: 0,percent_missing
application_id,0.0
response_id,0.0
response_timestamp,0.0
customer_id,0.0
conversation_id,0.0
name,5.979284
email,0.0
score,0.0
comment,45.291902
notes,100.0


In [20]:
### Final CSTAT DATA
df_csat.shape


(6372, 20)

In [23]:
## save the data in CSV 
#df_csat.to_csv (r'C:\Users\Ron\Desktop\export_dataframe.csv', index = False, header=True)
df_csat.to_csv('df_csat.csv')
#print (df_csat)

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
